<a href="https://colab.research.google.com/github/BChun11/DATA3001/blob/main/DATA3001_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hashlib

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [3]:
# Download all the files into google colab environment
!git clone https://github.com/nokuik/KDDI-IoT-2019.git

Cloning into 'KDDI-IoT-2019'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 42 (delta 1), reused 9 (delta 1), pack-reused 33[K
Receiving objects: 100% (42/42), 776.84 MiB | 26.14 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Updating files: 100% (31/31), done.


In [4]:
# List all '.tar.gz files in ipfix directory
%cd KDDI-IoT-2019
%cd ipfix
!ls *.tar.gz

/content/KDDI-IoT-2019
/content/KDDI-IoT-2019/ipfix
 amazon_echo_gen2.tar.gz		        nature_remo.tar.gz
 au_network_camera.tar.gz		        panasonic_doorphone.tar.gz
 au_wireless_adapter.tar.gz		        philips_hue_bridge.tar.gz
 bitfinder_awair_breathe_easy.tar.gz	       'planex_camera_one_shot!.tar.gz'
 candy_house_sesami_wi-fi_access_point.tar.gz   planex_smacam_outdoor.tar.gz
 irobot_roomba.tar.gz			        planex_smacam_pantilt.tar.gz
 jvc_kenwood_cu-hb1.tar.gz		        powerelectric_wi-fi_plug.tar.gz
 jvc_kenwood_hdtv_ip_camera.tar.gz	        qrio_hub.tar.gz
 line_clova_wave.tar.gz			        sony_network_camera.tar.gz
 link_japan_eremote.tar.gz		        sony_smart_speaker.tar.gz
 mouse_computer_room_hub.tar.gz		        xiaomi_mijia_led.tar.gz


In [5]:
# List all tar.gz files and store them in a variable
files = !ls -1 *.tar.gz

# Extract each tar.gz file
for file in files:
    print(f"Extract {file} ")
    !tar -xzvf {file}

Extract amazon_echo_gen2.tar.gz 
amazon_echo_gen2.json
Extract au_network_camera.tar.gz 
au_network_camera.json
Extract au_wireless_adapter.tar.gz 
au_wireless_adapter.json
Extract bitfinder_awair_breathe_easy.tar.gz 
bitfinder_awair_breathe_easy.json
Extract candy_house_sesami_wi-fi_access_point.tar.gz 
candy_house_sesami_wi-fi_access_point.json
Extract irobot_roomba.tar.gz 
irobot_roomba.json
Extract jvc_kenwood_cu-hb1.tar.gz 
jvc_kenwood_cu-hb1.json
Extract jvc_kenwood_hdtv_ip_camera.tar.gz 
jvc_kenwood_hdtv_ip_camera.json
Extract line_clova_wave.tar.gz 
line_clova_wave.json
Extract link_japan_eremote.tar.gz 
link_japan_eremote.json
Extract mouse_computer_room_hub.tar.gz 
mouse_computer_room_hub.json
Extract nature_remo.tar.gz 
nature_remo.json
Extract panasonic_doorphone.tar.gz 
panasonic_doorphone.json
Extract philips_hue_bridge.tar.gz 
philips_hue_bridge.json
Extract 'planex_camera_one_shot!.tar.gz' 
planex_camera_one_shot!.json
Extract planex_smacam_outdoor.tar.gz 
planex_smacam

In [6]:
# Print the current working directory
print("Current Working Directory:", os.getcwd())

# List the contents of the current working directory
print("Contents of Current Directory:", os.listdir())

!cd

Current Working Directory: /content/KDDI-IoT-2019/ipfix
Contents of Current Directory: ['sony_bravia.tar.gz02', 'planex_smacam_pantilt.json', 'jvc_kenwood_hdtv_ip_camera.json', 'panasonic_doorphone.tar.gz', 'google_home_gen1.tar.gz00', 'xiaomi_mijia_led.tar.gz', 'jvc_kenwood_cu-hb1.json', 'nature_remo.tar.gz', 'i-o_data_qwatch.tar.gz00', 'mouse_computer_room_hub.json', 'candy_house_sesami_wi-fi_access_point.tar.gz', 'line_clova_wave.tar.gz', 'google_home_gen1.tar.gz01', 'sony_smart_speaker.tar.gz', 'planex_camera_one_shot!.json', 'sony_bravia.tar.gz01', 'jvc_kenwood_cu-hb1.tar.gz', 'powerelectric_wi-fi_plug.json', 'jvc_kenwood_hdtv_ip_camera.tar.gz', 'nature_remo.json', 'line_clova_wave.json', 'link_japan_eremote.tar.gz', 'planex_smacam_pantilt.tar.gz', 'philips_hue_bridge.tar.gz', 'sony_network_camera.tar.gz', 'planex_camera_one_shot!.tar.gz', 'panasonic_doorphone.json', 'au_wireless_adapter.tar.gz', 'bitfinder_awair_breathe_easy.tar.gz', 'sony_smart_speaker.json', 'au_wireless_adapte

In [27]:
# Code to generate distinct tables for each json file using a limited subset

# Define the directory where the JSON files are located
json_directory = '/content/KDDI-IoT-2019/ipfix'

# Get the list of all JSON files in the directory
json_files = [f for f in os.listdir(json_directory) if f.endswith('.json')]

# Create distinct tables for each json file
tables = {}
for json_file in json_files:
    # strip .json suffix from device names
    device_name = json_file.split('.')[0]
    # Construct the full path to the JSON file
    json_path = os.path.join(json_directory, json_file)
    # Read the JSON file into a DataFrame, normalize the 'flows' column, and get the first 1000 rows
    df = pd.json_normalize(pd.read_json(json_path, lines=True, nrows=1000)['flows'])

    # Label the DataFrame with the device name
    df['Device'] = device_name
    tables[device_name] = df

# Concatenate all the Dataframes in the tables dictionary into a single Dataframe
df = pd.concat(tables.values(), ignore_index=True)
df.head()

Unnamed: 0,flowStartMilliseconds,flowEndMilliseconds,flowDurationMilliseconds,reverseFlowDeltaMilliseconds,protocolIdentifier,sourceIPv4Address,sourceTransportPort,packetTotalCount,octetTotalCount,flowAttributes,...,reverseNonEmptyPacketCount,reverseDataByteCount,reverseAverageInterarrivalTime,reverseFirstNonEmptyPacketSize,reverseLargePacketCount,reverseMaxPacketSize,reverseStandardDeviationPayloadLength,reverseStandardDeviationInterarrivalTime,reverseBytesPerPacket,Device
0,2019-06-25 13:45:25.490,2019-06-25 13:45:25.490,0.0,0.0,17,192.168.1.17,59758,1,652,0,...,,,,,,,,,,planex_smacam_pantilt
1,2019-06-26 01:45:25.668,2019-06-26 01:45:25.668,0.0,0.0,17,192.168.1.17,42716,1,652,0,...,,,,,,,,,,planex_smacam_pantilt
2,2019-06-26 13:45:28.302,2019-06-26 13:45:28.302,0.0,0.0,17,192.168.1.17,58241,1,652,0,...,,,,,,,,,,planex_smacam_pantilt
3,2019-06-27 01:45:29.198,2019-06-27 01:45:29.198,0.0,0.0,17,192.168.1.17,40729,1,652,0,...,,,,,,,,,,planex_smacam_pantilt
4,2019-06-27 13:45:30.841,2019-06-27 13:45:30.841,0.0,0.0,17,192.168.1.17,52224,1,652,0,...,,,,,,,,,,planex_smacam_pantilt


In [7]:
"""# Reference: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN

 Reduce Memory Usage
  This code is used to reduce memory usage of our dataframe useful when running
  large datasets by reducing the size of the properties of the dataset by selecting
  smaller datatypes


def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df"""

In [None]:
# Apply the reduce memory usage function to our dataset
#df = reduce_memory_usage(df)
#df.info()

Memory usage of dataframe is 9.23 MB
Memory usage became:  3.813507080078125  MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 55 columns):
 #   Column                                    Non-Null Count  Dtype   
---  ------                                    --------------  -----   
 0   flowStartMilliseconds                     22000 non-null  category
 1   flowEndMilliseconds                       22000 non-null  category
 2   flowDurationMilliseconds                  22000 non-null  float16 
 3   reverseFlowDeltaMilliseconds              22000 non-null  float16 
 4   protocolIdentifier                        22000 non-null  int8    
 5   sourceIPv4Address                         22000 non-null  category
 6   sourceTransportPort                       22000 non-null  int32   
 7   packetTotalCount                          22000 non-null  int16   
 8   octetTotalCount                           22000 non-null  int32   
 9   flowAttribute

### Data Transformation

In [28]:
# Define the list of columns to be dropped
drop_columns = ['flowStartMilliseconds',
                'flowEndMilliseconds',
                'sourceMacAddress',
                'destinationMacAddress'
]

# Drop the columns from the dataset
df = df.drop(columns=drop_columns)

In [29]:
"""
  Encode and transform categorical values since Decision Tree & Random Forest can't handle non-numeric
  categorical data directly
"""
# Integer encoding for the IP Addresses
def ip_to_int(ip_str):
    # If IPv4 address
    if '.' in ip_str:
        return int(''.join(ip_str.split('.')))
    # If IPv6 address
    elif ':' in ip_str:
        return int(hashlib.sha256(ip_str.encode('utf-8')).hexdigest(), 16) % 10**8

df['sourceIPv4Address'] = df['sourceIPv4Address'].apply(ip_to_int)
df['destinationIPv4Address'] = df['destinationIPv4Address'].apply(ip_to_int)

# Convert hex to int
for col in ['tcpSequenceNumber', 'reverseTcpSequenceNumber', 'vlanId', 'ipClassOfService']:
    df[col] = df[col].apply(lambda x: int(x, 16))

# Label Encoding for other categorical attributes
label_encoders = {}
for col in ['flowAttributes', 'initialTCPFlags', 'unionTCPFlags', 'reverseInitialTCPFlags', 'reverseUnionTCPFlags', 'reverseFlowAttributes', 'collectorName', 'flowEndReason', 'firstEightNonEmptyPacketDirections', 'Device']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 51 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   flowDurationMilliseconds                  22000 non-null  float64
 1   reverseFlowDeltaMilliseconds              22000 non-null  float64
 2   protocolIdentifier                        22000 non-null  int64  
 3   sourceIPv4Address                         22000 non-null  int64  
 4   sourceTransportPort                       22000 non-null  int64  
 5   packetTotalCount                          22000 non-null  int64  
 6   octetTotalCount                           22000 non-null  int64  
 7   flowAttributes                            22000 non-null  int64  
 8   destinationIPv4Address                    22000 non-null  int64  
 9   destinationTransportPort                  22000 non-null  int64  
 10  reversePacketTotalCount           

In [31]:
# Separate the Device column as label
labels_df = df['Device'].copy()

# Drop the Device column from the original DataFrame
df = df.drop(columns=['Device'])

X = df
y = labels_df

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#### Treating missing values

In [50]:
"""
  This function trains a RandomForestRegressor on the given data and then
  predicts on the test data. It computes the MAE (Mean Absolute Error) between
  the predicted values and the true validation target values
"""
def mae_score(X_train, X_test, y_train, y_test):
  rfr_model = RandomForestRegressor(n_estimators=10, random_state=0)
  rfr_model.fit(X_train, y_train)
  pred = rfr_model.predict(X_test)

  # Compute MAE between the predictions and true test value
  return mean_absolute_error(y_test, pred)


In [46]:
# Using imputation to fill in the missing values

# Create impute object
imputer = SimpleImputer()

# Fit imputer on training data
imputer.fit(X_train)

# Impute training data
Xtrain = imputer.transform(X_train)
# Convert numpy arrays back to pandas dataframes
#Xtrain = pd.DataFrame(Xtrain, columns=X_train.columns)

# Impute the test data
Xtest = imputer.transform(X_test)
#X_test = pd.DataFrame(Xtest, columns=X.columns)




## Applying the Decision Tree and Random Forest Algorithm

#### Assessing the Performance of Classifier

In [51]:
def print_score(cls, X_train, y_train, X_test, y_test, train=True):
    # Check whether we are using the train or test data for evaluation
    if train:
        X, y, data_type = X_train, y_train, "Train"
    else:
        X, y, data_type = X_test, y_test, "Test"

    # The model 'cls' uses the features X to make predictions
    y_pred = cls.predict(X)
    prob = cls.predict_proba(X)

    roc_auc = roc_auc_score(y, prob, average='weighted', multi_class='ovr')
    f1Score = f1_score(y, y_pred, average='weighted')

    # Print results
    print(f'ROC-AUC score: {roc_auc}')
    print("_______________________________________________")
    print(f"Weighted F1 Score = {f1Score}\n")
    print("_______________________________________________")
    print(classification_report(y, y_pred))

#### Decision Tree Classifier

In [53]:
# Create Decision Tree Classifier object
dec_tree = DecisionTreeClassifier(random_state=42)
# Fit the model to the training data
dec_tree.fit(X_train, y_train)

# Print model performance on the training data
print_score(dec_tree, X_train, y_train, X_test, y_test)


ROC-AUC score: 0.9999985193343945
_______________________________________________
Weighted F1 Score = 0.9974239747394912

_______________________________________________
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       912
           1       0.99      1.00      0.99       906
           2       1.00      1.00      1.00       895
           3       1.00      1.00      1.00       893
           4       1.00      1.00      1.00       894
           5       0.99      1.00      0.99       915
           6       1.00      1.00      1.00       895
           7       1.00      1.00      1.00       914
           8       0.99      1.00      0.99       897
           9       0.99      0.99      0.99       895
          10       1.00      1.00      1.00       879
          11       1.00      1.00      1.00       899
          12       1.00      1.00      1.00       889
          13       1.00      1.00      1.00       897
          14       