<a href="https://colab.research.google.com/github/BChun11/DATA3001/blob/main/DATA3001_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hashlib

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [2]:
# Download all the files into google colab environment
!git clone https://github.com/nokuik/KDDI-IoT-2019.git

Cloning into 'KDDI-IoT-2019'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 42 (delta 1), reused 9 (delta 1), pack-reused 33[K
Receiving objects: 100% (42/42), 776.84 MiB | 21.38 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Updating files: 100% (31/31), done.


In [3]:
# List all '.tar.gz files in ipfix directory
%cd KDDI-IoT-2019
%cd ipfix
!ls *.tar.gz

/content/KDDI-IoT-2019
/content/KDDI-IoT-2019/ipfix
 amazon_echo_gen2.tar.gz		        nature_remo.tar.gz
 au_network_camera.tar.gz		        panasonic_doorphone.tar.gz
 au_wireless_adapter.tar.gz		        philips_hue_bridge.tar.gz
 bitfinder_awair_breathe_easy.tar.gz	       'planex_camera_one_shot!.tar.gz'
 candy_house_sesami_wi-fi_access_point.tar.gz   planex_smacam_outdoor.tar.gz
 irobot_roomba.tar.gz			        planex_smacam_pantilt.tar.gz
 jvc_kenwood_cu-hb1.tar.gz		        powerelectric_wi-fi_plug.tar.gz
 jvc_kenwood_hdtv_ip_camera.tar.gz	        qrio_hub.tar.gz
 line_clova_wave.tar.gz			        sony_network_camera.tar.gz
 link_japan_eremote.tar.gz		        sony_smart_speaker.tar.gz
 mouse_computer_room_hub.tar.gz		        xiaomi_mijia_led.tar.gz


In [4]:
# List all tar.gz files and store them in a variable
files = !ls -1 *.tar.gz

# Extract each tar.gz file
for file in files:
    print(f"Extract {file} ")
    !tar -xzvf {file}

Extract amazon_echo_gen2.tar.gz 
amazon_echo_gen2.json
Extract au_network_camera.tar.gz 
au_network_camera.json
Extract au_wireless_adapter.tar.gz 
au_wireless_adapter.json
Extract bitfinder_awair_breathe_easy.tar.gz 
bitfinder_awair_breathe_easy.json
Extract candy_house_sesami_wi-fi_access_point.tar.gz 
candy_house_sesami_wi-fi_access_point.json
Extract irobot_roomba.tar.gz 
irobot_roomba.json
Extract jvc_kenwood_cu-hb1.tar.gz 
jvc_kenwood_cu-hb1.json
Extract jvc_kenwood_hdtv_ip_camera.tar.gz 
jvc_kenwood_hdtv_ip_camera.json
Extract line_clova_wave.tar.gz 
line_clova_wave.json
Extract link_japan_eremote.tar.gz 
link_japan_eremote.json
Extract mouse_computer_room_hub.tar.gz 
mouse_computer_room_hub.json
Extract nature_remo.tar.gz 
nature_remo.json
Extract panasonic_doorphone.tar.gz 
panasonic_doorphone.json
Extract philips_hue_bridge.tar.gz 
philips_hue_bridge.json
Extract 'planex_camera_one_shot!.tar.gz' 
planex_camera_one_shot!.json
Extract planex_smacam_outdoor.tar.gz 
planex_smacam

In [5]:
# Print the current working directory
print("Current Working Directory:", os.getcwd())

# List the contents of the current working directory
print("Contents of Current Directory:", os.listdir())

!cd

Current Working Directory: /content/KDDI-IoT-2019/ipfix
Contents of Current Directory: ['nature_remo.tar.gz', 'panasonic_doorphone.tar.gz', 'philips_hue_bridge.tar.gz', 'sony_network_camera.tar.gz', 'irobot_roomba.json', 'nature_remo.json', 'qrio_hub.json', 'powerelectric_wi-fi_plug.tar.gz', 'planex_camera_one_shot!.json', 'panasonic_doorphone.json', 'mouse_computer_room_hub.json', 'sony_smart_speaker.json', 'amazon_echo_gen2.json', 'au_wireless_adapter.json', 'bitfinder_awair_breathe_easy.json', 'link_japan_eremote.json', 'mouse_computer_room_hub.tar.gz', 'sony_network_camera.json', 'jvc_kenwood_cu-hb1.tar.gz', 'planex_smacam_outdoor.tar.gz', 'au_wireless_adapter.tar.gz', 'line_clova_wave.json', 'sony_bravia.tar.gz02', 'sony_bravia.tar.gz00', 'i-o_data_qwatch.tar.gz00', 'google_home_gen1.tar.gz00', 'i-o_data_qwatch.tar.gz01', 'xiaomi_mijia_led.tar.gz', 'candy_house_sesami_wi-fi_access_point.json', 'irobot_roomba.tar.gz', 'planex_camera_one_shot!.tar.gz', 'xiaomi_mijia_led.json', 'sony

In [6]:
# Code to generate distinct tables for each json file using a limited subset

# Define the directory where the JSON files are located
json_directory = '/content/KDDI-IoT-2019/ipfix'

# Get the list of all JSON files in the directory
json_files = [f for f in os.listdir(json_directory) if f.endswith('.json')]

# Create distinct tables for each json file
tables = {}
for json_file in json_files:
    device_name = json_file.split('.')[0]
    json_path = os.path.join(json_directory, json_file)

    # Read the JSON file into a DataFrame, normalize the 'flows' column
    df = pd.json_normalize(pd.read_json(json_path, lines=True, nrows=20000)['flows'])

    # Label the DataFrame with the device name
    df['Device'] = device_name
    tables[device_name] = df

# Concatenate all the Dataframes in the tables dictionary into a single Dataframe
df = pd.concat(tables.values(), ignore_index=True)
df.head()

Unnamed: 0,flowStartMilliseconds,flowEndMilliseconds,flowDurationMilliseconds,reverseFlowDeltaMilliseconds,protocolIdentifier,sourceIPv4Address,sourceTransportPort,packetTotalCount,octetTotalCount,flowAttributes,...,reverseNonEmptyPacketCount,reverseDataByteCount,reverseAverageInterarrivalTime,reverseFirstNonEmptyPacketSize,reverseLargePacketCount,reverseMaxPacketSize,reverseStandardDeviationPayloadLength,reverseStandardDeviationInterarrivalTime,reverseBytesPerPacket,Device
0,2019-06-25 08:37:06.496,2019-06-25 08:37:07.076,0.58,0.0,17.0,192.168.1.1,43644.0,2.0,156.0,1,...,,,,,,,,,,irobot_roomba
1,2019-06-25 08:38:09.246,2019-06-25 08:38:09.800,0.554,0.0,17.0,192.168.1.1,47370.0,2.0,156.0,1,...,,,,,,,,,,irobot_roomba
2,2019-06-25 08:39:11.789,2019-06-25 08:39:12.344,0.555,0.0,17.0,192.168.1.1,34691.0,2.0,156.0,1,...,,,,,,,,,,irobot_roomba
3,2019-06-25 08:40:14.458,2019-06-25 08:40:15.037,0.579,0.0,17.0,192.168.1.1,46756.0,2.0,156.0,1,...,,,,,,,,,,irobot_roomba
4,2019-06-25 08:41:17.323,2019-06-25 08:41:17.879,0.556,0.0,17.0,192.168.1.1,41733.0,2.0,156.0,1,...,,,,,,,,,,irobot_roomba


In [7]:

def get_minimal_dtype(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df

In [8]:
df = get_minimal_dtype(df)

Memory usage of dataframe is 183.34 MB
Memory usage became:  92.7572078704834  MB


#### Discarding certain attributes
The primary goal of training our models is to focus on attributes that provide valuable and distinguishable information about the data

In [9]:
# Define the list of columns to be dropped
drop_columns = ['flowStartMilliseconds',
                'flowEndMilliseconds',
                'sourceMacAddress',
                'destinationMacAddress'
]

# Drop the columns from the dataset
df = df.drop(columns=drop_columns)

In [10]:
"""
  Encode and transform categorical values since Decision Tree & Random Forest can't handle non-numeric
  categorical data directly
"""
# Frequency encoding for IP addresses
def frequency_encoding(df, col_name):
    # Count the frequency of each unique IP address
    freq_map = df[col_name].value_counts().to_dict()
    df[col_name] = df[col_name].map(freq_map)
    return df

df = frequency_encoding(df, 'sourceIPv4Address')
df = frequency_encoding(df, 'destinationIPv4Address')

# Convert hex to int
for col in ['tcpSequenceNumber', 'reverseTcpSequenceNumber', 'vlanId', 'ipClassOfService']:
    df[col] = df[col].apply(lambda x: int(x, 16))

# One hot encoding for categorical attributes (For attributes without ordinal relationships)
one_hot_cols = ['flowAttributes', 'initialTCPFlags', 'unionTCPFlags', 'reverseInitialTCPFlags',
                'reverseUnionTCPFlags', 'reverseFlowAttributes', 'collectorName', 'flowEndReason']
df = pd.get_dummies(df, columns=one_hot_cols)

# Label encoding for attributes that can be encoded without any issues
label_cols = ['firstEightNonEmptyPacketDirections', 'Device']
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [11]:
# Separate the 'Device' column as label
y = df['Device']

# Drop the 'Device' column from the original DataFrame
X = df.drop(columns=['Device'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


#### Assessing the Performance of the Multi-Class Classifier
- The print_score function is designmed to evaluate a multi-class classifier's performance using F1 score and ROC-AUC curve.
- We're using the 'weighted' average for the F1 score and 'ovr' (One-vs-rest) approach for the multi-class ROC-AUC.

In [12]:
# Evaluate how well a classifier model is performing on the training data
def print_score(cls, X_train, y_train, X_test, y_test, train=True):
    if train:
        X, y = X_train, y_train
        data_type = "Train"
    else:
        X, y = X_test, y_test
        data_type = "Test"

    # The model 'cls' uses the features X to make predictions
    pred_y = cls.predict(X)
    prob = cls.predict_proba(X)

    # Compute 'weighted' F1-Score
    f1Score = f1_score(y, pred_y, average='weighted')
    # Compute ROC-AUC for each class using 'One-vs-rest'
    roc_auc = roc_auc_score(y, prob, average='weighted', multi_class='ovr')

    # Print results
    print(f"=== {data_type} Data ===")
    print(f"Weighted F1 Score = {f1Score:.4f}\n")
    print(f'ROC-AUC score: {roc_auc:.4f}\n')
    print("_______________________________________________")
    print(classification_report(y, pred_y))
    print("\n")

### Dealing with Missing Values in our Dataset

We will use the Decision Tree Classifier to evaluate which method for treating missing values is optimal

In [13]:

def evaluate_rf_on_missing_data_method(X_train, X_test, y_train, y_test, method="drop"):
    # Handle missing values based on the specified method
    if method == "drop":
        # Drop columns with missing values
        missing_col = [col for col in X_train.columns if X_train[col].isnull().any()]
        X_train = X_train.drop(missing_col, axis=1)
        X_test = X_test.drop(missing_col, axis=1)

    elif method == "impute":
        # Using imputation to fill in the missing values
        imputer = SimpleImputer()
        X_train = pd.DataFrame(imputer.fit_transform(X_train))
        X_test = pd.DataFrame(imputer.transform(X_test))

    elif method == "impute_indicator":
        # Make new columns indicating what will be imputed
        for col in X_train.columns:
            if X_train[col].isnull().any():
                X_train[col + '_was_missing'] = X_train[col].isnull()
                X_test[col + '_was_missing'] = X_test[col].isnull()
        imputer = SimpleImputer()
        X_train = pd.DataFrame(imputer.fit_transform(X_train))
        X_test = pd.DataFrame(imputer.transform(X_test))

    # Train and evaluate RandomForest classifier
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    print_score(rf, X_train, y_train, X_test, y_test, train=True)
    print_score(rf, X_train, y_train, X_test, y_test, train=False)

# Evaluating the methods
print("Method 1: Dropping columns with Missing values")
evaluate_rf_on_missing_data_method(X_train, X_test, y_train, y_test, method="drop")

print("\nMethod 2: Imputation")
evaluate_rf_on_missing_data_method(X_train, X_test, y_train, y_test, method="impute")

print("\nMethod 3: Imputation with Missing Indicators")
evaluate_rf_on_missing_data_method(X_train, X_test, y_train, y_test, method="impute_indicator")


Method 1: Dropping columns with Missing values
=== Train Data ===
Weighted F1 Score = 0.2814

ROC-AUC score: 0.7816

_______________________________________________
              precision    recall  f1-score   support

           0       0.99      0.20      0.33     18043
           1       0.49      0.09      0.16     18020
           2       0.97      0.64      0.77     17968
           3       0.98      0.08      0.14     18037
           4       0.08      0.99      0.15     17948
           5       0.95      0.04      0.07     18072
           6       0.86      0.36      0.51     17998
           7       0.85      0.39      0.53     17963
           8       0.95      0.03      0.06     18007
           9       0.75      0.00      0.00     17978
          10       0.94      0.96      0.95     15231
          11       0.95      0.04      0.07     18032
          12       0.91      0.01      0.01     17969
          13       0.89      0.05      0.10     17972
          14       0.54 

# Exploring the Data

## New Section

In [14]:
X_train.columns

Index(['flowDurationMilliseconds', 'reverseFlowDeltaMilliseconds',
       'protocolIdentifier', 'sourceIPv4Address', 'sourceTransportPort',
       'packetTotalCount', 'octetTotalCount', 'destinationIPv4Address',
       'destinationTransportPort', 'reversePacketTotalCount',
       ...
       'reverseSmallPacketCount_was_missing',
       'reverseNonEmptyPacketCount_was_missing',
       'reverseDataByteCount_was_missing',
       'reverseAverageInterarrivalTime_was_missing',
       'reverseFirstNonEmptyPacketSize_was_missing',
       'reverseLargePacketCount_was_missing',
       'reverseMaxPacketSize_was_missing',
       'reverseStandardDeviationPayloadLength_was_missing',
       'reverseStandardDeviationInterarrivalTime_was_missing',
       'reverseBytesPerPacket_was_missing'],
      dtype='object', length=145)

In [15]:
X_train['ipClassOfService']

396058    0
361613    0
312833    0
44564     0
58257     0
         ..
259178    0
365838    0
131932    0
146867    0
121958    0
Name: ipClassOfService, Length: 393240, dtype: category
Categories (5, int64): [0, 16, 184, 192, 216]