<a href="https://colab.research.google.com/github/BChun11/DATA3001/blob/main/DATA3001_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import hashlib
from scipy.stats import randint

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, roc_auc_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Download all the files into google colab environment
!git clone https://github.com/nokuik/KDDI-IoT-2019.git

Cloning into 'KDDI-IoT-2019'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 42 (delta 1), reused 9 (delta 1), pack-reused 33[K
Receiving objects: 100% (42/42), 776.84 MiB | 21.34 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Updating files: 100% (31/31), done.


In [3]:
# List all '.tar.gz files in ipfix directory
%cd KDDI-IoT-2019
%cd ipfix
!ls *.tar.gz

/content/KDDI-IoT-2019
/content/KDDI-IoT-2019/ipfix
 amazon_echo_gen2.tar.gz		        nature_remo.tar.gz
 au_network_camera.tar.gz		        panasonic_doorphone.tar.gz
 au_wireless_adapter.tar.gz		        philips_hue_bridge.tar.gz
 bitfinder_awair_breathe_easy.tar.gz	       'planex_camera_one_shot!.tar.gz'
 candy_house_sesami_wi-fi_access_point.tar.gz   planex_smacam_outdoor.tar.gz
 irobot_roomba.tar.gz			        planex_smacam_pantilt.tar.gz
 jvc_kenwood_cu-hb1.tar.gz		        powerelectric_wi-fi_plug.tar.gz
 jvc_kenwood_hdtv_ip_camera.tar.gz	        qrio_hub.tar.gz
 line_clova_wave.tar.gz			        sony_network_camera.tar.gz
 link_japan_eremote.tar.gz		        sony_smart_speaker.tar.gz
 mouse_computer_room_hub.tar.gz		        xiaomi_mijia_led.tar.gz


In [4]:
# List all tar.gz files and store them in a variable
files = !ls -1 *.tar.gz

# Extract each tar.gz file
for file in files:
    print(f"Extract {file} ")
    !tar -xzvf {file}

Extract amazon_echo_gen2.tar.gz 
amazon_echo_gen2.json
Extract au_network_camera.tar.gz 
au_network_camera.json
Extract au_wireless_adapter.tar.gz 
au_wireless_adapter.json
Extract bitfinder_awair_breathe_easy.tar.gz 
bitfinder_awair_breathe_easy.json
Extract candy_house_sesami_wi-fi_access_point.tar.gz 
candy_house_sesami_wi-fi_access_point.json
Extract irobot_roomba.tar.gz 
irobot_roomba.json
Extract jvc_kenwood_cu-hb1.tar.gz 
jvc_kenwood_cu-hb1.json
Extract jvc_kenwood_hdtv_ip_camera.tar.gz 
jvc_kenwood_hdtv_ip_camera.json
Extract line_clova_wave.tar.gz 
line_clova_wave.json
Extract link_japan_eremote.tar.gz 
link_japan_eremote.json
Extract mouse_computer_room_hub.tar.gz 
mouse_computer_room_hub.json
Extract nature_remo.tar.gz 
nature_remo.json
Extract panasonic_doorphone.tar.gz 
panasonic_doorphone.json
Extract philips_hue_bridge.tar.gz 
philips_hue_bridge.json
Extract 'planex_camera_one_shot!.tar.gz' 
planex_camera_one_shot!.json
Extract planex_smacam_outdoor.tar.gz 
planex_smacam

In [6]:
# Print the current working directory
print("Current Working Directory:", os.getcwd())

# List the contents of the current working directory
print("Contents of Current Directory:", os.listdir())

!cd

Current Working Directory: /content/KDDI-IoT-2019/ipfix
Contents of Current Directory: ['sony_network_camera.tar.gz', 'planex_smacam_outdoor.tar.gz', 'line_clova_wave.json', 'candy_house_sesami_wi-fi_access_point.tar.gz', 'mouse_computer_room_hub.json', 'qrio_hub.json', 'irobot_roomba.json', 'sony_bravia.tar.gz01', 'sony_smart_speaker.json', 'philips_hue_bridge.tar.gz', 'google_home_gen1.tar.gz00', 'qrio_hub.tar.gz', 'google_home_gen1.tar.gz01', 'i-o_data_qwatch.tar.gz01', 'philips_hue_bridge.json', 'planex_camera_one_shot!.tar.gz', 'amazon_echo_gen2.json', 'candy_house_sesami_wi-fi_access_point.json', 'planex_smacam_outdoor.json', 'jvc_kenwood_cu-hb1.tar.gz', 'au_network_camera.json', 'planex_camera_one_shot!.json', 'irobot_roomba.tar.gz', 'xiaomi_mijia_led.json', 'powerelectric_wi-fi_plug.tar.gz', 'jvc_kenwood_hdtv_ip_camera.json', 'bitfinder_awair_breathe_easy.json', 'mouse_computer_room_hub.tar.gz', 'line_clova_wave.tar.gz', 'sony_bravia.tar.gz00', 'jvc_kenwood_cu-hb1.json', 'au_wi

In [7]:
# Code to generate distinct tables for each json file using a limited subset

# Define the directory where the JSON files are located
json_directory = '/content/KDDI-IoT-2019/ipfix'

# Get the list of all JSON files in the directory
json_files = [f for f in os.listdir(json_directory) if f.endswith('.json')]

# Create distinct tables for each json file
tables = {}
for json_file in json_files:
    # strip .json suffix from device names
    device_name = json_file.split('.')[0]
    # Construct the full path to the JSON file
    json_path = os.path.join(json_directory, json_file)
    # Read the JSON file into a DataFrame, normalize the 'flows' column, and get the first 1000 rows
    df = pd.json_normalize(pd.read_json(json_path, lines=True, nrows=1000)['flows'])
    # Label the DataFrame with the device name
    df['Device'] = device_name
    tables[device_name] = df

# Concatenate all the Dataframes in the tables dictionary into a single Dataframe
df = pd.concat(tables.values(), ignore_index=True)
df.head()


Unnamed: 0,flowStartMilliseconds,flowEndMilliseconds,flowDurationMilliseconds,reverseFlowDeltaMilliseconds,protocolIdentifier,sourceIPv4Address,sourceTransportPort,packetTotalCount,octetTotalCount,flowAttributes,...,reverseNonEmptyPacketCount,reverseDataByteCount,reverseAverageInterarrivalTime,reverseFirstNonEmptyPacketSize,reverseLargePacketCount,reverseMaxPacketSize,reverseStandardDeviationPayloadLength,reverseStandardDeviationInterarrivalTime,reverseBytesPerPacket,Device
0,2019-06-25 08:36:41.276,2019-06-25 08:36:41.855,0.579,0.0,17,192.168.1.1,47452,2,156,1,...,,,,,,,,,,line_clova_wave
1,2019-06-25 08:37:43.962,2019-06-25 08:37:44.541,0.579,0.0,17,192.168.1.1,36500,2,156,1,...,,,,,,,,,,line_clova_wave
2,2019-06-25 08:38:33.938,2019-06-25 08:38:33.996,0.058,0.058,17,192.168.1.179,123,1,76,0,...,1.0,48.0,0.0,48.0,0.0,48.0,0.0,0.0,48.0,line_clova_wave
3,2019-06-25 08:38:46.701,2019-06-25 08:38:47.280,0.579,0.0,17,192.168.1.1,52465,2,156,1,...,,,,,,,,,,line_clova_wave
4,2019-06-25 08:39:49.253,2019-06-25 08:39:49.809,0.556,0.0,17,192.168.1.1,55452,2,156,1,...,,,,,,,,,,line_clova_wave


In [None]:
"""# Reference: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN

 Reduce Memory Usage
  This code is used to reduce memory usage of our dataframe useful when running
  large datasets by reducing the size of the properties of the dataset by selecting
  smaller datatypes


def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df"""

'# Reference: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN\n\n Reduce Memory Usage\n  This code is used to reduce memory usage of our dataframe useful when running\n  large datasets by reducing the size of the properties of the dataset by selecting\n  smaller datatypes\n\n\ndef reduce_memory_usage(df):\n    start_mem = df.memory_usage().sum() / 1024**2\n    print(\'Memory usage of dataframe is {:.2f} MB\'.format(start_mem))\n\n    for col in df.columns:\n        col_type = df[col].dtype.name\n        if ((col_type != \'datetime64[ns]\') & (col_type != \'category\')):\n            if (col_type != \'object\'):\n                c_min = df[col].min()\n                c_max = df[col].max()\n\n                if str(col_type)[:3] == \'int\':\n                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:\n                        df[col] = df[col].astype(np.int8)\n                    elif c_min > np.iinfo(np.int16).min and c_max

### Data Transformation

In [10]:
# Define the list of columns to be dropped
drop_columns = ['flowStartMilliseconds',
                'flowEndMilliseconds',
                'sourceIPv4Address',
                'sourceMacAddress',
                'destinationMacAddress'
]

# Drop the columns from the dataset
df = df.drop(columns=drop_columns)

In [11]:
"""
  Encode and transform categorical values since Decision Tree & Random Forest can't handle non-numeric
  categorical data directly
"""
# Ordinal encoding for IP addresses
def ip_to_ordinal(df, col_name):
    # Generate unique codes for each unique IP address
    codes, uniques = pd.factorize(df[col_name])
    df[col_name] = codes
    return df

#df = ip_to_ordinal(df, 'sourceIPv4Address')
df = ip_to_ordinal(df, 'destinationIPv4Address')

# Converting hex to int
for col in ['tcpSequenceNumber', 'reverseTcpSequenceNumber', 'vlanId', 'ipClassOfService']:
    df[col] = df[col].apply(lambda x: int(x, 16))

# One-hot encoding for categorical attributes without ordinal relationship
one_hot_cols = ['flowAttributes', 'initialTCPFlags', 'unionTCPFlags', 'reverseInitialTCPFlags',
                'reverseUnionTCPFlags', 'reverseFlowAttributes', 'collectorName', 'flowEndReason']

df = pd.get_dummies(df, columns=one_hot_cols)

# Label encoding for  the attributes that can be label encoded without any issues
label_cols = ['firstEightNonEmptyPacketDirections', 'Device']
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le


In [12]:
# Separate the Device column as label
labels_df = df['Device'].copy()

# Drop the Device column from the original DataFrame
df = df.drop(columns=['Device'])

X = df
y = labels_df

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Applying the Decision Tree and Random Forest Algorithm

#### Assessing the Performance of the Multi-Class Classifier
- The print_score function is designmed to evaluate a multi-class classifier's performance using F1 score and ROC-AUC curve.
- We're using the 'weighted' average for the F1 score and 'ovr' (One-vs-rest) approach for the multi-class ROC-AUC.

In [13]:
# Evaluate how well a classifier model is performing on the training data
def print_score(cls, X_train, y_train, X_test, y_test, train=True):
    if train:
        X, y = X_train, y_train
        data_type = "Train"
    else:
        X, y = X_test, y_test
        data_type = "Test"

    # The model 'cls' uses the features X to make predictions
    pred_y = cls.predict(X)
    prob = cls.predict_proba(X)

    # Compute 'weighted' F1-Score
    f1Score = f1_score(y, pred_y, average='weighted')
    # Compute ROC-AUC for each class using 'One-vs-rest'
    roc_auc = roc_auc_score(y, prob, average='weighted', multi_class='ovr')

    # Print results
    print(f"=== {data_type} Data ===")
    print(f"Weighted F1 Score = {f1Score:.4f}\n")
    print(f'ROC-AUC score: {roc_auc:.4f}\n')
    print("_______________________________________________")
    print(classification_report(y, pred_y))
    print("\n")

Treating missing values: Using Imputation to fill in the missing values

In [14]:
# Using imputation to fill in the missing values
# Create impute object
imputer = SimpleImputer()

# Fit imputer on training data
imputer.fit(X_train)

# Impute training data
X_train = imputer.transform(X_train)

# Impute the test data
X_test = imputer.transform(X_test)

### Decision Tree Classifier

In [15]:
# Create Decision Tree Classifier object
dec_tree = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
dec_tree.fit(X_train, y_train)

# Print model performance on training data
print_score(dec_tree, X_train, y_train, X_test, y_test, train=True)

# Print model performance on test data
print_score(dec_tree, X_train, y_train, X_test, y_test, train=False)

=== Train Data ===
Weighted F1 Score = 0.9999

ROC-AUC score: 1.0000

_______________________________________________
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       794
           1       1.00      1.00      1.00       798
           2       1.00      1.00      1.00       786
           3       1.00      1.00      1.00       802
           4       1.00      1.00      1.00       792
           5       1.00      1.00      1.00       781
           6       1.00      1.00      1.00       799
           7       1.00      1.00      1.00       807
           8       1.00      1.00      1.00       820
           9       1.00      1.00      1.00       797
          10       1.00      1.00      1.00       831
          11       1.00      1.00      1.00       806
          12       1.00      1.00      1.00       814
          13       1.00      1.00      1.00       804
          14       1.00      1.00      1.00       786
          15     

### Random Forest

### Random Forest Hyperparameter tuning

In [17]:
features = list(X_train.columns)
# Extract feature importance
feature_importance_rf = random_search.feature_importances_
feature_importance = pd.DataFrame({'Feature': features, 'Importance': feature_importance_rf})

AttributeError: ignored

In [16]:
# Defining the hyperparameters
param_dist = {
    'n_estimators': randint(50, 250),
    'max_depth': [None] + list(randint(5, 50).rvs(10)),
    'min_samples_split': randint(2, 50),
    'min_samples_leaf': randint(1, 50),
    'bootstrap': [True, False],
    'max_features': ['auto', 'sqrt', 'log2'],
    'class_weight': [None, 'balanced']
}

# Initializing the Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Initializing the RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_dist,
                                   n_iter=10, scoring='f1_weighted',
                                   cv=5, random_state=42)

# Fit the RandomizedSearchCV onto the train dataset
random_search.fit(X_train, y_train)



# Print the best parameters from the RandomizedSearchCV
random_cv = random_search.best_estimator_
print(f"Best parameters: {random_search.best_params_}")


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best parameters: {'bootstrap': False, 'class_weight': None, 'max_depth': 38, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 25, 'n_estimators': 207}


In [None]:
# Evaluate the best Random Forest model for the test dataset
print_score(random_cv, X_train, y_train, X_test, y_test, train=False)

=== Test Data ===
Weighted F1 Score = 0.9828

ROC-AUC score: 0.9990

_______________________________________________
              precision    recall  f1-score   support

           0       0.99      0.99      0.99        90
           1       0.99      1.00      0.99        99
           2       0.94      0.93      0.94       105
           3       1.00      0.99      1.00       107
           4       1.00      1.00      1.00       105
           5       1.00      0.99      1.00       104
           6       1.00      0.96      0.98       121
           7       1.00      0.98      0.99       106
           8       1.00      0.99      1.00       105
           9       0.99      1.00      1.00       101
          10       1.00      1.00      1.00        85
          11       0.98      1.00      0.99        88
          12       1.00      0.94      0.97       103
          13       0.93      0.95      0.94       101
          14       1.00      1.00      1.00        94
          15      

#### Feature Importance
Lets try look at the feature importances of the random forest

### Logistic Regression

In [None]:
# Scale Features in Logistic Regression
# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler only on the training data
X_train_scaled = scaler.fit_transform(X_train)

# Use the scaler to transform the test data
X_test_scaled = scaler.transform(X_test)

In [None]:
# Initialize the logistic regression model
logistic_reg = LogisticRegression(max_iter=1000, multi_class='ovr', solver='liblinear', random_state=42)

# Train the logistic regression model
logistic_reg.fit(X_train_scaled, y_train)


In [None]:
# Evaluate the logistic regression model on training data
print_score(logistic_reg, X_train_scaled, y_train, X_test_scaled, y_test, train=True)

# Evaluate the logistic regression model on test data
print_score(logistic_reg, X_train_scaled, y_train, X_test_scaled, y_test, train=False)

=== Train Data ===
Weighted F1 Score = 0.5975

ROC-AUC score: 0.9592

_______________________________________________
              precision    recall  f1-score   support

           0       0.92      0.52      0.66       910
           1       0.64      0.12      0.20       901
           2       0.99      0.65      0.79       895
           3       0.38      0.20      0.26       893
           4       0.19      0.75      0.30       895
           5       0.52      0.99      0.68       896
           6       0.86      0.91      0.89       879
           7       0.35      0.03      0.06       894
           8       0.58      0.93      0.71       895
           9       0.92      0.06      0.12       899
          10       1.00      1.00      1.00       915
          11       0.55      0.82      0.66       912
          12       0.79      0.84      0.81       897
          13       0.86      0.11      0.19       899
          14       1.00      1.00      1.00       906
          15     

### LightGBM