<a href="https://colab.research.google.com/github/BChun11/DATA3001/blob/main/DATA3001_RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
# Download all the files into google colab environment
!git clone https://github.com/nokuik/KDDI-IoT-2019.git

Cloning into 'KDDI-IoT-2019'...
remote: Enumerating objects: 42, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 42 (delta 1), reused 9 (delta 1), pack-reused 33[K
Receiving objects: 100% (42/42), 776.84 MiB | 32.00 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Updating files: 100% (31/31), done.


In [3]:
# List all '.tar.gz files in ipfix directory
%cd KDDI-IoT-2019
%cd ipfix
!ls *.tar.gz

/content/KDDI-IoT-2019
/content/KDDI-IoT-2019/ipfix
 amazon_echo_gen2.tar.gz		        nature_remo.tar.gz
 au_network_camera.tar.gz		        panasonic_doorphone.tar.gz
 au_wireless_adapter.tar.gz		        philips_hue_bridge.tar.gz
 bitfinder_awair_breathe_easy.tar.gz	       'planex_camera_one_shot!.tar.gz'
 candy_house_sesami_wi-fi_access_point.tar.gz   planex_smacam_outdoor.tar.gz
 irobot_roomba.tar.gz			        planex_smacam_pantilt.tar.gz
 jvc_kenwood_cu-hb1.tar.gz		        powerelectric_wi-fi_plug.tar.gz
 jvc_kenwood_hdtv_ip_camera.tar.gz	        qrio_hub.tar.gz
 line_clova_wave.tar.gz			        sony_network_camera.tar.gz
 link_japan_eremote.tar.gz		        sony_smart_speaker.tar.gz
 mouse_computer_room_hub.tar.gz		        xiaomi_mijia_led.tar.gz


In [4]:
# List all tar.gz files and store them in a variable
files = !ls -1 *.tar.gz

# Extract each tar.gz file
for file in files:
    print(f"Extract {file} ")
    !tar -xzvf {file}

Extract amazon_echo_gen2.tar.gz 
amazon_echo_gen2.json
Extract au_network_camera.tar.gz 
au_network_camera.json
Extract au_wireless_adapter.tar.gz 
au_wireless_adapter.json
Extract bitfinder_awair_breathe_easy.tar.gz 
bitfinder_awair_breathe_easy.json
Extract candy_house_sesami_wi-fi_access_point.tar.gz 
candy_house_sesami_wi-fi_access_point.json
Extract irobot_roomba.tar.gz 
irobot_roomba.json
Extract jvc_kenwood_cu-hb1.tar.gz 
jvc_kenwood_cu-hb1.json
Extract jvc_kenwood_hdtv_ip_camera.tar.gz 
jvc_kenwood_hdtv_ip_camera.json
Extract line_clova_wave.tar.gz 
line_clova_wave.json
Extract link_japan_eremote.tar.gz 
link_japan_eremote.json
Extract mouse_computer_room_hub.tar.gz 
mouse_computer_room_hub.json
Extract nature_remo.tar.gz 
nature_remo.json
Extract panasonic_doorphone.tar.gz 
panasonic_doorphone.json
Extract philips_hue_bridge.tar.gz 
philips_hue_bridge.json
Extract 'planex_camera_one_shot!.tar.gz' 
planex_camera_one_shot!.json
Extract planex_smacam_outdoor.tar.gz 
planex_smacam

In [5]:
# Print the current working directory
print("Current Working Directory:", os.getcwd())

# List the contents of the current working directory
print("Contents of Current Directory:", os.listdir())

!cd

Current Working Directory: /content/KDDI-IoT-2019/ipfix
Contents of Current Directory: ['sony_smart_speaker.tar.gz', 'jvc_kenwood_cu-hb1.json', 'bitfinder_awair_breathe_easy.json', 'mouse_computer_room_hub.tar.gz', 'jvc_kenwood_hdtv_ip_camera.tar.gz', 'google_home_gen1.tar.gz00', 'au_network_camera.tar.gz', 'sony_network_camera.json', 'xiaomi_mijia_led.json', 'candy_house_sesami_wi-fi_access_point.tar.gz', 'planex_camera_one_shot!.json', 'link_japan_eremote.json', 'link_japan_eremote.tar.gz', 'au_wireless_adapter.json', 'planex_smacam_pantilt.tar.gz', 'sony_network_camera.tar.gz', 'planex_smacam_outdoor.json', 'jvc_kenwood_hdtv_ip_camera.json', 'philips_hue_bridge.json', 'jvc_kenwood_cu-hb1.tar.gz', 'sony_bravia.tar.gz01', 'planex_smacam_outdoor.tar.gz', 'qrio_hub.json', 'philips_hue_bridge.tar.gz', 'amazon_echo_gen2.json', 'candy_house_sesami_wi-fi_access_point.json', 'line_clova_wave.json', 'irobot_roomba.json', 'i-o_data_qwatch.tar.gz01', 'au_network_camera.json', 'line_clova_wave.t

In [36]:
# Code to generate distinct tables for each json file using a limited subset

# Define the directory where the JSON files are located
json_directory = '/content/KDDI-IoT-2019/ipfix'

# Get the list of all JSON files in the directory
json_files = [f for f in os.listdir(json_directory) if f.endswith('.json')]

# Create distinct tables for each json file
tables = {}
for json_file in json_files:
    # strip .json suffix from device names
    device_name = json_file.split('.')[0]
    # Construct the full path to the JSON file
    json_path = os.path.join(json_directory, json_file)
    # Read the JSON file into a DataFrame, normalize the 'flows' column, and get the first 1000 rows
    df = pd.json_normalize(pd.read_json(json_path, lines=True, nrows=1000)['flows'])

    # Label the DataFrame with the device name
    df['Device'] = device_name
    tables[device_name] = df

# Concatenate all the Dataframes in the tables dictionary into a single Dataframe
df = pd.concat(tables.values(), ignore_index=True)
df.head()

Unnamed: 0,flowStartMilliseconds,flowEndMilliseconds,flowDurationMilliseconds,reverseFlowDeltaMilliseconds,protocolIdentifier,sourceIPv4Address,sourceTransportPort,packetTotalCount,octetTotalCount,flowAttributes,...,reverseNonEmptyPacketCount,reverseDataByteCount,reverseAverageInterarrivalTime,reverseFirstNonEmptyPacketSize,reverseLargePacketCount,reverseMaxPacketSize,reverseStandardDeviationPayloadLength,reverseStandardDeviationInterarrivalTime,reverseBytesPerPacket,Device
0,2019-06-25 08:36:16.241,2019-06-25 08:36:17.427,1.186,0.178,6,192.168.1.186,55656,14,1841,0,...,7.0,6458.0,100.0,1402.0,6.0,1402.0,565.0,102.0,922.0,jvc_kenwood_cu-hb1
1,2019-06-25 08:36:33.530,2019-06-25 08:36:34.688,1.158,0.172,6,192.168.1.186,56823,14,1841,0,...,7.0,6458.0,98.0,1402.0,6.0,1402.0,565.0,100.0,922.0,jvc_kenwood_cu-hb1
2,2019-06-25 08:36:53.166,2019-06-25 08:36:54.340,1.174,0.178,6,192.168.1.186,55658,14,1841,0,...,7.0,6458.0,110.0,1402.0,6.0,1402.0,565.0,100.0,922.0,jvc_kenwood_cu-hb1
3,2019-06-25 08:37:12.861,2019-06-25 08:37:14.047,1.186,0.176,6,192.168.1.186,55659,14,1841,0,...,7.0,6458.0,112.0,1402.0,6.0,1402.0,565.0,103.0,922.0,jvc_kenwood_cu-hb1
4,2019-06-25 08:37:32.566,2019-06-25 08:37:33.723,1.157,0.173,6,192.168.1.186,56826,14,1841,0,...,7.0,6458.0,82.0,1402.0,6.0,1402.0,565.0,101.0,922.0,jvc_kenwood_cu-hb1


In [7]:
# Reference: https://www.kaggle.com/code/arjanso/reducing-dataframe-memory-size-by-65 @ARJANGROEN

"""Reduce Memory Usage
  This code is used to reduce memory usage of our dataframe useful when running
  large datasets by reducing the size of the properties of the dataset by selecting
  smaller datatypes
"""

def reduce_memory_usage(df):

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        if ((col_type != 'datetime64[ns]') & (col_type != 'category')):
            if (col_type != 'object'):
                c_min = df[col].min()
                c_max = df[col].max()

                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        pass
            else:
                df[col] = df[col].astype('category')
    mem_usg = df.memory_usage().sum() / 1024**2
    print("Memory usage became: ",mem_usg," MB")

    return df

In [None]:
# Apply the reduce memory usage function to our dataset
df = reduce_memory_usage(df)
df.info()

Memory usage of dataframe is 9.23 MB
Memory usage became:  3.813507080078125  MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 55 columns):
 #   Column                                    Non-Null Count  Dtype   
---  ------                                    --------------  -----   
 0   flowStartMilliseconds                     22000 non-null  category
 1   flowEndMilliseconds                       22000 non-null  category
 2   flowDurationMilliseconds                  22000 non-null  float16 
 3   reverseFlowDeltaMilliseconds              22000 non-null  float16 
 4   protocolIdentifier                        22000 non-null  int8    
 5   sourceIPv4Address                         22000 non-null  category
 6   sourceTransportPort                       22000 non-null  int32   
 7   packetTotalCount                          22000 non-null  int16   
 8   octetTotalCount                           22000 non-null  int32   
 9   flowAttribute

### Data Transformation

In [37]:
# Convert variable flowStartMilliseconds & flowEndMilliseconds into datetime type
for column in ['flowStartMilliseconds', 'flowEndMilliseconds']:
    df[column] = pd.to_datetime(df[column])
    df[column] = df[column].apply(lambda x: x.timestamp())

In [38]:
"""
  Encode categorical values since Decision Tree & Random Forest can't handle non-numeric
  categorical data directly
"""
# Identify Object (typically string) Columns
object_col = df.select_dtypes(['object']).columns

# Create a dictionary to store LabelEncoders for each column (useful if you need inverse transformation later)
label_encoders = {}

for col in object_col:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le


In [39]:
# Separate the Device column and flowStartMilliseconds as labels
labels_df = df['Device'].copy()

# Drop the Device column from the original DataFrame
df = df.drop(columns=['Device'])

X = df
y = labels_df

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Applying the Decision Tree and Random Forest Algorithm

#### Assessing the Performance of Classifier

In [40]:
"""
  Prints the accuracy, classification report, and confusion matrix for a classifier on the training or testing data.
  :param cls: Classifier to evaluate
  :param X_train: Training data features
  :param y_train: True labels for training data
  :param X_test: Testing data features
  :param y_test: True labels for testing data
  :param train: Boolean indicating whether to evaluate on the training or testing data
"""
def print_score(cls, X_train, y_train, X_test, y_test, train=True):
    # Check whether we are using the train or test data for evaluation
    if train:
        X, y, data_type = X_train, y_train, "Train"
    else:
        X, y, data_type = X_test, y_test, "Test"

    # The model 'cls' uses the features X to make predictions
    pred = cls.predict(X)

    # Generate reports for performance metrics (Accuracy Score, Classification Report and Confusion Matrix)
    cls_report = pd.DataFrame(classification_report(y, pred, output_dict=True))

    # Print results
    print(f"{data_type} Result:\n{'='*48}")
    print(f"Accuracy Score: {accuracy_score(y, pred) * 100:.2f}%")
    print("_______________________________________________")
    print(f"CLASSIFICATION REPORT:\n{cls_report}")
    print("_______________________________________________")
    print(f"Confusion Matrix: \n {confusion_matrix(y, pred)}\n")


In [41]:
df.info(1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 54 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   flowStartMilliseconds                     22000 non-null  float64
 1   flowEndMilliseconds                       22000 non-null  float64
 2   flowDurationMilliseconds                  22000 non-null  float64
 3   reverseFlowDeltaMilliseconds              22000 non-null  float64
 4   protocolIdentifier                        22000 non-null  int64  
 5   sourceIPv4Address                         22000 non-null  int64  
 6   sourceTransportPort                       22000 non-null  int64  
 7   packetTotalCount                          22000 non-null  int64  
 8   octetTotalCount                           22000 non-null  int64  
 9   flowAttributes                            22000 non-null  int64  
 10  sourceMacAddress                  

#### Decision Tree Classifier

In [45]:
from sklearn.impute import SimpleImputer

# Create an imputer object with a median filling strategy
imputer = SimpleImputer(strategy='median')

# Apply the imputer to our data
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)  # If you've split the data into train and test

# Then, convert the result back to a dataframe
X_train = pd.DataFrame(X_train_imputed, columns=X_train.columns)
X_test = pd.DataFrame(X_test_imputed, columns=X_test.columns)

# Now, train your model
dec_tree = DecisionTreeClassifier(random_state=42)
dec_tree.fit(X_train, y_train)

print_score(dec_tree, X_train, y_train, X_test, y_test, train=True)
print_score(dec_tree, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 100.00%
_______________________________________________
CLASSIFICATION REPORT:
               0      1      2      3      4      5      6      7      8  \
precision    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0   
recall       1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0   
f1-score     1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0    1.0   
support    901.0  897.0  899.0  914.0  910.0  894.0  905.0  889.0  896.0   

               9  ...     15     16     17     18     19     20     21  \
precision    1.0  ...    1.0    1.0    1.0    1.0    1.0    1.0    1.0   
recall       1.0  ...    1.0    1.0    1.0    1.0    1.0    1.0    1.0   
f1-score     1.0  ...    1.0    1.0    1.0    1.0    1.0    1.0    1.0   
support    894.0  ...  897.0  893.0  895.0  895.0  895.0  915.0  879.0   

           accuracy  macro avg  weighted avg  
precision       1.0        1.0           1.0  
recall          1.0        1.0           1.0 