## Imports and Miscellaneous

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
import warnings
from pathlib import Path
import time
from datetime import datetime
import pickle
import json
import os

# Ignore every FutureWarning raised through Python's warnings machinery so that
# such messages do not clutter the cell outputs below.
warnings.simplefilter("ignore", FutureWarning)
# NOTE(review): presumably intended to suppress the same warnings at the IPython
# application level — confirm that `Application.warn_ignores` is a real
# configurable trait; if it is not, this line has no effect.
%config Application.warn_ignores = FutureWarning


## Notebook Start Time

In [2]:
# Wall-clock timestamp (seconds since the epoch) taken at the start of the run;
# the last cell subtracts it from the end time to print the notebook duration.
notebook_start_time = time.time()

## Data

In [3]:
def create_mem_map(parquet_file, data_type='train'):
    """Export one parquet split to ``.npy`` files under ``./mem_map_files``.

    Writes the raw feature matrix, a min-max-scaled copy of it, and the three
    integer-encoded label vectors (``class_label``, ``category_label``,
    ``attack_label``) so they can be reopened later with ``np.load``.

    The scaler and the label encodings are fitted on the ``'train'`` split only
    and persisted next to the arrays (``scaler.pkl`` and
    ``encoder_{2,6,19}_classes.npy``). Every other split reuses those fitted
    artefacts, so all splits share one feature scale and one label-to-integer
    mapping. Process the ``'train'`` split first; if the artefacts are missing
    for another split, a warning is issued and they are fitted on that split.

    Parameters
    ----------
    parquet_file : str or path-like
        Parquet file containing the feature columns plus the columns
        ``label``, ``class_label``, ``category_label`` and ``attack_label``.
    data_type : str, default 'train'
        Split name; it is embedded in the output file names
        (``X_{data_type}.npy``, ``X_{data_type}_scaled.npy``,
        ``y_{data_type}_{2,6,19}.npy``).

    Raises
    ------
    ValueError
        If a non-train split contains a label that is absent from the
        persisted training classes (raised by ``LabelEncoder.transform``).
    """
    df = pd.read_parquet(parquet_file)

    mem_map_dir = Path('./mem_map_files')
    mem_map_dir.mkdir(exist_ok=True)
    is_train = data_type == 'train'

    # Everything except the four label columns is a feature; `label` itself is
    # dropped and not exported.
    X = df.drop(columns=['label', 'class_label', 'category_label', 'attack_label']).to_numpy()
    np.save(mem_map_dir / f'X_{data_type}.npy', X)

    # Fit the scaler on the training split only and reuse it for every other
    # split, so that train and test features live on the same scale.
    scaler_file = mem_map_dir / 'scaler.pkl'
    if is_train or not scaler_file.exists():
        if not is_train:
            warnings.warn(
                f"No fitted scaler found at {scaler_file}; fitting on the '{data_type}' split instead. "
                "Process the 'train' split first so that every split shares one scale.",
                stacklevel=2,
            )
        scaler = MinMaxScaler().fit(X)
        if is_train:
            with open(scaler_file, 'wb') as f:
                pickle.dump(scaler, f)
    else:
        # The pickle is written by this function itself; never point this at
        # a file from an untrusted source.
        with open(scaler_file, 'rb') as f:
            scaler = pickle.load(f)
    np.save(mem_map_dir / f'X_{data_type}_scaled.npy', scaler.transform(X))

    # Same policy for the labels: the training split defines the class list,
    # other splits are encoded against it instead of overwriting it.
    for suffix, column in ((2, 'class_label'), (6, 'category_label'), (19, 'attack_label')):
        classes_file = mem_map_dir / f'encoder_{suffix}_classes.npy'
        encoder = LabelEncoder()
        if is_train or not classes_file.exists():
            if not is_train:
                warnings.warn(
                    f"No persisted classes found at {classes_file}; fitting the '{column}' encoder "
                    f"on the '{data_type}' split instead. Process the 'train' split first.",
                    stacklevel=2,
                )
            y = encoder.fit_transform(df[column])
            np.save(classes_file, encoder.classes_)
        else:
            encoder.classes_ = np.load(classes_file, allow_pickle=True)
            y = encoder.transform(df[column])
        np.save(mem_map_dir / f'y_{data_type}_{suffix}.npy', y)

In [4]:
# Rebuild the .npy files only when at least one of the files loaded in the next
# cell is missing. A fresh checkout therefore works with Restart & Run All,
# while an existing cache is reused as-is. Delete ./mem_map_files to force a rebuild.
mem_map_created = all(
    (Path('./mem_map_files') / file_name).exists()
    for file_name in (
        'X_train.npy', 'X_train_scaled.npy',
        'y_train_2.npy', 'y_train_6.npy', 'y_train_19.npy',
        'X_test.npy', 'X_test_scaled.npy',
        'y_test_2.npy', 'y_test_6.npy', 'y_test_19.npy',
        'encoder_2_classes.npy', 'encoder_6_classes.npy', 'encoder_19_classes.npy',
    )
)

if not mem_map_created:
    # The train split must be processed before the test split.
    create_mem_map('./data/cic_iomt_2024_wifi_mqtt_train.parquet', data_type='train')
    create_mem_map('./data/cic_iomt_2024_wifi_mqtt_test.parquet', data_type='test')
    

In [5]:
# Single location for every exported array; all loads below go through it.
mem_map_dir = Path('./mem_map_files')

# mmap_mode='r' opens each array as a read-only memory map backed by the file
# on disk instead of reading it fully into memory.
X_train = np.load(mem_map_dir / 'X_train.npy', mmap_mode='r')
X_train_scaled = np.load(mem_map_dir / 'X_train_scaled.npy', mmap_mode='r')

y_train_2 = np.load(mem_map_dir / 'y_train_2.npy', mmap_mode='r')
y_train_6 = np.load(mem_map_dir / 'y_train_6.npy', mmap_mode='r')
y_train_19 = np.load(mem_map_dir / 'y_train_19.npy', mmap_mode='r')


X_test = np.load(mem_map_dir / 'X_test.npy', mmap_mode='r')
X_test_scaled = np.load(mem_map_dir / 'X_test_scaled.npy', mmap_mode='r')

y_test_2 = np.load(mem_map_dir / 'y_test_2.npy', mmap_mode='r')
y_test_6 = np.load(mem_map_dir / 'y_test_6.npy', mmap_mode='r')
y_test_19 = np.load(mem_map_dir / 'y_test_19.npy', mmap_mode='r')

# Class-name lookup tables: position i holds the original label for encoded value i.
# allow_pickle=True is presumably needed because the names are stored as an
# object array — only load files produced by this notebook.
encoder_2_classes = np.load(mem_map_dir / 'encoder_2_classes.npy', allow_pickle=True)
encoder_6_classes = np.load(mem_map_dir / 'encoder_6_classes.npy', allow_pickle=True)
encoder_19_classes = np.load(mem_map_dir / 'encoder_19_classes.npy', allow_pickle=True)

## Benchmarks

We use the hyperparameters from the original CICIoMT2024 study to establish classification benchmarks on the dataset. We will then use particle swarm optimization (PSO) to select the best features and compare the results against these benchmarks.

### Benchmark Function

In [6]:
def calculate_benchmark(model='rf', X_train=None, X_test=None, y_train=None, y_test=None):
    """Fit one benchmark classifier and evaluate it on the test split.

    Parameters
    ----------
    model : {'rf', 'ada', 'lr'}, default 'rf'
        Which classifier to build: random forest, AdaBoost over depth-5
        decision trees, or L2-regularised logistic regression.
    X_train, X_test : array-like
        Feature matrices used for fitting and for prediction.
    y_train, y_test : array-like of int
        Integer-encoded labels; they are used as indices into the module-level
        ``encoder_2_classes`` / ``encoder_6_classes`` / ``encoder_19_classes``
        arrays to recover the class names for the report.

    Returns
    -------
    report : dict
        ``classification_report(..., output_dict=True)`` keyed by class name.
    elapsed : float
        Seconds spent in ``fit`` plus ``predict`` (both are inside the timer).

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported keys.
    """
    if model == 'rf':
        classifier = RandomForestClassifier(n_estimators=10, max_depth=15, n_jobs=-1, random_state=1984)

    elif model == 'ada':
        # NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
        # confirm the installed version still accepts it before upgrading.
        classifier = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=5), n_estimators=10, algorithm='SAMME.R', random_state=1984)

    elif model == 'lr':
        classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
                                        intercept_scaling=1, solver='lbfgs', max_iter=2000,
                                        warm_start=False, n_jobs=-1)

    else:
        # Fail immediately with a clear message instead of hitting an unbound
        # `classifier` further down.
        raise ValueError(f"Unknown model {model!r}; expected one of 'rf', 'ada', 'lr'.")

    algo_start = time.time()
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    algo_end = time.time()

    # The number of distinct training labels selects the class-name table;
    # anything other than 2 or 6 falls back to the 19-class table.
    n_classes = np.unique(y_train).size
    if n_classes == 2:
        class_names = encoder_2_classes
    elif n_classes == 6:
        class_names = encoder_6_classes
    else:
        class_names = encoder_19_classes

    report = classification_report(class_names[y_test], class_names[y_pred], output_dict=True)

    return report, algo_end - algo_start

### Logistic Regression

#### Binary

In [7]:
# Logistic regression, binary labels, trained and evaluated on the scaled features.
lr_bk_report_2, lr_bk_time_2 = calculate_benchmark(
    model='lr',
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train_2,
    y_test=y_test_2,
)
print(pd.DataFrame(lr_bk_report_2).T)
print(lr_bk_time_2)

              precision    recall  f1-score       support
Attack         0.996752  0.997507  0.997129  1.576575e+06
Benign         0.892047  0.863749  0.877670  3.760700e+04
accuracy       0.994390  0.994390  0.994390  9.943903e-01
macro avg      0.944400  0.930628  0.937400  1.614182e+06
weighted avg   0.994313  0.994390  0.994346  1.614182e+06
32.510417222976685


#### 6 Classes

In [8]:
# Logistic regression, 6-category labels, trained and evaluated on the scaled features.
lr_bk_report_6, lr_bk_time_6 = calculate_benchmark(
    model='lr',
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train_6,
    y_test=y_test_6,
)
print(pd.DataFrame(lr_bk_report_6).T)
print(lr_bk_time_6)

              precision    recall  f1-score       support
Benign         0.882780  0.876113  0.879434  3.760700e+04
DDos           0.745593  0.953240  0.836726  1.066764e+06
Dos            0.597573  0.189349  0.287575  4.166760e+05
MQTT           0.984789  0.979534  0.982154  6.371500e+04
Recon          0.903569  0.505817  0.648567  2.767600e+04
Spoofing       0.330959  0.399656  0.362078  1.744000e+03
accuracy       0.747025  0.747025  0.747025  7.470248e-01
macro avg      0.740877  0.650618  0.666089  1.614182e+06
weighted avg   0.722283  0.747025  0.697968  1.614182e+06
1397.1565124988556


#### 19 Classes

In [9]:
# Logistic regression, 19-attack labels, trained and evaluated on the scaled features.
lr_bk_report_19, lr_bk_time_19 = calculate_benchmark(
    model='lr',
    X_train=X_train_scaled,
    X_test=X_test_scaled,
    y_train=y_train_19,
    y_test=y_test_19,
)
print(pd.DataFrame(lr_bk_report_19).T)
print(lr_bk_time_19)

                    precision    recall  f1-score       support
ARP_spoofing         0.280689  0.457569  0.347940  1.744000e+03
Benign               0.869390  0.900391  0.884619  3.760700e+04
DDoS_ICMP            0.781447  0.998353  0.876682  3.496990e+05
DDoS_SYN             0.800830  0.780414  0.790490  1.723970e+05
DDoS_TCP             0.691320  0.998253  0.816907  1.825980e+05
DDoS_UDP             0.724111  0.992026  0.837156  3.620700e+05
DDoS_connect_flood   0.984193  0.993749  0.988948  4.191600e+04
DDoS_publish_flood   0.969900  0.103375  0.186836  8.416000e+03
DoS_ICMP             0.556285  0.006024  0.011920  9.843200e+04
DoS_SYN              0.656406  0.753314  0.701529  9.859500e+04
DoS_TCP              0.800623  0.009391  0.018565  8.209600e+04
DoS_UDP              0.143439  0.005118  0.009883  1.375530e+05
DoS_connect_flood    0.921734  0.944107  0.932786  3.131000e+03
DoS_publish_flood    0.533244  0.986361  0.692247  8.505000e+03
Malformed_date       0.713755  0.109903 

### AdaBoost

#### Binary

In [10]:
# AdaBoost, binary labels, trained and evaluated on the unscaled features.
ada_bk_report_2, ada_bk_time_2 = calculate_benchmark(
    model='ada',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_2,
    y_test=y_test_2,
)
print(pd.DataFrame(ada_bk_report_2).T)
print(ada_bk_time_2)

              precision    recall  f1-score       support
Attack         0.998901  0.999311  0.999106  1.576575e+06
Benign         0.970591  0.953918  0.962182  3.760700e+04
accuracy       0.998253  0.998253  0.998253  9.982530e-01
macro avg      0.984746  0.976614  0.980644  1.614182e+06
weighted avg   0.998242  0.998253  0.998246  1.614182e+06
392.4339990615845


#### 6 Classes

In [11]:
# AdaBoost, 6-category labels, trained and evaluated on the unscaled features.
ada_bk_report_6, ada_bk_time_6 = calculate_benchmark(
    model='ada',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_6,
    y_test=y_test_6,
)
print(pd.DataFrame(ada_bk_report_6).T)
print(ada_bk_time_6)

              precision    recall  f1-score       support
Benign         0.974289  0.931050  0.952179  3.760700e+04
DDos           0.999610  0.999824  0.999717  1.066764e+06
Dos            0.999575  0.998879  0.999227  4.166760e+05
MQTT           0.998118  0.990379  0.994233  6.371500e+04
Recon          0.908142  0.980561  0.942963  2.767600e+04
Spoofing       0.627128  0.633601  0.630348  1.744000e+03
accuracy       0.996879  0.996879  0.996879  9.968789e-01
macro avg      0.917810  0.922382  0.919778  1.614182e+06
weighted avg   0.996981  0.996879  0.996894  1.614182e+06
402.2824373245239


#### 19 Classes

In [12]:
# AdaBoost, 19-attack labels, trained and evaluated on the unscaled features.
ada_bk_report_19, ada_bk_time_19 = calculate_benchmark(
    model='ada',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_19,
    y_test=y_test_19,
)
print(pd.DataFrame(ada_bk_report_19).T)
print(ada_bk_time_19)

                    precision    recall  f1-score       support
ARP_spoofing         0.802332  0.828555  0.815233  1.744000e+03
Benign               0.949509  0.943096  0.946291  3.760700e+04
DDoS_ICMP            0.998082  0.999920  0.999000  3.496990e+05
DDoS_SYN             0.999623  0.998422  0.999022  1.723970e+05
DDoS_TCP             0.999627  0.998713  0.999170  1.825980e+05
DDoS_UDP             0.996418  0.639208  0.778807  3.620700e+05
DDoS_connect_flood   0.999905  0.999881  0.999893  4.191600e+04
DDoS_publish_flood   0.999761  0.994415  0.997081  8.416000e+03
DoS_ICMP             0.992273  0.999319  0.995784  9.843200e+04
DoS_SYN              0.999057  0.999300  0.999179  9.859500e+04
DoS_TCP              0.997012  0.999769  0.998388  8.209600e+04
DoS_UDP              0.511410  0.989408  0.674290  1.375530e+05
DoS_connect_flood    1.000000  0.998722  0.999361  3.131000e+03
DoS_publish_flood    0.994732  0.999059  0.996891  8.505000e+03
Malformed_date       0.975637  0.504293 

### Random Forest

#### Binary

In [13]:
# Random forest, binary labels, trained and evaluated on the unscaled features.
rf_bk_report_2, rf_bk_time_2 = calculate_benchmark(
    model='rf',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_2,
    y_test=y_test_2,
)
print(pd.DataFrame(rf_bk_report_2).T)
print(rf_bk_time_2)

              precision    recall  f1-score       support
Attack         0.998940  0.999302  0.999121  1.576575e+06
Benign         0.970299  0.955567  0.962877  3.760700e+04
accuracy       0.998283  0.998283  0.998283  9.982833e-01
macro avg      0.984620  0.977435  0.980999  1.614182e+06
weighted avg   0.998273  0.998283  0.998277  1.614182e+06
19.148553371429443


#### 6 Classes

In [14]:
# Random forest, 6-category labels, trained and evaluated on the unscaled features.
rf_bk_report_6, rf_bk_time_6 = calculate_benchmark(
    model='rf',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_6,
    y_test=y_test_6,
)
print(pd.DataFrame(rf_bk_report_6).T)
print(rf_bk_time_6)

              precision    recall  f1-score       support
Benign         0.956044  0.996490  0.975848  3.760700e+04
DDos           0.999829  0.999960  0.999895  1.066764e+06
Dos            0.999904  0.999561  0.999732  4.166760e+05
MQTT           0.999826  0.991446  0.995618  6.371500e+04
Recon          0.994308  0.965674  0.979782  2.767600e+04
Spoofing       0.868952  0.741399  0.800124  1.744000e+03
accuracy       0.998573  0.998573  0.998573  9.985727e-01
macro avg      0.969810  0.949088  0.958500  1.614182e+06
weighted avg   0.998592  0.998573  0.998563  1.614182e+06
16.590628623962402


#### 19 Classes

In [15]:
# Random forest, 19-attack labels, trained and evaluated on the unscaled features.
rf_bk_report_19, rf_bk_time_19 = calculate_benchmark(
    model='rf',
    X_train=X_train,
    X_test=X_test,
    y_train=y_train_19,
    y_test=y_test_19,
)
print(pd.DataFrame(rf_bk_report_19).T)
print(rf_bk_time_19)

                    precision    recall  f1-score       support
ARP_spoofing         0.572824  0.773509  0.658209  1.744000e+03
Benign               0.958927  0.980270  0.969481  3.760700e+04
DDoS_ICMP            0.999474  0.999963  0.999718  3.496990e+05
DDoS_SYN             0.999779  0.996276  0.998024  1.723970e+05
DDoS_TCP             0.998567  0.999710  0.999138  1.825980e+05
DDoS_UDP             0.998882  0.999484  0.999183  3.620700e+05
DDoS_connect_flood   0.999809  0.999928  0.999869  4.191600e+04
DDoS_publish_flood   0.999070  0.893417  0.943294  8.416000e+03
DoS_ICMP             0.999725  0.998740  0.999233  9.843200e+04
DoS_SYN              0.999635  0.999716  0.999675  9.859500e+04
DoS_TCP              0.999769  0.999951  0.999860  8.209600e+04
DoS_UDP              0.999295  0.999295  0.999295  1.375530e+05
DoS_connect_flood    1.000000  0.996806  0.998401  3.131000e+03
DoS_publish_flood    0.905269  1.000000  0.950279  8.505000e+03
Malformed_date       0.988430  0.684602 

#### 19 Classes

## Results

In [16]:
# Directory that receives one .pkl and one .json file per benchmark result.
results_folder = 'results_data'

# exist_ok=True keeps the cell idempotent on re-runs without a separate
# check-then-create step.
os.makedirs(results_folder, exist_ok=True)

In [17]:
# Every benchmark artefact keyed by the variable name it came from:
# a report dict and a timing value per model (lr / ada / rf) and label set (2 / 6 / 19).
data_to_save = dict(
    lr_bk_report_2=lr_bk_report_2,
    lr_bk_time_2=lr_bk_time_2,
    lr_bk_report_6=lr_bk_report_6,
    lr_bk_time_6=lr_bk_time_6,
    lr_bk_report_19=lr_bk_report_19,
    lr_bk_time_19=lr_bk_time_19,
    ada_bk_report_2=ada_bk_report_2,
    ada_bk_time_2=ada_bk_time_2,
    ada_bk_report_6=ada_bk_report_6,
    ada_bk_time_6=ada_bk_time_6,
    ada_bk_report_19=ada_bk_report_19,
    ada_bk_time_19=ada_bk_time_19,
    rf_bk_report_2=rf_bk_report_2,
    rf_bk_time_2=rf_bk_time_2,
    rf_bk_report_6=rf_bk_report_6,
    rf_bk_time_6=rf_bk_time_6,
    rf_bk_report_19=rf_bk_report_19,
    rf_bk_time_19=rf_bk_time_19,
)

In [18]:
# Persist each result as <results_folder>/<name>.pkl.
for result_name, result_value in data_to_save.items():
    pkl_path = os.path.join(results_folder, f'{result_name}.pkl')
    with open(pkl_path, 'wb') as pkl_file:
        pkl_file.write(pickle.dumps(result_value))

In [19]:
def _to_json_builtin(value):
    """Turn NumPy scalars/arrays that ``json`` cannot encode into built-in objects.

    Raises ``TypeError`` for any other type, which is what ``json`` expects
    from a ``default`` hook that cannot handle an object.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# Persist each result as <results_folder>/<name>.json.
for name, data in data_to_save.items():
    file_path = os.path.join(results_folder, f'{name}.json')
    with open(file_path, 'w') as f:
        # The reports are nested dicts; `default` is applied to non-encodable
        # objects at any depth, so NumPy values inside them are handled too.
        json.dump(data, f, indent=4, default=_to_json_builtin)

## Notebook Duration

In [20]:
# Total wall-clock time since the start-time cell, converted from seconds to minutes.
notebook_end_time = time.time()
elapsed_minutes = (notebook_end_time - notebook_start_time) / 60
print(f"Notebook Duration: {elapsed_minutes} minutes")

Notebook Duration: 109.115935921669 minutes
