In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import train_test_split
import warnings
from joblib import Parallel, delayed
import time

## Global Variables

In [21]:
# Key of the classifier that fitness_function evaluates ('lr', 'ada' or 'rf');
# starts empty and has to be set before the GA is run.
CLASSIFIER = ""

## Notebook Start Time

In [22]:
# Wall-clock reference (seconds since the epoch) taken when the notebook starts running
start_time = time.time()

## Data

In [35]:
# Fraction of each split that is kept in the "sample" frames
my_frac = 0.01

# Columns holding the raw label or one of the three targets; never used as features
label_columns = ['label', 'class_label', 'category_label', 'attack_label']

# Load dataset as pandas DataFrame
df_train = pd.read_parquet('./data/cic_iomt_2024_wifi_mqtt_train.parquet')
df_test = pd.read_parquet('./data/cic_iomt_2024_wifi_mqtt_test.parquet')

# Create sample DataFrames for feature selection (fixed seed so the draw is repeatable)
df_train_sample = df_train.sample(frac=my_frac, random_state=1984)
df_test_sample = df_test.sample(frac=my_frac, random_state=1984)

# Sample X and y as numpy arrays.
# Suffix _2 / _6 / _19 = binary, category and attack-level targets respectively.
X_train_sample = df_train_sample.drop(columns=label_columns).to_numpy()
y_train_sample_2 = df_train_sample['class_label'].to_numpy()
y_train_sample_6 = df_train_sample['category_label'].to_numpy()
y_train_sample_19 = df_train_sample['attack_label'].to_numpy()

X_test_sample = df_test_sample.drop(columns=label_columns).to_numpy()
y_test_sample_2 = df_test_sample['class_label'].to_numpy()
y_test_sample_6 = df_test_sample['category_label'].to_numpy()
y_test_sample_19 = df_test_sample['attack_label'].to_numpy()

# Full-data X and y as numpy arrays.
# These are built from the complete frames (df_train / df_test); the previous
# version read the *_sample frames here, so "full" silently meant the 1% sample.
X_train_full = df_train.drop(columns=label_columns).to_numpy()
y_train_full_2 = df_train['class_label'].to_numpy()
y_train_full_6 = df_train['category_label'].to_numpy()
y_train_full_19 = df_train['attack_label'].to_numpy()

X_test_full = df_test.drop(columns=label_columns).to_numpy()
y_test_full_2 = df_test['class_label'].to_numpy()
y_test_full_6 = df_test['category_label'].to_numpy()
y_test_full_19 = df_test['attack_label'].to_numpy()

## Benchmarks

We use the hyperparameters reported for CICIoMT2024 to establish classification benchmarks on the dataset. We will then use a genetic algorithm (GA) to select the best features and compare the results against these benchmarks.

### Logistic Regression

In [36]:
# Logistic-regression benchmark; every hyperparameter is spelled out explicitly.
lr_benchmark = LogisticRegression(
    # regularisation
    penalty='l2',
    C=1.0,
    dual=False,
    # intercept handling
    fit_intercept=True,
    intercept_scaling=1,
    # optimiser
    solver='lbfgs',
    tol=0.0001,
    max_iter=100,
    warm_start=False,
    # use all available cores
    n_jobs=-1,
)

#### Binary Classification

In [37]:
# Fit on the binary target, then label the sampled test rows in one chained call
y_pred_lr_benchmark_2 = lr_benchmark.fit(X_train_sample, y_train_sample_2).predict(X_test_sample)

In [52]:
# Overall accuracy and per-class metrics (dict form kept for later comparison)
accuracy_lr_benchmark_2 = accuracy_score(y_test_sample_2, y_pred_lr_benchmark_2)
report_lr_benchmark_2 = classification_report(
    y_test_sample_2,
    y_pred_lr_benchmark_2,
    output_dict=True,
)

# Show the text report first, then the accuracy rounded to 5 decimals
print(classification_report(y_test_sample_2, y_pred_lr_benchmark_2))
print(f"Accuracy: {accuracy_lr_benchmark_2:.5f}")

              precision    recall  f1-score   support

      Attack       0.99      0.99      0.99     15803
      Benign       0.54      0.45      0.50       339

    accuracy                           0.98     16142
   macro avg       0.77      0.72      0.74     16142
weighted avg       0.98      0.98      0.98     16142

Accuracy: 0.98055


#### 6 Class Classification

In [39]:
# Refit the same estimator on the 6-class target and predict the sampled test rows
y_pred_lr_benchmark_6 = lr_benchmark.fit(X_train_sample, y_train_sample_6).predict(X_test_sample)

In [53]:
# Per-class metrics for the 6-class benchmark.
# zero_division=0 yields the same scores as the default setting but stops
# scikit-learn from emitting an UndefinedMetricWarning for every class that
# has no predicted (or no true) samples.
report_lr_benchmark_6 = classification_report(
    y_test_sample_6,
    y_pred_lr_benchmark_6,
    output_dict=True,
    zero_division=0,
)
print(classification_report(y_test_sample_6, y_pred_lr_benchmark_6, zero_division=0))

# calculate accuracy
accuracy_lr_benchmark_6 = accuracy_score(y_test_sample_6, y_pred_lr_benchmark_6)

# print accuracy with 5 decimal places
print(f"Accuracy: {accuracy_lr_benchmark_6:.5f}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      Benign       0.69      0.07      0.13       339
        DDos       0.68      1.00      0.81     10633
         Dos       0.08      0.00      0.00      4256
        MQTT       0.15      0.02      0.03       623
       Recon       0.00      0.00      0.00       274
    Spoofing       0.03      0.47      0.05        17

    accuracy                           0.66     16142
   macro avg       0.27      0.26      0.17     16142
weighted avg       0.49      0.66      0.54     16142

Accuracy: 0.66119


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### 19 Class Classification

In [41]:
# Refit the same estimator on the 19-class target and predict the sampled test rows
y_pred_lr_benchmark_19 = lr_benchmark.fit(X_train_sample, y_train_sample_19).predict(X_test_sample)

In [54]:
# Per-class metrics for the 19-class benchmark.
# zero_division=0 yields the same scores as the default setting but stops
# scikit-learn from emitting an UndefinedMetricWarning for every class that
# has no predicted (or no true) samples.
report_lr_benchmark_19 = classification_report(
    y_test_sample_19,
    y_pred_lr_benchmark_19,
    output_dict=True,
    zero_division=0,
)
print(classification_report(y_test_sample_19, y_pred_lr_benchmark_19, zero_division=0))

# calculate accuracy
accuracy_lr_benchmark_19 = accuracy_score(y_test_sample_19, y_pred_lr_benchmark_19)

# print accuracy with 5 decimal places
print(f"Accuracy: {accuracy_lr_benchmark_19:.5f}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

      ARP_spoofing       0.00      0.00      0.00        17
            Benign       0.54      0.53      0.54       339
         DDoS_ICMP       0.00      0.00      0.00      3456
          DDoS_SYN       0.00      0.00      0.00      1792
          DDoS_TCP       0.00      0.00      0.00      1849
          DDoS_UDP       0.22      1.00      0.37      3536
DDoS_connect_flood       0.00      0.00      0.00       408
DDoS_publish_flood       0.00      0.00      0.00        87
          DoS_ICMP       0.00      0.00      0.00      1047
           DoS_SYN       0.00      0.00      0.00      1007
           DoS_TCP       0.00      0.00      0.00       852
           DoS_UDP       0.00      0.00      0.00      1350
 DoS_connect_flood       0.00      0.00      0.00        30
 DoS_publish_flood       0.00      0.00      0.00        82
    Malformed_date       0.00      0.00      0.00        16
           OS_scan       0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### AdaBoost

In [45]:
# AdaBoost benchmark: boosted default decision trees, seeded for repeatability
ada_benchmark = AdaBoostClassifier(
    DecisionTreeClassifier(),   # base learner, passed positionally
    algorithm='SAMME.R',
    n_estimators=50,
    learning_rate=1.0,
    random_state=1984,
)

#### Binary Classification

In [46]:
# Fit on the binary target, then label the sampled test rows in one chained call
y_pred_ada_benchmark_2 = ada_benchmark.fit(X_train_sample, y_train_sample_2).predict(X_test_sample)

In [47]:
# Overall accuracy and per-class metrics (dict form kept for later comparison)
accuracy_ada_benchmark_2 = accuracy_score(y_test_sample_2, y_pred_ada_benchmark_2)
report_ada_benchmark_2 = classification_report(
    y_test_sample_2,
    y_pred_ada_benchmark_2,
    output_dict=True,
)

# Show the text report first, then the accuracy rounded to 5 decimals
print(classification_report(y_test_sample_2, y_pred_ada_benchmark_2))
print(f"Accuracy: {accuracy_ada_benchmark_2:.5f}")

              precision    recall  f1-score   support

      Attack       1.00      1.00      1.00     15803
      Benign       0.96      0.94      0.95       339

    accuracy                           1.00     16142
   macro avg       0.98      0.97      0.98     16142
weighted avg       1.00      1.00      1.00     16142

Accuracy: 0.99802


#### 6 Class Classification

In [48]:
# Refit the same estimator on the 6-class target and predict the sampled test rows
y_pred_ada_benchmark_6 = ada_benchmark.fit(X_train_sample, y_train_sample_6).predict(X_test_sample)

In [49]:
# Overall accuracy and per-class metrics (dict form kept for later comparison)
accuracy_ada_benchmark_6 = accuracy_score(y_test_sample_6, y_pred_ada_benchmark_6)
report_ada_benchmark_6 = classification_report(
    y_test_sample_6,
    y_pred_ada_benchmark_6,
    output_dict=True,
)

# Show the text report first, then the accuracy rounded to 5 decimals
print(classification_report(y_test_sample_6, y_pred_ada_benchmark_6))
print(f"Accuracy: {accuracy_ada_benchmark_6:.5f}")

              precision    recall  f1-score   support

      Benign       0.96      0.96      0.96       339
        DDos       1.00      1.00      1.00     10633
         Dos       1.00      1.00      1.00      4256
        MQTT       1.00      0.99      1.00       623
       Recon       0.96      0.96      0.96       274
    Spoofing       0.63      0.71      0.67        17

    accuracy                           1.00     16142
   macro avg       0.93      0.94      0.93     16142
weighted avg       1.00      1.00      1.00     16142

Accuracy: 0.99783


#### 19 Class Classification

In [50]:
# Refit the same estimator on the 19-class target and predict the sampled test rows
y_pred_ada_benchmark_19 = ada_benchmark.fit(X_train_sample, y_train_sample_19).predict(X_test_sample)

In [51]:
# Per-class metrics for the 19-class benchmark.
# zero_division=0 yields the same scores as the default setting but stops
# scikit-learn from emitting an UndefinedMetricWarning for every class that
# has no predicted (or no true) samples.
report_ada_benchmark_19 = classification_report(
    y_test_sample_19,
    y_pred_ada_benchmark_19,
    output_dict=True,
    zero_division=0,
)
print(classification_report(y_test_sample_19, y_pred_ada_benchmark_19, zero_division=0))

# calculate accuracy
accuracy_ada_benchmark_19 = accuracy_score(y_test_sample_19, y_pred_ada_benchmark_19)

# print accuracy with 5 decimal places
print(f"Accuracy: {accuracy_ada_benchmark_19:.5f}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

      ARP_spoofing       0.44      0.88      0.59        17
            Benign       0.97      0.91      0.94       339
         DDoS_ICMP       0.73      1.00      0.85      3456
          DDoS_SYN       1.00      1.00      1.00      1792
          DDoS_TCP       1.00      1.00      1.00      1849
          DDoS_UDP       1.00      0.64      0.78      3536
DDoS_connect_flood       1.00      1.00      1.00       408
DDoS_publish_flood       1.00      0.09      0.17        87
          DoS_ICMP       0.44      1.00      0.61      1047
           DoS_SYN       1.00      1.00      1.00      1007
           DoS_TCP       1.00      1.00      1.00       852
           DoS_UDP       1.00      0.03      0.05      1350
 DoS_connect_flood       1.00      1.00      1.00        30
 DoS_publish_flood       0.51      0.99      0.67        82
    Malformed_date       0.82      0.88      0.85        16
           OS_scan       0.72      0.81

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest

In [55]:
# Random-forest benchmark; every hyperparameter is spelled out explicitly.
rf_benchmark = RandomForestClassifier(
    # ensemble
    n_estimators=100,
    bootstrap=True,
    oob_score=False,
    warm_start=False,
    # split quality and candidate features
    criterion='gini',
    max_features='sqrt',
    min_impurity_decrease=0.0,
    # tree growth limits and pruning
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    ccp_alpha=0.0,
    # runtime: all cores, fixed seed
    n_jobs=-1,
    random_state=1984,
)

#### Binary Classification

In [56]:
# Fit on the binary target, then label the sampled test rows in one chained call
y_pred_rf_benchmark_2 = rf_benchmark.fit(X_train_sample, y_train_sample_2).predict(X_test_sample)

In [57]:
# Overall accuracy and per-class metrics (dict form kept for later comparison)
accuracy_rf_benchmark_2 = accuracy_score(y_test_sample_2, y_pred_rf_benchmark_2)
report_rf_benchmark_2 = classification_report(
    y_test_sample_2,
    y_pred_rf_benchmark_2,
    output_dict=True,
)

# Show the text report first, then the accuracy rounded to 5 decimals
print(classification_report(y_test_sample_2, y_pred_rf_benchmark_2))
print(f"Accuracy: {accuracy_rf_benchmark_2:.5f}")

              precision    recall  f1-score   support

      Attack       1.00      1.00      1.00     15803
      Benign       0.96      0.94      0.95       339

    accuracy                           1.00     16142
   macro avg       0.98      0.97      0.97     16142
weighted avg       1.00      1.00      1.00     16142

Accuracy: 0.99796


#### 6 Class Classification

In [58]:
# Refit the same estimator on the 6-class target and predict the sampled test rows
y_pred_rf_benchmark_6 = rf_benchmark.fit(X_train_sample, y_train_sample_6).predict(X_test_sample)

In [59]:
# Overall accuracy and per-class metrics (dict form kept for later comparison)
accuracy_rf_benchmark_6 = accuracy_score(y_test_sample_6, y_pred_rf_benchmark_6)
report_rf_benchmark_6 = classification_report(
    y_test_sample_6,
    y_pred_rf_benchmark_6,
    output_dict=True,
)

# Show the text report first, then the accuracy rounded to 5 decimals
print(classification_report(y_test_sample_6, y_pred_rf_benchmark_6))
print(f"Accuracy: {accuracy_rf_benchmark_6:.5f}")

              precision    recall  f1-score   support

      Benign       0.94      0.99      0.96       339
        DDos       1.00      1.00      1.00     10633
         Dos       1.00      1.00      1.00      4256
        MQTT       1.00      0.99      1.00       623
       Recon       0.99      0.95      0.97       274
    Spoofing       0.69      0.65      0.67        17

    accuracy                           1.00     16142
   macro avg       0.94      0.93      0.93     16142
weighted avg       1.00      1.00      1.00     16142

Accuracy: 0.99789


#### 19 Class Classification

In [60]:
# Refit the same estimator on the 19-class target and predict the sampled test rows
y_pred_rf_benchmark_19 = rf_benchmark.fit(X_train_sample, y_train_sample_19).predict(X_test_sample)

In [61]:
# Per-class metrics for the 19-class benchmark.
# zero_division=0 yields the same scores as the default setting but stops
# scikit-learn from emitting an UndefinedMetricWarning for every class that
# has no predicted (or no true) samples.
report_rf_benchmark_19 = classification_report(
    y_test_sample_19,
    y_pred_rf_benchmark_19,
    output_dict=True,
    zero_division=0,
)
print(classification_report(y_test_sample_19, y_pred_rf_benchmark_19, zero_division=0))

# calculate accuracy
accuracy_rf_benchmark_19 = accuracy_score(y_test_sample_19, y_pred_rf_benchmark_19)

# print accuracy with 5 decimal places
print(f"Accuracy: {accuracy_rf_benchmark_19:.5f}")

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

      ARP_spoofing       0.44      0.65      0.52        17
            Benign       0.92      0.96      0.94       339
         DDoS_ICMP       1.00      1.00      1.00      3456
          DDoS_SYN       1.00      1.00      1.00      1792
          DDoS_TCP       1.00      1.00      1.00      1849
          DDoS_UDP       1.00      1.00      1.00      3536
DDoS_connect_flood       1.00      1.00      1.00       408
DDoS_publish_flood       1.00      0.09      0.17        87
          DoS_ICMP       1.00      1.00      1.00      1047
           DoS_SYN       1.00      1.00      1.00      1007
           DoS_TCP       1.00      1.00      1.00       852
           DoS_UDP       1.00      1.00      1.00      1350
 DoS_connect_flood       1.00      1.00      1.00        30
 DoS_publish_flood       0.51      1.00      0.67        82
    Malformed_date       1.00      0.75      0.86        16
           OS_scan       0.71      0.47

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## GA Code

## Feature Selection With GA

### GA Variables

In [None]:
# Genetic Algorithm parameters
population_size = 20            # number of individuals in the population
n_generations = 50              # maximum number of generations
mutation_rate = 0.1             # probability of mutation
fitness_threshold = 1           # fitness goal (threshold for stopping)
stagnation_limit = 5            # number of generations without improvement before stopping

### Fitness Function

In [63]:
# define the fitness function for evaluating feature subsets
def fitness_function(individual):
    # function selects the features based on the individual's genes - features with values > 0.5 are selected
    selected_features = np.where(individual == 1)[0]  # select features based on individual genes
    if len(selected_features) == 0:                   # avoid empty feature set
        return 0
    X_train_selected = X_train_sample[:, selected_features]
    X_test_selected = X_test_sample[:, selected_features]

    if CLASSIFIER == 'lr':
        lr = LogisticRegression(**lr_benchmark.get_params())
        lr.fit(X_train_selected, y_train_sample)
        y_pred = lr.predict(X_test_selected)

    elif CLASSIFIER == 'ada':
        ada = AdaBoostClassifier(estimator=DecisionTreeClassifier(), n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=1984)
        ada.fit(X_train_selected, y_train_sample)               
        y_pred = ada.predict(X_test_selected) 
    
    elif CLASSIFIER == 'rf':
        rf = RandomForestClassifier(**rf_benchmark.get_params())
        rf.fit(X_train_selected, y_train_sample)               
        y_pred = rf.predict(X_test_selected) 

    accuracy = accuracy_score(y_test_sample, y_pred)      
    return accuracy  

### Generate Population and Objects for Collecting Results

In [64]:
# get the number of features for individual length
n_features = X_train.shape[1]   # number of features in dataset (45)

# initialize population with random values between 0 and 1 (individuals represent feature subsets)
# population = np.random.rand(population_size, n_features)
population = np.random.randint(2, size=(population_size, n_features))

# initialize variables to track the best fitness and stagnation count
best_fitness_overall = 0
best_individual_overall = None
no_improvement_count = 0
termination_reason = None  # To store the reason for termination