# REPLICATE PAPER MODELLING

## Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score

## Read data

In [2]:
# The train/test parquet exports must first be downloaded from Google Drive
# into the local ./data folder; they are then loaded into the notebook here
train_file = './data/cic_iomt_2024_wifi_mqtt_train.parquet'
test_file = './data/cic_iomt_2024_wifi_mqtt_test.parquet'

df_train = pd.read_parquet(train_file)
df_test = pd.read_parquet(test_file)

In [3]:
# Preview the first five training rows (five is the default row count of head())
df_train.head()

Unnamed: 0,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,...,Number,Magnitue,Radius,Covariance,Variance,Weight,label,class_label,category_label,attack_label
0,683.5,17.0,64.0,553148.44,553148.44,0.0,0.0,0.0,0.0,0.0,...,9.5,10.0,0.0,0.0,0.0,141.5,TCP_IP-DDoS-UDP2,Attack,DDos,DDoS_UDP
1,1745.92,16.12,64.0,29919.545,29919.545,0.0,0.0,0.0,0.0,0.04,...,9.5,10.05,2.152,11.785338,0.23,141.5,TCP_IP-DDoS-UDP2,Attack,DDos,DDoS_UDP
2,2830.36,15.79,64.0,27075.605,27075.605,0.0,0.0,0.0,0.0,0.06,...,9.5,10.35,10.555,113.67546,0.53,141.5,TCP_IP-DDoS-UDP2,Attack,DDos,DDoS_UDP
3,4087.5,16.89,64.0,19660.156,19660.156,0.0,0.0,0.0,0.0,0.0,...,9.5,10.03,1.236,4.490081,0.18,141.5,TCP_IP-DDoS-UDP2,Attack,DDos,DDoS_UDP
4,3916.0,17.0,64.0,270322.78,270322.78,0.0,0.0,0.0,0.0,0.0,...,9.5,10.0,0.0,0.0,0.0,141.5,TCP_IP-DDoS-UDP2,Attack,DDos,DDoS_UDP


## Final dataset

In [4]:
# Results scaffold: one row per (classification task, metric) pair -> 12 rows
class_settings = ['2_classes', '6_classes', '19_classes']
metric_names = ['Accuracy', 'Recall', 'Precision', 'F1_Score']

df_final = pd.DataFrame({
    # each task name is repeated once per metric ...
    'output_qty': [setting for setting in class_settings for _ in metric_names],
    # ... while the metric list cycles once per task
    'metric': metric_names * len(class_settings),
})

# One NaN-filled score column per model; the model sections fill them in later
for model_column in ['LogisticRegression', 'AdaBoost', 'DeepNeuralNetwork', 'RandomForest']:
    df_final[model_column] = np.nan

print(df_final)

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy                 NaN       NaN                NaN   
1    2_classes     Recall                 NaN       NaN                NaN   
2    2_classes  Precision                 NaN       NaN                NaN   
3    2_classes   F1_Score                 NaN       NaN                NaN   
4    6_classes   Accuracy                 NaN       NaN                NaN   
5    6_classes     Recall                 NaN       NaN                NaN   
6    6_classes  Precision                 NaN       NaN                NaN   
7    6_classes   F1_Score                 NaN       NaN                NaN   
8   19_classes   Accuracy                 NaN       NaN                NaN   
9   19_classes     Recall                 NaN       NaN                NaN   
10  19_classes  Precision                 NaN       NaN                NaN   
11  19_classes   F1_Score                 NaN       NaN         

## Logistic Regression

In [5]:
# Logistic regression hyper-parameters, gathered in one mapping so the
# configuration used for the replication can be read at a glance.
# NOTE: with max_iter=100 on the unscaled features, the fits in this section
# stop at the lbfgs iteration limit (see the messages in the cell outputs).
logistic_params = dict(
    penalty='l2',
    dual=False,
    tol=0.0001,
    C=1.0,
    fit_intercept=True,
    intercept_scaling=1,
    solver='lbfgs',
    max_iter=100,
    warm_start=False,
)

logistic_model = LogisticRegression(**logistic_params)

### 2 classes

In [6]:
# Binary task: the 2-class target column
label = 'class_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model.
# NOTE(review): the head() preview shows an 'Unnamed: 0' header; if that is a
# real column (e.g. a saved row index) it is still used as a feature - confirm.
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Print the classification report; the row names come from the labels
# themselves instead of a hand-written list that could fall out of order
print(classification_report(y_test, y_pred))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '2_classes'
df_final.loc[task_rows, 'LogisticRegression'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

      Attack       0.99      0.99      0.99   1576575
      Benign       0.50      0.51      0.50     37607

    accuracy                           0.98   1614182
   macro avg       0.75      0.75      0.75   1614182
weighted avg       0.98      0.98      0.98   1614182

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822       NaN                NaN   
1    2_classes     Recall            0.976822       NaN                NaN   
2    2_classes  Precision            0.976929       NaN                NaN   
3    2_classes   F1_Score            0.976875       NaN                NaN   
4    6_classes   Accuracy                 NaN       NaN                NaN   
5    6_classes     Recall                 NaN       NaN                NaN   
6    6_classes  Precision                 NaN       NaN                NaN   
7    6_classes   F1_Score                 NaN    

### 6 classes

In [7]:
# 6-class task: the traffic category target column
label = 'category_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Print the classification report. zero_division=0 keeps the reported value (0)
# for classes that are never predicted but drops the repeated warnings.
print(classification_report(y_test, y_pred, zero_division=0))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '6_classes'
df_final.loc[task_rows, 'LogisticRegression'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      Benign       0.69      0.08      0.15     37607
        DDos       0.68      1.00      0.81   1066764
         Dos       0.06      0.00      0.00    416676
        MQTT       0.08      0.01      0.01     63715
       Recon       0.00      0.00      0.00     27676
    Spoofing       0.02      0.43      0.04      1744

    accuracy                           0.66   1614182
   macro avg       0.26      0.25      0.17   1614182
weighted avg       0.49      0.66      0.54   1614182



  _warn_prf(average, modifier, msg_start, len(result))


    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822       NaN                NaN   
1    2_classes     Recall            0.976822       NaN                NaN   
2    2_classes  Precision            0.976929       NaN                NaN   
3    2_classes   F1_Score            0.976875       NaN                NaN   
4    6_classes   Accuracy            0.663100       NaN                NaN   
5    6_classes     Recall            0.663100       NaN                NaN   
6    6_classes  Precision            0.485078       NaN                NaN   
7    6_classes   F1_Score            0.539501       NaN                NaN   
8   19_classes   Accuracy                 NaN       NaN                NaN   
9   19_classes     Recall                 NaN       NaN                NaN   
10  19_classes  Precision                 NaN       NaN                NaN   
11  19_classes   F1_Score                 NaN       NaN         

### 19 classes

In [8]:
# 19-class task: the fine-grained attack target column
label = 'attack_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
logistic_model.fit(X_train, y_train)

# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Print the classification report. zero_division=0 keeps the reported value (0)
# for classes that are never predicted but drops the repeated warnings.
print(classification_report(y_test, y_pred, zero_division=0))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '19_classes'
df_final.loc[task_rows, 'LogisticRegression'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

      ARP_spoofing       0.00      0.00      0.00      1744
            Benign       0.57      0.54      0.56     37607
         DDoS_ICMP       0.00      0.00      0.00    349699
          DDoS_SYN       0.00      0.00      0.00    172397
          DDoS_TCP       0.00      0.00      0.00    182598
          DDoS_UDP       0.23      1.00      0.37    362070
DDoS_connect_flood       0.00      0.00      0.00     41916
DDoS_publish_flood       0.00      0.00      0.00      8416
          DoS_ICMP       0.00      0.00      0.00     98432
           DoS_SYN       0.00      0.00      0.00     98595
           DoS_TCP       0.00      0.00      0.00     82096
           DoS_UDP       0.00      0.00      0.00    137553
 DoS_connect_flood       0.00      0.00      0.00      3131
 DoS_publish_flood       0.00      0.00      0.00      8505
    Malformed_date       0.00      0.00      0.00      1747
           OS_scan       0.00      0.00

  _warn_prf(average, modifier, msg_start, len(result))


    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822       NaN                NaN   
1    2_classes     Recall            0.976822       NaN                NaN   
2    2_classes  Precision            0.976929       NaN                NaN   
3    2_classes   F1_Score            0.976875       NaN                NaN   
4    6_classes   Accuracy            0.663100       NaN                NaN   
5    6_classes     Recall            0.663100       NaN                NaN   
6    6_classes  Precision            0.485078       NaN                NaN   
7    6_classes   F1_Score            0.539501       NaN                NaN   
8   19_classes   Accuracy            0.236882       NaN                NaN   
9   19_classes     Recall            0.236882       NaN                NaN   
10  19_classes  Precision            0.064710       NaN                NaN   
11  19_classes   F1_Score            0.096629       NaN         

## AdaBoost

In [9]:
# Weak learner that AdaBoost will boost (seeded so repeated runs grow the same trees)
base_estimator = DecisionTreeClassifier(random_state=42)

# Initialize the AdaBoostClassifier.
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, whereas the
# first positional slot is accepted under either name.
# NOTE(review): recent scikit-learn releases deprecate the 'SAMME.R' option -
# confirm it is still accepted by the installed version.
ada_boost = AdaBoostClassifier(
    base_estimator,                 # Base estimator (DecisionTreeClassifier)
    n_estimators=50,                # Number of boosting stages
    learning_rate=1.0,              # Learning rate
    algorithm='SAMME.R',            # Boosting algorithm
    random_state=42                 # Fixed seed so the reported scores can be reproduced
)

### 2 classes

In [10]:
# Binary task: the 2-class target column
label = 'class_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
ada_boost.fit(X_train, y_train)

# Predict on the test set
y_pred = ada_boost.predict(X_test)

# Print the classification report; the row names come from the labels
# themselves instead of a hand-written list that could fall out of order
print(classification_report(y_test, y_pred))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '2_classes'
df_final.loc[task_rows, 'AdaBoost'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

              precision    recall  f1-score   support

      Attack       1.00      1.00      1.00   1576575
      Benign       0.98      0.95      0.96     37607

    accuracy                           1.00   1614182
   macro avg       0.99      0.97      0.98   1614182
weighted avg       1.00      1.00      1.00   1614182

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273                NaN   
1    2_classes     Recall            0.976822  0.998273                NaN   
2    2_classes  Precision            0.976929  0.998258                NaN   
3    2_classes   F1_Score            0.976875  0.998259                NaN   
4    6_classes   Accuracy            0.663100       NaN                NaN   
5    6_classes     Recall            0.663100       NaN                NaN   
6    6_classes  Precision            0.485078       NaN                NaN   
7    6_classes   F1_Score            0.539501    

### 6 classes

In [11]:
# 6-class task: the traffic category target column
label = 'category_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
ada_boost.fit(X_train, y_train)

# Predict on the test set
y_pred = ada_boost.predict(X_test)

# Print the classification report. zero_division=0 reports 0 (without a
# warning) for any class that ends up with no predictions.
print(classification_report(y_test, y_pred, zero_division=0))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '6_classes'
df_final.loc[task_rows, 'AdaBoost'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

              precision    recall  f1-score   support

      Benign       0.98      0.95      0.96     37607
        DDos       1.00      1.00      1.00   1066764
         Dos       1.00      1.00      1.00    416676
        MQTT       1.00      1.00      1.00     63715
       Recon       0.98      0.98      0.98     27676
    Spoofing       0.47      0.81      0.59      1744

    accuracy                           1.00   1614182
   macro avg       0.90      0.95      0.92   1614182
weighted avg       1.00      1.00      1.00   1614182

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273                NaN   
1    2_classes     Recall            0.976822  0.998273                NaN   
2    2_classes  Precision            0.976929  0.998258                NaN   
3    2_classes   F1_Score            0.976875  0.998259                NaN   
4    6_classes   Accuracy            0.663100  0.997812            

### 19 classes

In [12]:
# 19-class task: the fine-grained attack target column
label = 'attack_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
ada_boost.fit(X_train, y_train)

# Predict on the test set
y_pred = ada_boost.predict(X_test)

# Print the classification report. zero_division=0 reports 0 (without a
# warning) for any class that ends up with no predictions.
print(classification_report(y_test, y_pred, zero_division=0))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '19_classes'
df_final.loc[task_rows, 'AdaBoost'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

                    precision    recall  f1-score   support

      ARP_spoofing       0.55      0.81      0.66      1744
            Benign       0.98      0.95      0.97     37607
         DDoS_ICMP       0.73      1.00      0.84    349699
          DDoS_SYN       1.00      1.00      1.00    172397
          DDoS_TCP       1.00      1.00      1.00    182598
          DDoS_UDP       1.00      0.64      0.78    362070
DDoS_connect_flood       1.00      1.00      1.00     41916
DDoS_publish_flood       1.00      1.00      1.00      8416
          DoS_ICMP       0.42      1.00      0.59     98432
           DoS_SYN       1.00      1.00      1.00     98595
           DoS_TCP       1.00      1.00      1.00     82096
           DoS_UDP       0.98      0.02      0.05    137553
 DoS_connect_flood       1.00      1.00      1.00      3131
 DoS_publish_flood       1.00      1.00      1.00      8505
    Malformed_date       0.93      0.87      0.90      1747
           OS_scan       0.81      0.69

## DeepNeuralNetwork

In [13]:
# Initialize the DNN (MLPClassifier) with specified parameters.
# A fixed random_state is added so that the stochastic parts of training
# (sample shuffling, weight initialisation) repeat from run to run.
dnn = MLPClassifier(
    hidden_layer_sizes=(32, 32, 32),  # Three hidden layers with 32 neurons each
    solver='adam',                    # Optimizer
    alpha=0.0001,                     # L2 regularization term (weight decay)
    batch_size='auto',                # Size of minibatches for stochastic optimizers
    learning_rate='constant',         # Learning rate schedule
    learning_rate_init=0.001,         # Initial learning rate
    power_t=0.5,                      # The exponent for inverse scaling learning rate
    max_iter=200,                     # Maximum number of iterations
    shuffle=True,                     # Whether to shuffle samples in each iteration
    random_state=42,                  # Fixed seed so the reported scores can be reproduced
    tol=0.0001,                       # Tolerance for optimization convergence
    warm_start=False,                 # Reuse previous solution as initialization
    momentum=0.9,                     # Momentum for gradient descent
    nesterovs_momentum=True,          # Use Nesterov's momentum
    early_stopping=False,             # Do not use early stopping
    validation_fraction=0.1,          # Proportion of training data to set aside for validation
    beta_1=0.9,                       # Exponential decay rate for 1st moment estimates in Adam
    beta_2=0.999,                     # Exponential decay rate for 2nd moment estimates in Adam
    epsilon=1e-08,                    # Term to prevent division by zero in Adam
    n_iter_no_change=10,              # Number of iterations with no improvement to stop training
    max_fun=15000                     # Maximum number of loss function evaluations
)

### 2 classes

In [14]:
# Binary task: the 2-class target column
label = 'class_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
dnn.fit(X_train, y_train)

# Predict on the test set
y_pred = dnn.predict(X_test)

# Print the classification report; the row names come from the labels
# themselves instead of a hand-written list that could fall out of order
print(classification_report(y_test, y_pred))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '2_classes'
df_final.loc[task_rows, 'DeepNeuralNetwork'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

              precision    recall  f1-score   support

      Attack       0.99      1.00      0.99   1576575
      Benign       0.85      0.49      0.62     37607

    accuracy                           0.99   1614182
   macro avg       0.92      0.75      0.81   1614182
weighted avg       0.98      0.99      0.98   1614182

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273           0.986160   
1    2_classes     Recall            0.976822  0.998273           0.986160   
2    2_classes  Precision            0.976929  0.998258           0.984804   
3    2_classes   F1_Score            0.976875  0.998259           0.984363   
4    6_classes   Accuracy            0.663100  0.997812                NaN   
5    6_classes     Recall            0.663100  0.997812                NaN   
6    6_classes  Precision            0.485078  0.998224                NaN   
7    6_classes   F1_Score            0.539501  0.

### 6 classes

In [15]:
# 6-class task: the traffic category target column
label = 'category_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
dnn.fit(X_train, y_train)

# Predict on the test set
y_pred = dnn.predict(X_test)

# Print the classification report. zero_division=0 keeps the reported value (0)
# for classes that are never predicted but drops the repeated warnings.
print(classification_report(y_test, y_pred, zero_division=0))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '6_classes'
df_final.loc[task_rows, 'DeepNeuralNetwork'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      Benign       0.84      0.50      0.63     37607
        DDos       0.68      1.00      0.81   1066764
         Dos       0.00      0.00      0.00    416676
        MQTT       0.98      0.00      0.00     63715
       Recon       1.00      0.43      0.60     27676
    Spoofing       0.00      0.00      0.00      1744

    accuracy                           0.68   1614182
   macro avg       0.58      0.32      0.34   1614182
weighted avg       0.52      0.68      0.56   1614182



  _warn_prf(average, modifier, msg_start, len(result))


    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273           0.986160   
1    2_classes     Recall            0.976822  0.998273           0.986160   
2    2_classes  Precision            0.976929  0.998258           0.984804   
3    2_classes   F1_Score            0.976875  0.998259           0.984363   
4    6_classes   Accuracy            0.663100  0.997812           0.679863   
5    6_classes     Recall            0.663100  0.997812           0.679863   
6    6_classes  Precision            0.485078  0.998224           0.521693   
7    6_classes   F1_Score            0.539501  0.997962           0.557702   
8   19_classes   Accuracy            0.236882  0.832482                NaN   
9   19_classes     Recall            0.236882  0.832482                NaN   
10  19_classes  Precision            0.064710  0.900925                NaN   
11  19_classes   F1_Score            0.096629  0.807172         

### 19 classes

In [16]:
# 19-class task: the fine-grained attack target column
label = 'attack_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
dnn.fit(X_train, y_train)

# Predict on the test set
y_pred = dnn.predict(X_test)

# Print the classification report. zero_division=0 keeps the reported value (0)
# for classes that are never predicted but drops the repeated warnings.
print(classification_report(y_test, y_pred, zero_division=0))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '19_classes'
df_final.loc[task_rows, 'DeepNeuralNetwork'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

      ARP_spoofing       0.00      0.00      0.00      1744
            Benign       0.83      0.50      0.62     37607
         DDoS_ICMP       0.50      0.00      0.00    349699
          DDoS_SYN       0.24      0.00      0.00    172397
          DDoS_TCP       0.00      0.00      0.00    182598
          DDoS_UDP       0.23      1.00      0.37    362070
DDoS_connect_flood       0.00      0.00      0.00     41916
DDoS_publish_flood       0.00      0.00      0.00      8416
          DoS_ICMP       0.44      0.00      0.00     98432
           DoS_SYN       0.00      0.00      0.00     98595
           DoS_TCP       0.00      0.00      0.00     82096
           DoS_UDP       0.00      0.00      0.00    137553
 DoS_connect_flood       0.00      0.00      0.00      3131
 DoS_publish_flood       0.00      0.00      0.00      8505
    Malformed_date       0.00      0.00      0.00      1747
           OS_scan       0.56      0.03

  _warn_prf(average, modifier, msg_start, len(result))


    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273           0.986160   
1    2_classes     Recall            0.976822  0.998273           0.986160   
2    2_classes  Precision            0.976929  0.998258           0.984804   
3    2_classes   F1_Score            0.976875  0.998259           0.984363   
4    6_classes   Accuracy            0.663100  0.997812           0.679863   
5    6_classes     Recall            0.663100  0.997812           0.679863   
6    6_classes  Precision            0.485078  0.998224           0.521693   
7    6_classes   F1_Score            0.539501  0.997962           0.557702   
8   19_classes   Accuracy            0.236882  0.832482           0.242243   
9   19_classes     Recall            0.236882  0.832482           0.242243   
10  19_classes  Precision            0.064710  0.900925           0.245022   
11  19_classes   F1_Score            0.096629  0.807172         

## Random Forest

In [17]:
# Initialize the RandomForestClassifier with specified parameters.
# A fixed random_state is added so that the bootstrap samples and the feature
# subsets drawn per split repeat from run to run.
rf = RandomForestClassifier(
    n_estimators=100,                # Number of trees in the forest
    criterion='gini',                # Function to measure the quality of a split
    min_samples_split=2,             # Minimum number of samples required to split a node
    min_samples_leaf=1,              # Minimum number of samples required to be at a leaf node
    min_weight_fraction_leaf=0.0,    # Minimum weighted fraction of the input samples required at a leaf node
    max_features='sqrt',             # Number of features to consider when looking for the best split
    min_impurity_decrease=0.0,       # A node will be split if this split induces a decrease in impurity greater than or equal to this value
    bootstrap=True,                  # Whether bootstrap samples are used when building trees
    oob_score=False,                 # Whether to use out-of-bag samples to estimate the generalization accuracy
    random_state=42,                 # Fixed seed so the reported scores can be reproduced
    warm_start=False,                # If True, reuse the solution of the previous call to fit and add more estimators
    ccp_alpha=0.0                    # Complexity parameter used for Minimal Cost-Complexity Pruning
)

### 2 classes

In [18]:
# Binary task: the 2-class target column
label = 'class_label'

# Every target encoding is removed from the features so that no variant of the
# answer is visible to the model
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# train features / labels
X_train = df_train.drop(columns=target_columns)
y_train = df_train[label]

# test features / labels
X_test = df_test.drop(columns=target_columns)
y_test = df_test[label]

# Fit the model
rf.fit(X_train, y_train)

# Predict on the test set
y_pred = rf.predict(X_test)

# Print the classification report; the row names come from the labels
# themselves instead of a hand-written list that could fall out of order
print(classification_report(y_test, y_pred))

# Support-weighted averages over the classes (pos_label plays no role when
# average='weighted', so none is passed)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Store the scores in the rows of this task, matched by metric name rather
# than by hard-coded row numbers
scores = {'Accuracy': accuracy, 'Recall': recall, 'Precision': precision, 'F1_Score': f1}
task_rows = df_final['output_qty'] == '2_classes'
df_final.loc[task_rows, 'RandomForest'] = df_final.loc[task_rows, 'metric'].map(scores)

print(df_final)

              precision    recall  f1-score   support

      Attack       1.00      1.00      1.00   1576575
      Benign       0.98      0.97      0.97     37607

    accuracy                           1.00   1614182
   macro avg       0.99      0.98      0.99   1614182
weighted avg       1.00      1.00      1.00   1614182

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273           0.986160   
1    2_classes     Recall            0.976822  0.998273           0.986160   
2    2_classes  Precision            0.976929  0.998258           0.984804   
3    2_classes   F1_Score            0.976875  0.998259           0.984363   
4    6_classes   Accuracy            0.663100  0.997812           0.679863   
5    6_classes     Recall            0.663100  0.997812           0.679863   
6    6_classes  Precision            0.485078  0.998224           0.521693   
7    6_classes   F1_Score            0.539501  0.

### 6 classes

In [19]:
# 6-class experiment: the target is the 'category_label' column
label = 'category_label'

# All four label columns are removed from the feature matrix, not only the target
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# Features and target for the train and test splits
X_train, y_train = df_train.drop(columns=target_columns), df_train[label]
X_test, y_test = df_test.drop(columns=target_columns), df_test[label]

# Train on the training split, then predict the held-out test split
y_pred = rf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 breakdown
print(classification_report(y_test, y_pred))

# Summary metrics for the results table: accuracy plus the
# 'weighted' average of the per-class recall, precision and F1
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Rows 4-7 of df_final are the 6_classes Accuracy / Recall / Precision / F1_Score rows
for row, score in zip(range(4, 8), (accuracy, recall, precision, f1)):
    df_final.loc[row, 'RandomForest'] = score

print(df_final)

              precision    recall  f1-score   support

      Benign       0.97      0.98      0.98     37607
        DDos       1.00      1.00      1.00   1066764
         Dos       1.00      1.00      1.00    416676
        MQTT       1.00      1.00      1.00     63715
       Recon       0.99      0.98      0.98     27676
    Spoofing       0.76      0.82      0.79      1744

    accuracy                           1.00   1614182
   macro avg       0.95      0.96      0.96   1614182
weighted avg       1.00      1.00      1.00   1614182

    output_qty     metric  LogisticRegression  AdaBoost  DeepNeuralNetwork  \
0    2_classes   Accuracy            0.976822  0.998273           0.986160   
1    2_classes     Recall            0.976822  0.998273           0.986160   
2    2_classes  Precision            0.976929  0.998258           0.984804   
3    2_classes   F1_Score            0.976875  0.998259           0.984363   
4    6_classes   Accuracy            0.663100  0.997812           0

### 19 classes

In [20]:
# 19-class experiment: the target is the 'attack_label' column
label = 'attack_label'

# All four label columns are removed from the feature matrix, not only the target
target_columns = ['label', 'class_label', 'category_label', 'attack_label']

# Features and target for the train and test splits
X_train, y_train = df_train.drop(columns=target_columns), df_train[label]
X_test, y_test = df_test.drop(columns=target_columns), df_test[label]

# Train on the training split, then predict the held-out test split
y_pred = rf.fit(X_train, y_train).predict(X_test)

# Per-class precision / recall / F1 breakdown
print(classification_report(y_test, y_pred))

# Summary metrics for the results table: accuracy plus the
# 'weighted' average of the per-class recall, precision and F1
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Rows 8-11 of df_final are the 19_classes Accuracy / Recall / Precision / F1_Score rows
for row, score in zip(range(8, 12), (accuracy, recall, precision, f1)):
    df_final.loc[row, 'RandomForest'] = score

print(df_final)

                    precision    recall  f1-score   support

      ARP_spoofing       0.73      0.84      0.78      1744
            Benign       0.97      0.99      0.98     37607
         DDoS_ICMP       1.00      1.00      1.00    349699
          DDoS_SYN       1.00      1.00      1.00    172397
          DDoS_TCP       1.00      1.00      1.00    182598
          DDoS_UDP       1.00      1.00      1.00    362070
DDoS_connect_flood       1.00      1.00      1.00     41916
DDoS_publish_flood       1.00      0.82      0.90      8416
          DoS_ICMP       1.00      1.00      1.00     98432
           DoS_SYN       1.00      1.00      1.00     98595
           DoS_TCP       1.00      1.00      1.00     82096
           DoS_UDP       1.00      1.00      1.00    137553
 DoS_connect_flood       1.00      1.00      1.00      3131
 DoS_publish_flood       0.85      1.00      0.92      8505
    Malformed_date       1.00      0.85      0.92      1747
           OS_scan       0.85      0.66

In [23]:
# Write the collected metrics table to a CSV file in the working directory,
# leaving out the row index
df_final.to_csv(path_or_buf='table_8_recreation.csv', index=False)