## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import RobustScaler
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

## Import Dataset

In [2]:
df = pd.read_csv('CICIOT2023.csv')

In [3]:
df.columns

Index(['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitude', 'Radius', 'Covariance', 'Variance', 'Weight', 'label',
       'Binary Class', 'Multiclass'],
      dtype='object')

In [4]:
df[['Binary Class']].value_counts()

Binary Class
Attack          7159423
Normal           172642
Name: count, dtype: int64

In [5]:
df = df.drop(columns=['Multiclass', 'label'])

In [6]:
df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitude,Radius,Covariance,Variance,Weight,Binary Class
0,0.000000,54.00,6,64.00,0.329807,0.329807,0.0,1,0,1,...,0.000000,54.00,8.334383e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,Attack
1,0.000000,57.04,6,64.00,4.290556,4.290556,0.0,0,0,0,...,2.822973,57.04,8.292607e+07,9.5,10.464666,4.010353,160.987842,0.05,141.55,Attack
2,0.000000,0.00,1,64.00,33.396799,33.396799,0.0,0,0,0,...,0.000000,42.00,8.312799e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,Attack
3,0.328175,76175.00,17,64.00,4642.133010,4642.133010,0.0,0,0,0,...,0.000000,50.00,8.301570e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,Attack
4,0.117320,101.73,6,65.91,6.202211,6.202211,0.0,0,1,0,...,23.113111,57.88,8.297300e+07,9.5,11.346876,32.716243,3016.808286,0.19,141.55,Attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7332060,0.000000,54.00,6,64.00,13.160665,13.160665,0.0,0,0,0,...,0.000000,54.00,8.333126e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,Attack
7332061,0.233475,17453.70,17,67.82,30034.276124,30034.276124,0.0,0,0,0,...,5.231095,52.46,8.301637e+07,9.5,10.157898,7.413133,264.972492,0.12,141.55,Attack
7332062,0.000000,54.00,6,64.00,0.953451,0.953451,0.0,0,0,0,...,0.000000,54.00,8.333178e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,Attack
7332063,0.000000,54.00,6,64.00,14.870483,14.870483,0.0,1,0,1,...,0.000000,54.00,8.334401e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,Attack


## Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
# Initialize the LabelEncoder
encoder = LabelEncoder()
df['Binary Class'] = encoder.fit_transform(df['Binary Class'])

## Feature Scaling

In [8]:
X = df.drop(columns=['Binary Class'])
y = df['Binary Class']

In [9]:
scaler = RobustScaler()
# Fit on training data
scaler.fit(X)
X = scaler.transform(X)

## Data Splitting

In [10]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the testing set into testing and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Print the sizes of the resulting datasets
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 5132445
Validation set size: 1099810
Testing set size: 1099810


### Computing scale_pos_weight

In [11]:
scale_pos_weight = (1 / 0.0235) #0.0235 - benign class distribution

In [12]:
scale_pos_weight

42.5531914893617

### Model Training

In [13]:
import xgboost as xgb
# Create an XGBoost model with custom class weights
model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
model.fit(X_train, y_train)

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print("Accuracy: ", accuracy)

Precision:  0.9959711995200249
Recall:  0.9951709840790682
F1 Score:  0.9953854192007159
Accuracy:  0.9951709840790682


In [15]:
# Predict on the test set
y_pred_train =  model.predict(X_train)
y_pred_test = model.predict(X_test)

In [16]:
# Calculate F1 score on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.9955263086056081
F1 score on the test set:  0.9953854192007159


### Tuning reg_alpha (L1 Regularization)

In [17]:

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
 }

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)


Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END ........................................reg_alpha=0; total time=   8.9s
[CV] END ........................................reg_alpha=0; total time=   8.6s
[CV] END ........................................reg_alpha=0; total time=   9.3s
[CV] END ........................................reg_alpha=0; total time=   8.4s
[CV] END ........................................reg_alpha=0; total time=   8.5s
[CV] END ......................................reg_alpha=0.1; total time=  10.7s
[CV] END ......................................reg_alpha=0.1; total time=  10.4s
[CV] END ......................................reg_alpha=0.1; total time=   8.6s
[CV] END ......................................reg_alpha=0.1; total time=   8.2s
[CV] END ......................................reg_alpha=0.1; total time=   8.8s
[CV] END ......................................reg_alpha=0.2; total time=   8.3s
[CV] END ......................................r

### Tuning reg_lambda (L2 Regularization)

In [18]:
# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_lambda': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
 }

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)


Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END .......................................reg_lambda=0; total time=   8.8s
[CV] END .......................................reg_lambda=0; total time=   8.8s
[CV] END .......................................reg_lambda=0; total time=   8.4s
[CV] END .......................................reg_lambda=0; total time=   8.4s
[CV] END .......................................reg_lambda=0; total time=   8.4s
[CV] END .....................................reg_lambda=0.1; total time=   8.1s
[CV] END .....................................reg_lambda=0.1; total time=   8.4s
[CV] END .....................................reg_lambda=0.1; total time=   8.3s
[CV] END .....................................reg_lambda=0.1; total time=   8.1s
[CV] END .....................................reg_lambda=0.1; total time=   8.3s
[CV] END .....................................reg_lambda=0.2; total time=   8.5s
[CV] END .....................................re

### Model Evaluation (after hyperparameter tuning)

In [19]:
# Create an XGBoost model with custom class weights
model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, reg_alpha=0.5, reg_lambda=0, random_state=42)
model.fit(X_train, y_train)

In [20]:
# Predict on the test set
y_pred_train =  model.predict(X_train)
y_pred = model.predict(X_test)

# Evaluate the model
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print("Accuracy: ", accuracy)


Precision:  0.9959420373575579
Recall:  0.9951455251361598
F1 Score:  0.995360041830391
Accuracy:  0.9951455251361598


In [21]:
# Calculate F1 score on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.9955500213414538
F1 score on the test set:  0.9953854192007159


### Classification Prediction

In [23]:
prediction = model.predict(X_test)


In [24]:
labels = ['attack', 'benign']
predicted_labels = [labels[pred] for pred in prediction]

In [25]:
# Assuming predicted_labels is your list of predicted labels
if 'attack' in predicted_labels:
    print("There are entries with the value 'attack' in the list.")
else:
    print("There are no entries with the value 'attack' in the list.")


There are entries with the value 'attack' in the list.


In [26]:
# Assuming predicted_labels is your list of predicted labels
attacks = [label for label in predicted_labels if label == 'attack']
attacks

['attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',

In [27]:
X_test

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 4.40022207e-01,  8.57486928e+01,  1.37500000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  8.23418008e-04,  0.00000000e+00, ...,
         2.93166706e-01,  2.50000000e-01,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 2.70154151e+02,  1.10443633e+04,  0.00000000e+00, ...,
         4.59427892e+04,  3.75000000e+00, -1.03050000e+02],
       [ 1.47560086e-01,  2.22322862e-03,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [28]:
# Assuming X_test is your features numpy array and y_pred contains your model predictions
# Convert X_test to a DataFrame for easier manipulation
df3 = pd.DataFrame(X_test, columns=[
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitude', 'Radius', 'Covariance', 'Variance', 'Weight'])

# Convert predictions into readable labels
labels = ['attack', 'benign']
predicted_labels = [labels[pred] for pred in y_pred]

# Add the predicted labels to the DataFrame
df3['Predicted Label'] = predicted_labels

# Now X_pred is the final DataFrame with features and their predicted labels
df3


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitude,Radius,Covariance,Variance,Weight,Predicted Label
0,0.000000,0.000000,0.000,0.00,1.292319,1.292319,0.0,0.0,1.0,0.0,...,0.000000,0.000000,0.884163,0.0,0.000000,0.000000,0.000000,0.00,0.00,attack
1,0.440022,85.748693,1.375,0.00,85.706521,85.706521,0.0,0.0,0.0,0.0,...,0.000000,-0.985222,-0.003707,0.0,-0.988872,0.000000,0.000000,0.00,0.00,attack
2,0.000000,0.000823,0.000,-0.22,-0.032220,-0.032220,0.0,0.0,0.0,0.0,...,0.236656,0.049261,0.761382,0.0,0.005082,0.249675,0.293167,0.25,0.00,attack
3,1.410888,112.524188,1.375,0.00,77.390177,77.390177,0.0,0.0,0.0,0.0,...,0.000000,-0.985222,-0.002843,0.0,-0.988872,0.000000,0.000000,0.00,0.00,attack
4,0.000000,0.000000,0.000,0.00,-0.088045,-0.088045,0.0,0.0,0.0,0.0,...,0.000000,0.000000,-0.636766,0.0,0.000000,0.000000,0.000000,0.00,0.00,attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1099805,0.000000,0.000000,0.000,0.00,0.010431,0.010431,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.695843,0.0,0.000000,0.000000,0.000000,0.00,0.00,attack
1099806,3.309898,136.156285,1.375,0.00,17.210323,17.210323,0.0,0.0,0.0,0.0,...,0.000000,-0.985222,-0.098117,0.0,-0.988872,0.000000,0.000000,0.00,0.00,attack
1099807,0.000000,0.000000,0.000,0.00,2.820781,2.820781,0.0,0.0,0.0,0.0,...,0.000000,0.000000,-0.177507,0.0,0.000000,0.000000,0.000000,0.00,0.00,attack
1099808,270.154151,11044.363292,0.000,0.00,0.598767,0.598767,0.0,0.0,0.0,0.0,...,366.949936,395.270936,-305.225846,-4.0,114.682325,382.496251,45942.789196,3.75,-103.05,benign


In [29]:
y_test

4043725    0
1206956    0
3401049    0
5679700    0
3013008    0
          ..
553043     0
1023441    0
7137779    0
5781969    1
857287     0
Name: Binary Class, Length: 1099810, dtype: int32

In [30]:
##############################

In [31]:
test = pd.read_csv('test_data.csv')
X_pred = test.drop(columns=['label',
       'Binary Class', 'Multiclass'])

In [32]:
scaler = RobustScaler()
# Fit on training data
scaler.fit(X_pred)
X_pred = scaler.transform(X_pred)

In [33]:
prediction = model.predict(X_pred)


In [34]:
labels = ['attack', 'benign']
predicted_labels = [labels[pred] for pred in prediction]

In [35]:
# Assuming predicted_labels is your list of predicted labels
if 'benign' in predicted_labels:
    print("There are entries with the value 'benign' in the list.")
else:
    print("There are no entries with the value 'benign' in the list.")


There are entries with the value 'benign' in the list.


In [36]:
predicted_labels

['attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'benign',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'benign',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'benign',
 'attack',
 'attack',
 'benign',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',
 'attack',

In [37]:
# Assuming X_test is your features numpy array and y_pred contains your model predictions
# Convert X_test to a DataFrame for easier manipulation
df1 = pd.DataFrame(X_pred, columns=[
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitude', 'Radius', 'Covariance', 'Variance', 'Weight'])

# Add the predicted labels to the DataFrame
df1['Predicted Label'] = predicted_labels

# Now X_pred is the final DataFrame with features and their predicted labels
df1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitude,Radius,Covariance,Variance,Weight,Predicted Label
0,2.019146,113.788825,1.375,0.00,45.132922,45.132922,0.0,0.0,0.0,0.0,...,0.000000,-0.985222,-0.066789,0.0,-0.989393,0.000000,0.000000,0.000,0.0,attack
1,26.079252,0.223334,0.000,0.00,-0.131608,-0.131608,0.0,0.0,1.0,0.0,...,0.000000,0.000000,0.871297,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
2,13.239858,1711.503412,1.250,1.08,1.575604,1.575604,0.0,0.0,0.0,0.0,...,1480.146796,209.354680,0.903580,0.0,81.858470,1538.526634,237463.840359,11.875,0.0,attack
3,0.000000,-0.002233,-0.125,-0.64,-0.113741,-0.113741,0.0,0.0,0.0,0.0,...,0.185151,0.014778,-0.178550,0.0,0.004102,0.193382,0.070505,0.625,0.0,attack
4,0.000000,-0.223334,-0.625,0.00,-0.130680,-0.130680,0.0,0.0,0.0,0.0,...,0.000000,-2.955665,0.015734,0.0,-3.094882,0.000000,0.000000,0.000,0.0,attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446790,0.000000,0.000000,0.000,0.00,-0.048189,-0.048189,0.0,0.0,0.0,0.0,...,0.000000,0.000000,-0.209351,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
446791,44.645372,0.223334,0.000,0.00,-0.134241,-0.134241,0.0,0.0,0.0,0.0,...,0.000000,0.000000,-0.649953,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
446792,0.000000,0.000000,0.000,0.00,0.009437,0.009437,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.762220,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
446793,0.000000,-0.223334,-0.625,0.00,4.101880,4.101880,0.0,0.0,0.0,0.0,...,0.000000,-2.955665,0.094545,0.0,-3.094882,0.000000,0.000000,0.000,0.0,attack


In [38]:
test['Binary Class'].value_counts()

Binary Class
Attack    436375
Benign     10420
Name: count, dtype: int64

In [39]:
df1['Predicted Label'].value_counts()

Predicted Label
attack    434332
benign     12463
Name: count, dtype: int64

## Dashboard Development

In [40]:
import pickle

# For the baseline scenario or another model (make sure to define or adjust the scaler/model)
scaler_baseline = scaler
with open('robust_scaler.pkl', 'wb') as f:
    pickle.dump(scaler_baseline, f)

model_baseline = model
model_baseline.save_model('binary_allfeatures.json')