## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


## Import Dataset

In [2]:
df = pd.read_csv('CICIOT2023.csv')

In [3]:
df.columns

Index(['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count', 'syn_count',
       'fin_count', 'urg_count', 'rst_count', 'HTTP', 'HTTPS', 'DNS', 'Telnet',
       'SMTP', 'SSH', 'IRC', 'TCP', 'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC',
       'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number',
       'Magnitude', 'Radius', 'Covariance', 'Variance', 'Weight', 'label',
       'Binary Class', 'Multiclass'],
      dtype='object')

In [4]:
df[['Binary Class']].value_counts()

Binary Class
Attack          7159423
Benign           172642
Name: count, dtype: int64

In [5]:
df = df.drop(columns=['Multiclass', 'label', 
                      'fin_count', 'ack_count', 'HTTP', 'psh_flag_number', 'UDP',
                      'syn_flag_number', 'rst_flag_number', 'ICMP', 'SSH', 'DNS',
                      'fin_flag_number', 'LLC', 'IPv', 'ARP', 'ece_flag_number',
                      'cwr_flag_number', 'DHCP', 'IRC', 'Drate', 'Telnet', 'SMTP'])

In [6]:
df.shape

(7332065, 26)

## Encoding

In [7]:
from sklearn.preprocessing import LabelEncoder
# Initialize the OneHotEncoder
encoder = LabelEncoder()
df['Binary Class'] = encoder.fit_transform(df['Binary Class'])

## Feature Scaling

In [8]:
X = df.drop(columns=['Binary Class'])
y = df['Binary Class']

In [9]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Fit on training data
scaler.fit(X)
X = scaler.transform(X)

## Data Splitting

In [10]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Further split the testing set into testing and validation sets
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

# Print the sizes of the resulting datasets
print("Training set size:", X_train.shape[0])
print("Validation set size:", X_val.shape[0])
print("Testing set size:", X_test.shape[0])

Training set size: 5132445
Validation set size: 1099810
Testing set size: 1099810


### Computing scale_pos_weight

In [11]:
scale_pos_weight = (1 / 0.0235)

In [12]:
scale_pos_weight

42.5531914893617

### Model Training

In [13]:
import xgboost as xgb
# Create an XGBoost model with custom class weights
model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42)
model.fit(X_train, y_train)

In [14]:
# Predict on the test set
y_pred = model.predict(X_test)
# Evaluate the model
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Precision: ",precision)
print("Recall: ",recall)
print("F1 Score: ",f1)

Precision:  0.9958447900104413
Recall:  0.99499277147871
F1 Score:  0.9952219728827152


In [15]:
# Predict on the test set
y_pred_train =  model.predict(X_train)
y_pred_test = model.predict(X_test)

In [16]:
# Calculate F1 score on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.995351350537901
F1 score on the test set:  0.9952219728827152


### Tuning reg_alpha (L1 Regularization)

In [17]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_alpha': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
 }

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)


Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END ........................................reg_alpha=0; total time=   7.2s
[CV] END ........................................reg_alpha=0; total time=   7.4s
[CV] END ........................................reg_alpha=0; total time=   7.3s
[CV] END ........................................reg_alpha=0; total time=   9.1s
[CV] END ........................................reg_alpha=0; total time=   7.7s
[CV] END ......................................reg_alpha=0.1; total time=   7.5s
[CV] END ......................................reg_alpha=0.1; total time=   7.5s
[CV] END ......................................reg_alpha=0.1; total time=   7.0s
[CV] END ......................................reg_alpha=0.1; total time=   7.0s
[CV] END ......................................reg_alpha=0.1; total time=   8.4s
[CV] END ......................................reg_alpha=0.2; total time=   8.0s
[CV] END ......................................r

### Tuning reg_lambda (L2 Regularization)

In [18]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'reg_lambda': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
 }

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=make_scorer(f1_score, average='weighted'), cv=5, verbose=2)

# Fit the grid search to the validation data
grid_search.fit(X_val, y_val)

# Print the best parameters and the best average F1 score
print("Best parameters found: ", grid_search.best_params_)
print("Best average F1 score found: ", grid_search.best_score_)


Fitting 5 folds for each of 11 candidates, totalling 55 fits
[CV] END .......................................reg_lambda=0; total time=   7.2s
[CV] END .......................................reg_lambda=0; total time=   7.3s
[CV] END .......................................reg_lambda=0; total time=   7.1s
[CV] END .......................................reg_lambda=0; total time=   7.2s
[CV] END .......................................reg_lambda=0; total time=   7.1s
[CV] END .....................................reg_lambda=0.1; total time=   6.7s
[CV] END .....................................reg_lambda=0.1; total time=   6.9s
[CV] END .....................................reg_lambda=0.1; total time=   6.9s
[CV] END .....................................reg_lambda=0.1; total time=   6.8s
[CV] END .....................................reg_lambda=0.1; total time=   6.9s
[CV] END .....................................reg_lambda=0.2; total time=   6.8s
[CV] END .....................................re

### Model Evaluation (after hyperparameter tuning)

In [19]:
import xgboost as xgb
# Create an XGBoost model with custom class weights
model = xgb.XGBClassifier(scale_pos_weight=scale_pos_weight, reg_alpha=0.9, reg_lambda=0.3, random_state=42)
model.fit(X_train, y_train)

In [20]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy: {:.4f}".format(accuracy))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))
print("F1 Score: {:.4f}".format(f1))

Accuracy: 0.9951
Precision: 0.9959
Recall: 0.9951
F1 Score: 0.9953


In [21]:
# Calculate F1 score on the training set
f1_train = f1_score(y_train, y_pred_train, average='weighted')
# Calculate F1 score on the test set
f1_test = f1_score(y_test, y_pred_test, average='weighted')
print("F1 score on the training set: ", f1_train)
print("F1 score on the test set: ", f1_test)

F1 score on the training set:  0.995351350537901
F1 score on the test set:  0.9952219728827152


## Classification Test 

In [22]:
test = pd.read_csv('test_data.csv')

In [23]:
test = test.drop(columns=['fin_count', 'ack_count', 'HTTP', 'psh_flag_number', 'UDP',
                      'syn_flag_number', 'rst_flag_number', 'ICMP', 'SSH', 'DNS',
                      'fin_flag_number', 'LLC', 'IPv', 'ARP', 'ece_flag_number',
                      'cwr_flag_number', 'DHCP', 'IRC', 'Drate', 'Telnet', 'SMTP'])

In [24]:
X_pred = test.drop(columns=['label',
       'Binary Class', 'Multiclass'])

In [25]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
# Fit on training data
scaler.fit(X_pred)
X_pred = scaler.transform(X_pred)

In [26]:
prediction = model.predict(X_pred)


In [27]:
labels = ['attack', 'benign']
predicted_labels = [labels[pred] for pred in prediction]

In [28]:
# Assuming predicted_labels is your list of predicted labels
if 'benign' in predicted_labels:
    print("There are entries with the value 'benign' in the list.")
else:
    print("There are no entries with the value 'benign' in the list.")


There are entries with the value 'benign' in the list.


In [29]:
# Assuming X_test is your features numpy array and y_pred contains your model predictions
# Convert X_test to a DataFrame for easier manipulation
df1 = pd.DataFrame(X_pred, columns=['flow_duration', 'Header_Length', 'Protocol Type', 'Duration', 'Rate',
       'Srate', 'ack_flag_number', 'syn_count', 'urg_count', 'rst_count',
       'HTTPS', 'TCP', 'Tot sum', 'Min', 'Max', 'AVG', 'Std', 'Tot size',
       'IAT', 'Number', 'Magnitude', 'Radius', 'Covariance', 'Variance',
       'Weight'])

# Add the predicted labels to the DataFrame
df1['Predicted Label'] = predicted_labels

# Now X_pred is the final DataFrame with features and their predicted labels
df1


Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,ack_flag_number,syn_count,urg_count,rst_count,...,Std,Tot size,IAT,Number,Magnitude,Radius,Covariance,Variance,Weight,Predicted Label
0,2.019146,113.788825,1.375,0.00,45.132922,45.132922,0.0,0.000000,0.00,0.0,...,0.000000,-0.985222,-0.066789,0.0,-0.989393,0.000000,0.000000,0.000,0.0,attack
1,26.079252,0.223334,0.000,0.00,-0.131608,-0.131608,0.0,33.333333,0.00,0.0,...,0.000000,0.000000,0.871297,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
2,13.239858,1711.503412,1.250,1.08,1.575604,1.575604,0.0,0.000000,0.06,7.0,...,1480.146796,209.354680,0.903580,0.0,81.858470,1538.526634,237463.840359,11.875,0.0,attack
3,0.000000,-0.002233,-0.125,-0.64,-0.113741,-0.113741,0.0,0.000000,0.00,0.0,...,0.185151,0.014778,-0.178550,0.0,0.004102,0.193382,0.070505,0.625,0.0,attack
4,0.000000,-0.223334,-0.625,0.00,-0.130680,-0.130680,0.0,0.000000,0.00,0.0,...,0.000000,-2.955665,0.015734,0.0,-3.094882,0.000000,0.000000,0.000,0.0,attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446790,0.000000,0.000000,0.000,0.00,-0.048189,-0.048189,0.0,0.000000,0.00,0.0,...,0.000000,0.000000,-0.209351,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
446791,44.645372,0.223334,0.000,0.00,-0.134241,-0.134241,0.0,0.000000,0.00,0.0,...,0.000000,0.000000,-0.649953,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
446792,0.000000,0.000000,0.000,0.00,0.009437,0.009437,1.0,0.000000,1.00,100.0,...,0.000000,0.000000,0.762220,0.0,0.000000,0.000000,0.000000,0.000,0.0,attack
446793,0.000000,-0.223334,-0.625,0.00,4.101880,4.101880,0.0,0.000000,0.00,0.0,...,0.000000,-2.955665,0.094545,0.0,-3.094882,0.000000,0.000000,0.000,0.0,attack


In [30]:
test['Binary Class'].value_counts()

Binary Class
Attack    436375
Benign     10420
Name: count, dtype: int64

In [31]:
df1['Predicted Label'].value_counts()

Predicted Label
attack    434244
benign     12551
Name: count, dtype: int64

## Dashboard Development

In [32]:
import pickle
# For the Random Forest (RF) classifier
scaler_rf = scaler
with open('robust_scaler_rf.pkl', 'wb') as f:
    pickle.dump(scaler_rf, f)

model_rf = model
model_rf.save_model('binary_rf_top25.json')