In [71]:
# Colab-only step: expose Google Drive inside the VM's filesystem so the
# dataset pickles read further down can be found.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Core libraries

In [72]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Reading data and checking shapes

In [73]:
# Folder on the mounted Drive that holds the pre-split CICIDS pickles. Kept in
# one place so the notebook can be pointed at another copy of the data by
# editing a single line.
DATA_DIR = '/content/drive/MyDrive/CSCE464/datasets'

# NOTE(security): unpickling can execute arbitrary code, so only load pickle
# files that come from a source you trust.
x_train = pd.read_pickle(f'{DATA_DIR}/CICIDS_train_2labels.pkl')
y_train = pd.read_pickle(f'{DATA_DIR}/CICIDS_train_2labels_y.pkl')

x_val = pd.read_pickle(f'{DATA_DIR}/CICIDS_val_2labels.pkl')
y_val = pd.read_pickle(f'{DATA_DIR}/CICIDS_val_2labels_y.pkl')

x_test = pd.read_pickle(f'{DATA_DIR}/CICIDS_test_2labels.pkl')
y_test = pd.read_pickle(f'{DATA_DIR}/CICIDS_test_2labels_y.pkl')

# Quick sanity check on the training features: (rows, columns).
x_train.shape

(130028, 80)

In [74]:
# Training labels: the length should equal the number of rows in x_train.
y_train.shape

(130028,)

In [75]:
# Validation features: (rows, columns).
x_val.shape

(14448, 80)

In [76]:
# Validation labels: the length should equal the number of rows in x_val.
y_val.shape

(14448,)

In [77]:
# Test features: (rows, columns).
x_test.shape

(36119, 80)

In [78]:
# Test labels: the length should equal the number of rows in x_test.
y_test.shape

(36119,)

In [79]:
# Strip the two indicator columns from every feature table.
# NOTE(review): judging by their names these encode the target, so leaving
# them in would leak the label into the features — confirm.
label_columns = ['is_Attack', 'is_Normal']
x_train = x_train.drop(columns=label_columns)
x_val = x_val.drop(columns=label_columns)
x_test = x_test.drop(columns=label_columns)

In [80]:
# Turn +/-infinity into NaN so the imputation step that follows treats them
# like any other missing value.
infinities = [np.inf, -np.inf]
x_train = x_train.replace(infinities, np.nan)
x_val = x_val.replace(infinities, np.nan)
x_test = x_test.replace(infinities, np.nan)

In [81]:
# Every missing entry (including the former +/-inf values) gets the same
# placeholder in all three splits.
# NOTE(review): 999 is an arbitrary sentinel — confirm it suits these features.
missing_fill = 999
x_train = x_train.fillna(missing_fill)
x_val = x_val.fillna(missing_fill)
x_test = x_test.fillna(missing_fill)

In [82]:
# Learn the per-feature mean and standard deviation from the training split
# only, then apply that same transform to all three splits so validation and
# test data never influence the scaling.
scaler = StandardScaler().fit(x_train)
x_train, x_val, x_test = (
    scaler.transform(split) for split in (x_train, x_val, x_test)
)

# Decision Tree

In [83]:
# Decision tree with scikit-learn's default hyperparameters; any overrides
# would be passed to the constructor.
# random_state is pinned because the tree shuffles candidate features at each
# split; without a seed, a fresh run can grow a different tree and report
# slightly different metrics.
DT = DecisionTreeClassifier(random_state=42)
DT.fit(x_train, y_train)

In [84]:
# Predict labels for the validation split. The result stays in y_pred because
# the next cell reads it for the F1 score.
y_pred = DT.predict(x_val)
# Echo the predictions as a quick visual check.
print(y_pred)

[1 1 1 ... 1 0 1]


In [85]:
# Check the tree on the validation split: accuracy through the estimator's
# own score(), weighted F1 from the y_pred produced by the previous cell.
dt_val_accuracy = DT.score(x_val, y_val)
print('[Val] Accuracy: ', dt_val_accuracy)
dt_val_f1 = f1_score(y_val, y_pred, average='weighted')
print('[Val] f1: ', dt_val_f1)

[Val] Accuracy:  0.9998615725359912
[Val] f1:  0.9998615725359912


In [86]:
# Final check of the tree on the held-out test split. y_pred is overwritten
# with the test predictions, which the cells below reuse.
y_pred = DT.predict(x_test)
dt_test_accuracy = DT.score(x_test, y_test)
print('[Test] Accuracy: ', dt_test_accuracy)
dt_test_f1 = f1_score(y_test, y_pred, average='weighted')
print('[Test] f1: ', dt_test_f1)

[Test] Accuracy:  0.999778509925524
[Test] f1:  0.9997785177522924


In [87]:
# Test-split confusion matrix for the tree: rows are the true classes,
# columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[12784,     2],
       [    6, 23327]])

In [88]:
# Per-class precision / recall / F1 for the tree on the test split, printed
# to four decimals. classification_report is provided by the notebook's
# import cell, so it is not re-imported here.
# target_names are matched to the class labels in sorted order, so this
# assumes label 0 means normal traffic and label 1 means attack — TODO confirm
# against the label encoding.
target_names = ['is_Normal', 'is_Attack']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

              precision    recall  f1-score   support

   is_Normal     0.9995    0.9998    0.9997     12786
   is_Attack     0.9999    0.9997    0.9998     23333

    accuracy                         0.9998     36119
   macro avg     0.9997    0.9998    0.9998     36119
weighted avg     0.9998    0.9998    0.9998     36119



# Random Forest

In [89]:
# Random forest: an ensemble of many decision trees, here with scikit-learn's
# default hyperparameters.
# random_state is pinned because both the bootstrap samples and the feature
# subsets are drawn at random; without a seed every run trains a different
# forest and the reported metrics drift.
RF = RandomForestClassifier(random_state=42)
RF.fit(x_train, y_train)

In [90]:
# Predict labels for the validation split with the forest. y_pred is reused
# by the next cell for the F1 score.
y_pred = RF.predict(x_val)
# Echo the predictions as a quick visual check.
print(y_pred)

[1 1 1 ... 1 0 1]


In [91]:
# Check the forest on the validation split: accuracy through the estimator's
# own score(), weighted F1 from the y_pred produced by the previous cell.
rf_val_accuracy = RF.score(x_val, y_val)
print('[Val] Accuracy: ', rf_val_accuracy)
rf_val_f1 = f1_score(y_val, y_pred, average='weighted')
print('[Val] f1: ', rf_val_f1)

[Val] Accuracy:  0.9999307862679956
[Val] f1:  0.9999307879038949


In [92]:
# Final check of the forest on the held-out test split. y_pred is overwritten
# with the test predictions, which the cells below reuse.
y_pred = RF.predict(x_test)
rf_test_accuracy = RF.score(x_test, y_test)
print('[Test] Accuracy: ', rf_test_accuracy)
rf_test_f1 = f1_score(y_test, y_pred, average='weighted')
print('[Test] f1: ', rf_test_f1)

[Test] Accuracy:  0.999833882444143
[Test] f1:  0.9998338883142194


In [93]:
# Test-split confusion matrix for the forest: rows are the true classes,
# columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[12785,     1],
       [    5, 23328]])

In [94]:
# Per-class precision / recall / F1 for the forest on the test split, printed
# to four decimals. classification_report is provided by the notebook's
# import cell, so it is not re-imported here.
# target_names are matched to the class labels in sorted order, so this
# assumes label 0 means normal traffic and label 1 means attack — TODO confirm
# against the label encoding.
target_names = ['is_Normal', 'is_Attack']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

              precision    recall  f1-score   support

   is_Normal     0.9996    0.9999    0.9998     12786
   is_Attack     1.0000    0.9998    0.9999     23333

    accuracy                         0.9998     36119
   macro avg     0.9998    0.9999    0.9998     36119
weighted avg     0.9998    0.9998    0.9998     36119



# XGBoost

In [95]:
# Gradient-boosted ensemble of decision trees, trained with XGBoost's default
# hyperparameters. fit() hands back the estimator itself, so construction and
# training are chained; the bare name on the last line keeps the estimator
# echoed as the cell output.
XGB = XGBClassifier().fit(x_train, y_train)
XGB

In [96]:
# Predict labels for the validation split with the boosted model. y_pred is
# reused by the next cell for the F1 score.
y_pred = XGB.predict(x_val)
# Echo the predictions as a quick visual check.
print(y_pred)

[1 1 1 ... 1 0 1]


In [97]:
# Check the boosted model on the validation split: accuracy through the
# estimator's own score(), weighted F1 from the y_pred produced by the
# previous cell.
xgb_val_accuracy = XGB.score(x_val, y_val)
print('[Val] Accuracy: ', xgb_val_accuracy)
xgb_val_f1 = f1_score(y_val, y_pred, average='weighted')
print('[Val] f1: ', xgb_val_f1)

[Val] Accuracy:  0.9998615725359912
[Val] f1:  0.9998615790778133


In [98]:
# Final check of the boosted model on the held-out test split. y_pred is
# overwritten with the test predictions, which the cells below reuse.
y_pred = XGB.predict(x_test)
xgb_test_accuracy = XGB.score(x_test, y_test)
print('[Test] Accuracy: ', xgb_test_accuracy)
xgb_test_f1 = f1_score(y_test, y_pred, average='weighted')
print('[Test] f1: ', xgb_test_f1)

[Test] Accuracy:  0.9998892549627619
[Test] f1:  0.9998892569198945


In [99]:
# Test-split confusion matrix for the boosted model: rows are the true
# classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[12785,     1],
       [    3, 23330]])

In [100]:
# Per-class precision / recall / F1 for the boosted model on the test split,
# printed to four decimals. classification_report is provided by the
# notebook's import cell, so it is not re-imported here.
# target_names are matched to the class labels in sorted order, so this
# assumes label 0 means normal traffic and label 1 means attack — TODO confirm
# against the label encoding.
target_names = ['is_Normal', 'is_Attack']
print(classification_report(y_test, y_pred, target_names=target_names, digits=4))

              precision    recall  f1-score   support

   is_Normal     0.9998    0.9999    0.9998     12786
   is_Attack     1.0000    0.9999    0.9999     23333

    accuracy                         0.9999     36119
   macro avg     0.9999    0.9999    0.9999     36119
weighted avg     0.9999    0.9999    0.9999     36119

