In [1]:
#Author     : Fan Li
#Update Date: 30/11/2022
#Version    : 3.1

#Intrusion Detection System Using Machine Learning 
#RandomForest、GBDT、Lightgbm、Xgboost、CatBoost 


#Proposed Framework 
#stacking (Xgboost + LogisticRegression) with SMOTE 

#Original Data (cic ids 2017)

#Synthenic Minority Over-sampling Technique (SMOTE)


#hardware
#

#System
#Ubuntu 20.04 server 64bit

In [2]:
#1. Loading Module
#basic
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import gc    #memory manage

import math
import numpy as np
import pandas as pd

#draw figure
import seaborn as sns
import matplotlib.pyplot as plt

#preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

#Feature Selection

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

#model
from sklearn.neighbors import KNeighborsClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV

#metrics
from sklearn import model_selection,feature_selection,utils,ensemble,metrics

#over-sampling and under sampling
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE

#time
from time import time
import datetime

In [3]:
#2. Reading Dataset
base_path="../../data/"
# Monday benign
f1_path=base_path+"Monday-WorkingHours.pcap_ISCX.csv"
f1=pd.read_csv(f1_path)
#Tuesday benign
f2_path=base_path+"Tuesday-WorkingHours.pcap_ISCX.csv"
f2=pd.read_csv(f2_path)
#Wed benign
f3_path=base_path+"Wednesday-workingHours.pcap_ISCX.csv"
f3=pd.read_csv(f3_path)
#Thur WebAttack
f4_path1=base_path+"Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv"
f41=pd.read_csv(f4_path1)
f4_path2=base_path+"Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
f42=pd.read_csv(f4_path2)
#Fri Benign DDos PortScan
f5_path1=base_path+"Friday-WorkingHours-Morning.pcap_ISCX.csv"
f51=pd.read_csv(f5_path1)

f5_path2=base_path+"Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"
f52=pd.read_csv(f5_path2)

f5_path3=base_path+"Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
f53=pd.read_csv(f5_path3)

In [None]:
data=pd.concat((f1,f2,f3,f41,f42,f51,f52,f53))
data.reset_index(inplace=True,drop=True) # duplicate index exist, reset index
data.shape

In [None]:
data

In [None]:
# too few samples in some subclasses of Web Attack, and they are combined into one class.
#Tuesday Brute Force
data[' Label'].replace("FTP-Patator","BruteForce",inplace=True,regex=True)
data[' Label'].replace("SSH-Patator","BruteForce",inplace=True,regex=True)
#Thursday Web Attack
data[' Label'].replace("Web Attack .*","Web Attack",inplace=True,regex=True)
#DoS 
data[' Label'].replace("DoS .*","DoS",inplace=True,regex=True)
data[' Label'].replace("DDoS","DoS",inplace=True,regex=True)

In [None]:
target_count_new=data[' Label'].value_counts(0)
target_count_new

In [None]:
labels_new,values_new=zip(*sorted(target_count_new.items()))

In [None]:
plt.figure(dpi=300,figsize=(15,6))
plt.bar(labels_new,values_new)
for x_new,y_new in zip(labels_new,values_new):
    plt.text(x_new,y_new,y_new,ha="center",va="bottom")
plt.xticks(fontsize=12,rotation=-90)
plt.xlabel("8 Categories",fontsize=16)
plt.ylabel("Count",fontsize=16)
plt.show()

In [None]:
#3. Preprocessing
#3.1 remove useless data
#remove duplicate row
data=data.drop_duplicates()
#remove rows with null value and infinite value
data=data.replace(np.inf,np.nan)
data=data.dropna()

In [None]:
data[' Label'].value_counts()

In [None]:
X=data.iloc[:,:-1]
y=data.iloc[:,-1]
print(Counter(y))

In [None]:
#Label Encoding
le=LabelEncoder()
le=le.fit(y)
le.classes_
y=le.transform(y)
print(Counter(y))

In [None]:
#Ram Clean
del data
gc.collect()

In [None]:
#3.2 Splitting data
X_train,X_test,y_train,y_test=model_selection.train_test_split(X,y,test_size=0.33,random_state=123456)

In [None]:
print(Counter(y_train))

In [None]:
#3.3 SMOTE
time0=time()
sm=SMOTE(k_neighbors=9,random_state=42,n_jobs=-1)
X_train,y_train=sm.fit_resample(X_train,y_train)

In [None]:
print("Time of SMOTE : {:.3f} s".format(time()-time0))

In [None]:
#4. Training
#4.1 Estimators (level 0)

In [None]:
rfc=RandomForestClassifier(random_state=12345)

In [None]:
xtc=ExtraTreesClassifier(random_state=12345)

In [None]:
gbc=GradientBoostingClassifier(n_estimators=10,random_state=123456)

In [None]:
xgbc=XGBClassifier(n_estimators=100,random_state=12345)

In [None]:
lgbc=LGBMClassifier(random_state=12345)

In [None]:
cbc=CatBoostClassifier(random_state=12345)

In [None]:
#estimators=[("GradientBoostingClassifier",gbc),("XgboostClassifier",xgbc),("LightGBM",lgbc),("CatBoostingClassifier",cbc)]
#estimators=[("XgboostClassifier",xgbc),("LightGBM",lgbc),("CatBoostingClassifier",cbc)]
estimators=[("XgboostClassifier",xgbc)]
#final_estimator=RandomForestClassifier(random_state=12345,n_jobs=-1)
final_estimator=LogisticRegressionCV(n_jobs=-1)

In [None]:
#scv = StratifiedKFold(n_splits=5)

In [None]:
clf=StackingClassifier(estimators=estimators,final_estimator=final_estimator)

In [None]:
time1=time()
clf.fit(X_train,y_train)
print("Time of Training : {:.3f} s".format(time()-time1))

In [41]:
y_pred_train=clf.predict(X_train)
y_pred_test=clf.predict(X_test)

In [42]:
y_train_en=OneHotEncoder().fit_transform(pd.DataFrame(y_train)).toarray()

In [43]:
y_pred_train_en=OneHotEncoder().fit_transform(pd.DataFrame(y_pred_train)).toarray()

In [44]:
y_train_en,y_pred_train_en

(array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]]),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 1.]]))

In [45]:
y_test_en=OneHotEncoder().fit_transform(pd.DataFrame(y_test)).toarray()

In [46]:
y_pred_test_en=OneHotEncoder().fit_transform(pd.DataFrame(y_pred_test)).toarray()

In [47]:
y_test_en,y_pred_test_en

(array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]),
 array([[1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.],
        ...,
        [1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [1., 0., 0., ..., 0., 0., 0.]]))

In [48]:
#5. Result 
#5.1 Train
print("==Train==")
result=metrics.confusion_matrix(y_train,y_pred_train)
report=metrics.classification_report(y_train,y_pred_train,zero_division=1)

print(result)
print(report)

print("Accuracy Score = ",metrics.balanced_accuracy_score(y_train,y_pred_train))
print("Precison       = ",metrics.precision_score(y_train,y_pred_train,average='macro'))
print("Recall         = ",metrics.recall_score(y_train,y_pred_train,average='macro'))
print("F1_score       = ",metrics.f1_score(y_train,y_pred_train,average='macro'))
print("Auc score      = ",metrics.roc_auc_score(y_train_en,y_pred_train_en,average='macro'))

==Train==
[[1387085     664       0     238       0       0     627       1]
 [     82 1388533       0       0       0       0       0       0]
 [      0       0 1388615       0       0       0       0       0]
 [     85       0       0 1388487       0       0      35       8]
 [      0       0       0       0 1388615       0       0       0]
 [      0       0       0       0       0 1388615       0       0]
 [      1       0       0      27       0       0 1388573      14]
 [      0       0       0       1       0       0       3 1388611]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1388615
           1       1.00      1.00      1.00   1388615
           2       1.00      1.00      1.00   1388615
           3       1.00      1.00      1.00   1388615
           4       1.00      1.00      1.00   1388615
           5       1.00      1.00      1.00   1388615
           6       1.00      1.00      1.00   1388615
           7       1

In [49]:
precision,recall,fscore,none= metrics.precision_recall_fscore_support(y_train, y_pred_train, average='weighted') 
print("Accuracy Score     : "+(str(metrics.accuracy_score(y_train,y_pred_train))))
print('Weighted Precision : '+(str(precision)))
print('Weighted Recall    : '+(str(recall)))
print('Weighted F1-score  : '+(str(fscore)))

Accuracy Score     : 0.99983922829582
Weighted Precision : 0.9998392677419171
Weighted Recall    : 0.99983922829582
Weighted F1-score  : 0.9998392058583113


In [50]:
#5.2 Test
print("==Test==")
result=metrics.confusion_matrix(y_test,y_pred_test)
report=metrics.classification_report(y_test,y_pred_test,zero_division=1)

print(result)
print(report)

print("Accuracy Score = ",metrics.balanced_accuracy_score(y_test,y_pred_test))
print("Precison       = ",metrics.precision_score(y_test,y_pred_test,average='macro'))
print("Recall         = ",metrics.recall_score(y_test,y_pred_test,average='macro'))
print("F1_score       = ",metrics.f1_score(y_test,y_pred_test,average='macro'))
print("Auc score      = ",metrics.roc_auc_score(y_test_en,y_pred_test_en,average='macro'))

==Test==
[[682936    424      1    145      0      2    351      2]
 [    10    645      0      0      0      0      0      0]
 [     1      0   3048      0      0      0      0      1]
 [    14      0      0 106267      0      0     10      1]
 [     0      0      0      0      1      0      0      0]
 [     1      0      0      0      0     15      0      0]
 [     3      0      0      4      0      0  29825      2]
 [     1      0      0      1      0      0      1    700]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    683861
           1       0.60      0.98      0.75       655
           2       1.00      1.00      1.00      3050
           3       1.00      1.00      1.00    106292
           4       1.00      1.00      1.00         1
           5       0.88      0.94      0.91        16
           6       0.99      1.00      0.99     29834
           7       0.99      1.00      0.99       703

    accuracy                  

In [51]:
tprecision,trecall,tfscore,none= metrics.precision_recall_fscore_support(y_test, y_pred_test, average='weighted')
print("Accuracy Score     : "+(str(metrics.accuracy_score(y_test,y_pred_test))))
print('Weighted Precision : '+(str(tprecision)))
print('Weighted Recall    : '+(str(trecall)))
print('Weighted F1-score  : '+(str(tfscore)))

Accuracy Score     : 0.9988173389033639
Weighted Precision : 0.9990219922092372
Weighted Recall    : 0.9988173389033639
Weighted F1-score  : 0.9988816207522491


### Xgboost + LogisticRegression (Original Data)
Accuracy Score =  0.9494163118894287  
Precison       =  0.9817017338340577  
Recall         =  0.9494163118894287  
F1_score       =  0.9641099573831886  
Auc score      =  0.9745784207103945  

### Xgboost + Logistic Regression ( SMOTE)  
Accuracy Score =  0.9894275221209294  
Precison       =  0.9329310777743772  
Recall         =  0.9894275221209294  
F1_score       =  0.9553460985510285  
Auc score      =  0.994626038893581  