In [11]:
from audioop import avg
import cmath
from fileinput import filename
from gc import enable
import sys
from matplotlib.pyplot import axis
import pandas as pd
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score, auc 
import warnings
import pickle
import tensorflow as tf
from fastprogress import fastprogress
warnings.filterwarnings("ignore")
from fastai.tabular.all import *
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

torch.backends.cudnn.benchmark = True
fastprogress.MAX_COLS = 80


#setting GPU Mem
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
# #############################################

if torch.cuda.is_available():
    print("GPU enabling...")
    torch.cuda.device('cuda')
else:
	print("No GPU")
    
dataPath = ('../dataset/CIC-2018/')
modelPath = '../CIC/'
#fileName = 'data_balanced.csv'
df_train = pd.read_csv('../dataset/CIC-2018/train_ori.csv')
df_test = pd.read_csv('../dataset/CIC-2018/test.csv')

labels = ['Benign', 'Bot', 'DoS attacks-GoldenEye', 'Infilteration', 
          'DoS attacks-SlowHTTPTest','DoS attacks-Slowloris',
          'DDOS attack-LOIC-UDP']

cat_names = ['Dst Port', 'Protocol']
y_names = 'Label'
cont_names = ['Flow Duration', 'Tot Fwd Pkts',
              'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
              'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
              'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
              'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
              'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
              'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
              'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
              'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
              'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
              'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
              'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
              'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
              'CWE Flag Count', 'ECE Flag Cnt', 'Down/Up Ratio', 'Pkt Size Avg',
              'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg',
              'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg',
              'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Subflow Fwd Pkts',
              'Subflow Fwd Byts', 'Subflow Bwd Pkts', 'Subflow Bwd Byts',
              'Init Fwd Win Byts', 'Init Bwd Win Byts', 'Fwd Act Data Pkts',
              'Fwd Seg Size Min', 'Active Mean', 'Active Std', 'Active Max',
              'Active Min', 'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min']

procs = [Categorify, FillMissing, Normalize]
y_block = CategoryBlock()

verbose = 2
dep_var = 'Label'
params = {#'gpu_id':0,
    'n_estimators':100,
    'tree_method':'exact',    
    #'tree_method':'gpu_hist',
    'max_depth':5,
    # 'enable_categorical':True,
    'objective':"multi:softmax", 
    'booster':"gbtree",     
    'learning_rate':0.04, #original    
    #'learning_rate':0.2, #balanced
    # 'silent':0, 
    # 'single_precision_histogram': True,
    'eval_metric':"mlogloss"    
}
print('XGBoost Training model...')
model = xgb.XGBClassifier(**params)
xgb_model = model.fit(df_train[df_train.columns[:-1]], df_train[y_names])
print('XGBoost Predicting...')
start = time.time()
xgb_preds = xgb_model.predict_proba(df_test[df_test.columns[:-1]])
elapsed_xgb = time.time() - start
print('XGBoost Finish...')

print('DNN Training model...')
acc1 = 0.8
acc3 = 0.1
step = 0
while acc3 < 0.5555:
    step = step + 1
    print('Training model...', step )
    print('Setting model...' )
        # create model
    dls = TabularDataLoaders.from_df(df_train, path=dataPath, cat_names=cat_names, cont_names=cont_names, procs=procs, y_names=y_names, bs=64 ) #, valid_idx=list(range(1,test.shape[0])))
    roc_auc = RocAuc(average='weighted')
    learn = tabular_learner(dls, layers=[400,200], metrics=[accuracy, Precision(average='weighted'), F1Score(average='weighted'), Recall(average='weighted'), roc_auc])
    learn.fit(1, 1e-2)
    dl = learn.dls.test_dl(df_test, with_labels=True, drop_last=False)
    
    print("DNN Predicting...")
    start = time.time()
    nn_preds, tests, clas_idx = learn.get_preds(dl=dl, with_loss=False, with_decoded=True)
    elapsed = time.time() - start
    
    print('Printing result of DNN')
    loss, acc, precision, f1, recall, roc = learn.validate(dl=dl)
    print('Accuracy: {:.2f}%; precision: {:.2f}%; F1: {:.2f}%; Recall: {:.2f}%; roc-auc: {:.2f}%; elapsed: {:.2f} s'.format(acc*100, precision*100, f1*100, recall*100, roc*100,  elapsed ))
    
    acc1 = accuracy(tensor(xgb_preds), tensor(tests))
    print('Accuracy of XGBoost: {:.2f}%' .format(acc1*100,))
    
    acc2 = accuracy(tensor(nn_preds), tensor(tests))
    print('Accuracy of DNN: {:.2f}%' .format(acc2*100,))
    
    start = time.time()
    avgs = (nn_preds + xgb_preds) / 2
    elapsed_ensemble = time.time() - start
    
    argmax = avgs.argmax(dim=1)    
    acc3 = accuracy(tensor(avgs), tensor(tests))
    print('Accuracy of Ensemble: {:.2f}%' .format(acc3*100,))

print("-----FINAL------")
print("XGBboost_Elapsed: ", elapsed_xgb)
print('DNN_Elapsed: ', elapsed)
print('Ensemble_Elapsed: ', elapsed_ensemble)
print("------>")
print('Printing XGBoost result')
precision1 = precision_score(tests, xgb_preds.argmax(axis=1), average='weighted')
f11 = f1_score(tests, xgb_preds.argmax(axis=1), average='weighted')
recall1 = recall_score(tests, xgb_preds.argmax(axis=1), average='weighted')
print('Accuracy: {:.2f}%; precision: {:.2f}%; F1: {:.2f}%; Recall: {:.2f}%; elapsed: {:.2f} s'.format(acc1*100, precision1*100, f11*100, recall1*100,  elapsed_xgb ))
print(classification_report(xgb_preds.argmax(axis=1),tests))
cm = confusion_matrix(tests,np.argmax(xgb_preds, axis=1))
print(cm)
print("------>")
print('Printing result of Ensemble learning')
accuracy3 = accuracy_score(tests, avgs.argmax(axis=1))
precision3 = precision_score(tests, avgs.argmax(axis=1), average='weighted')
f13 = f1_score(tests, avgs.argmax(axis=1), average='weighted')
recall3 = recall_score(tests, avgs.argmax(axis=1), average='weighted')
print('Accuracy: {:.2f}%; precision: {:.2f}%; F1: {:.2f}%; Recall: {:.2f}%; elapsed: {:.2f} s'.format(accuracy3*100, precision3*100, f13*100, recall3*100,  elapsed_ensemble ))
print("Elapsed_Ensemble: ", elapsed_ensemble)
print(classification_report(avgs.argmax(axis=1), tests))
cm = confusion_matrix(tests,avgs.argmax(axis=1))
print(cm)

No GPU
XGBoost Training model...
XGBoost Predicting...
XGBoost Finish...
DNN Training model...
Training model... 1
Setting model...


epoch,train_loss,valid_loss,accuracy,precision_score,f1_score,recall_score,roc_auc_score,time
0,0.114741,3.352518,0.945187,0.94758,0.944979,0.945187,0.992861,08:02


DNN Predicting...


Printing result of DNN


Accuracy: 94.45%; precision: 94.60%; F1: 94.42%; Recall: 94.45%; roc-auc: 99.25%; elapsed: 25.74 s
Accuracy of XGBoost: 99.93%
Accuracy of DNN: 94.45%
Accuracy of Ensemble: 99.29%
-----FINAL------
XGBboost_Elapsed:  0.3983166217803955
DNN_Elapsed:  25.74487066268921
Ensemble_Elapsed:  0.011131763458251953
------>
Printing XGBoost result
Accuracy: 99.93%; precision: 99.93%; F1: 99.93%; Recall: 99.93%; elapsed: 0.40 s
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5984
           1       1.00      1.00      1.00      5916
           2       1.00      0.95      0.97        82
           3       1.00      1.00      1.00        29
           4       1.00      1.00      1.00      6053
           5       1.00      1.00      1.00       336
           6       1.00      1.00      1.00      5954
           7       1.00      1.00      1.00      6031
           8       1.00      1.00      1.00      4082
           9       1.00      1.00      

In [None]:
df_train = pd.read_csv('../dataset/CIC-2018/train_full.csv')

print('CIC-2018 Train original dataset:')
print(df_train['Label'].value_counts())

print('CIC-2018 Test dataset:')
print(df_test['Label'].value_counts())