In [18]:
from audioop import avg
import cmath
from fileinput import filename
from gc import enable
import sys
from matplotlib.pyplot import axis
import pandas as pd
from sklearn.model_selection import KFold
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, roc_auc_score, auc 
import warnings
import pickle
import tensorflow as tf
from fastprogress import fastprogress
warnings.filterwarnings("ignore")
from fastai.tabular.all import *
import os
os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

torch.backends.cudnn.benchmark = True
fastprogress.MAX_COLS = 80

#setting GPU Mem
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)
# #############################################

if torch.cuda.is_available():
    print("GPU enabling...")
    torch.cuda.device('cuda')
else:
	print("No GPU")



dataPath = ('../dataset/KDD-2018/')
modelPath = '../KDD/'
#fileName = 'data_balanced.csv'
df_train = pd.read_csv('../dataset/KDD-2018/train.csv')
df_test = pd.read_csv('../dataset/KDD-2018/test.csv')

labels = ['Probe', 'Dos', 'normal', 'R2L', 'U2R']

cat_names = ['service', 'protocol_type']
y_names = 'Label'
cont_names = ['duration', 'flag', 'src_bytes',
       'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
       'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
       'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
       'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate']

procs = [Categorify, FillMissing, Normalize]
y_block = CategoryBlock()

verbose = 1
dep_var = 'Label'
params = {#'gpu_id':0,
    'n_estimators':50, #train_ori
    #'n_estimators':100, #train balanced
    'tree_method':'exact',    
    #'tree_method':'gpu_hist',
    'max_depth':5,
    # 'enable_categorical':True,
    'objective':"multi:softmax", 
    'booster':"gbtree",     
    'learning_rate':0.2,  #train ori 
    #'learning_rate':0.2, #train balanced
    # 'silent':0, 
    # 'single_precision_histogram': True,
    'eval_metric':"mlogloss"    
}
print('XGBoost Training model...')
model = xgb.XGBClassifier(**params)
xgb_model = model.fit(df_train[df_train.columns[:-1]], df_train[y_names])
print('XGBoost Predicting...')
start = time.time()
xgb_preds = xgb_model.predict_proba(df_test[df_test.columns[:-1]])
elapsed_xgb = time.time() - start
print('XGBoost Finish...')

print('DNN Training model...')
acc1 = 0.8
acc3 = 0.1
step = 0
while acc3 < 0.5555:
    step = step + 1
    print('Training model...', step )
    print('Setting model...' )
        # create model
    dls = TabularDataLoaders.from_df(df_train, path=dataPath, cat_names=cat_names, cont_names=cont_names, procs=procs, y_names=y_names, bs=64 ) #, valid_idx=list(range(1,test.shape[0])))
    roc_auc = RocAuc(average='weighted')
    learn = tabular_learner(dls, layers=[400,200], metrics=[accuracy, Precision(average='weighted'), F1Score(average='weighted'), Recall(average='weighted'), roc_auc])
    learn.fit(1, 1e-2)
    dl = learn.dls.test_dl(df_test, with_labels=True, drop_last=False)
    
    print("DNN Predicting...")
    start = time.time()
    nn_preds, tests, clas_idx = learn.get_preds(dl=dl, with_loss=False, with_decoded=True)
    elapsed = time.time() - start
    
    print('Printing result of DNN')
    loss, acc, precision, f1, recall, roc = learn.validate(dl=dl)
    print('Accuracy: {:.2f}%; precision: {:.2f}%; F1: {:.2f}%; Recall: {:.2f}%; roc-auc: {:.2f}%; elapsed: {:.2f} s'.format(acc*100, precision*100, f1*100, recall*100, roc*100,  elapsed ))
    
    acc1 = accuracy(tensor(xgb_preds), tensor(tests))
    print('Accuracy of XGBoost: {:.2f}%' .format(acc1*100,))
    
    acc2 = accuracy(tensor(nn_preds), tensor(tests))
    print('Accuracy of DNN: {:.2f}%' .format(acc2*100,))
    
    start = time.time()
    avgs = (nn_preds + xgb_preds) / 2
    elapsed_ensemble = time.time() - start
    
    argmax = avgs.argmax(dim=1)    
    acc3 = accuracy(tensor(avgs), tensor(tests))
    print('Accuracy of Ensemble: {:.2f}%' .format(acc3*100,))

print("-----FINAL------")
print("XGBboost_Elapsed: ", elapsed_xgb)
print('DNN_Elapsed: ', elapsed)
print('Ensemble_Elapsed: ', elapsed_ensemble)
print("------>")
print('Printing XGBoost result')
precision1 = precision_score(tests, xgb_preds.argmax(axis=1), average='weighted')
f11 = f1_score(tests, xgb_preds.argmax(axis=1), average='weighted')
recall1 = recall_score(tests, xgb_preds.argmax(axis=1), average='weighted')
print('Accuracy: {:.2f}%; precision: {:.2f}%; F1: {:.2f}%; Recall: {:.2f}%; elapsed: {:.2f} s'.format(acc1*100, precision1*100, f11*100, recall1*100,  elapsed_xgb ))
print(classification_report(xgb_preds.argmax(axis=1),tests))
cm = confusion_matrix(tests,np.argmax(xgb_preds, axis=1))
print(cm)
print("------>")
print('Printing result of Ensemble learning')
accuracy3 = accuracy_score(tests, avgs.argmax(axis=1))
precision3 = precision_score(tests, avgs.argmax(axis=1), average='weighted')
f13 = f1_score(tests, avgs.argmax(axis=1), average='weighted')
recall3 = recall_score(tests, avgs.argmax(axis=1), average='weighted')
print('Accuracy: {:.2f}%; precision: {:.2f}%; F1: {:.2f}%; Recall: {:.2f}%; elapsed: {:.2f} s'.format(accuracy3*100, precision3*100, f13*100, recall3*100,  elapsed_ensemble ))
print("Elapsed_Ensemble: ", elapsed_ensemble)
print(classification_report(avgs.argmax(axis=1), tests))
cm = confusion_matrix(tests,avgs.argmax(axis=1))
print(cm)

No GPU
XGBoost Training model...
XGBoost Predicting...
XGBoost Finish...
DNN Training model...
Training model... 1
Setting model...


epoch,train_loss,valid_loss,accuracy,precision_score,f1_score,recall_score,roc_auc_score,time
0,0.044141,0.030755,0.993512,0.993522,0.993509,0.993512,0.999491,01:19


DNN Predicting...


Printing result of DNN


Accuracy: 98.55%; precision: 98.66%; F1: 98.59%; Recall: 98.55%; roc-auc: 99.81%; elapsed: 3.65 s
Accuracy of XGBoost: 99.81%
Accuracy of DNN: 98.55%
Accuracy of Ensemble: 99.52%
-----FINAL------
XGBboost_Elapsed:  0.028514623641967773
DNN_Elapsed:  3.6484274864196777
Ensemble_Elapsed:  0.002166271209716797
------>
Printing XGBoost result
Accuracy: 99.81%; precision: 99.80%; F1: 99.81%; Recall: 99.81%; elapsed: 0.03 s
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5946
           1       0.99      1.00      1.00      2489
           2       0.97      0.98      0.98       189
           3       0.58      0.70      0.64        10
           4       1.00      1.00      1.00      5996

    accuracy                           1.00     14630
   macro avg       0.91      0.94      0.92     14630
weighted avg       1.00      1.00      1.00     14630

[[5946    0    0    0    0]
 [   0 2487    0    1   12]
 [   0    0  186    1    4]
 [   

In [1]:
import pandas as pd

#fileName = 'data_balanced.csv'
df_train = pd.read_csv('../dataset/KDD-2018/train.csv')
df_test = pd.read_csv('../dataset/KDD-2018/test.csv')
print('Train balanced dataset:')
print(df_train['Label'].value_counts())

print('Test dataset:')
print(df_test['Label'].value_counts())

  from pandas import MultiIndex, Int64Index


Train balanced dataset:
Dos       14054
U2R       14019
normal    14019
Probe     14019
R2L       14019
Name: Label, dtype: int64
Test dataset:
normal    5981
Dos       5946
Probe     2500
R2L        191
U2R         12
Name: Label, dtype: int64
