In [None]:
import csv
import glob
import os

import numpy as np
import pandas as pd
from google.colab import files
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from tabulate import tabulate

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
# The dataset CSV is read from a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the combined CSV from the mounted Drive.
# skipinitialspace=True skips the blanks that follow each delimiter.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk.
# NOTE(review): the truncated warning in this cell's saved output looks like a
# mixed-dtype DtypeWarning, which this option addresses - confirm.
data = pd.read_csv(
    '/content/drive/My Drive/combined_csv2.csv',
    delimiter=',',
    skipinitialspace=True,
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Working frame with lower-cased column names, so later cells can refer to
# columns without worrying about the CSV header's capitalisation.
df = pd.DataFrame(data)
df.columns = [str(column).lower() for column in data.columns]
# Name of the column holding the class label.
target_col = 'label'
print(df.columns)

Index(['unnamed: 0', 'flow id', 'source ip', 'source port', 'destination ip',
       'destination port', 'protocol', 'timestamp', 'flow duration',
       'total fwd packets', 'total backward packets',
       'total length of fwd packets', 'total length of bwd packets',
       'fwd packet length max', 'fwd packet length min',
       'fwd packet length mean', 'fwd packet length std',
       'bwd packet length max', 'bwd packet length min',
       'bwd packet length mean', 'bwd packet length std', 'flow bytes/s',
       'flow packets/s', 'flow iat mean', 'flow iat std', 'flow iat max',
       'flow iat min', 'fwd iat total', 'fwd iat mean', 'fwd iat std',
       'fwd iat max', 'fwd iat min', 'bwd iat total', 'bwd iat mean',
       'bwd iat std', 'bwd iat max', 'bwd iat min', 'fwd psh flags',
       'bwd psh flags', 'fwd urg flags', 'bwd urg flags', 'fwd header length',
       'bwd header length', 'fwd packets/s', 'bwd packets/s',
       'min packet length', 'max packet length', 'packet le

In [None]:
print(df.shape)

# Keep only numeric feature columns:
#   * int64 columns are cast to float,
#   * float64 columns are left alone,
#   * every other dtype is dropped, except the target column.
non_floats = []
for name, dtype in df.dtypes.items():
    if dtype == "int64":
        df[name] = df[name].astype(float)
    elif dtype != "float64" and name != target_col:
        non_floats.append(name)

df = df.drop(columns=non_floats)
df.info()
print(df.shape)

(59990, 88)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59990 entries, 0 to 59989
Data columns (total 81 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   unnamed: 0                   59990 non-null  float64
 1   source port                  59990 non-null  float64
 2   destination port             59990 non-null  float64
 3   protocol                     59990 non-null  float64
 4   flow duration                59990 non-null  float64
 5   total fwd packets            59990 non-null  float64
 6   total backward packets       59990 non-null  float64
 7   total length of fwd packets  59990 non-null  float64
 8   total length of bwd packets  59990 non-null  float64
 9   fwd packet length max        59990 non-null  float64
 10  fwd packet length min        59990 non-null  float64
 11  fwd packet length mean       59990 non-null  float64
 12  fwd packet length std        59990 non-null  float64
 13  bwd 

In [None]:
# Distinct class labels present in the target column.
a = pd.unique(df[target_col])
print(a)

['DrDoS_DNS' 'BENIGN' 'DrDoS_LDAP' 'DrDoS_MSSQL' 'DrDoS_NetBIOS'
 'DrDoS_NTP' 'DrDoS_SNMP']


In [None]:
# Drop every row that contains NaN or +/-infinity in any column.
# axis is passed by keyword: the positional form .any(1) is rejected by pandas >= 2.0.
finite_rows = ~df.isin([np.nan, np.inf, -np.inf]).any(axis=1)
# .copy() makes the filtered frame independent, so the label assignment below
# does not write into a slice of the unfiltered data (SettingWithCopyWarning).
df = df[finite_rows].copy()

# Integer code for each traffic class (multiclass classification target).
LABEL_CODES = {
    'BENIGN': 0,
    'DrDoS_DNS': 1,
    'DrDoS_LDAP': 2,
    'DrDoS_MSSQL': 3,
    'DrDoS_NetBIOS': 4,
    'DrDoS_NTP': 5,
    'DrDoS_SNMP': 6,
}
# Values without an entry in LABEL_CODES are left unchanged, so re-running the
# cell on already-encoded labels is harmless.
df[target_col] = df[target_col].map(lambda label: LABEL_CODES.get(label, label))
print(df.head())

   unnamed: 0  source port  destination port  ...  idle min  inbound  label
0       425.0        634.0           60495.0  ...       0.0      1.0      1
1       430.0        634.0           60495.0  ...       0.0      0.0      1
2      1654.0        634.0           46391.0  ...       0.0      1.0      1
3      2927.0        634.0           11894.0  ...       0.0      1.0      1
4       694.0        634.0           27878.0  ...       0.0      1.0      1

[5 rows x 81 columns]


In [None]:
# Split into feature matrix and target by column name rather than by the
# hard-coded positions 0:80 / 80, which would silently pick the wrong columns
# if the number or order of columns ever changed.
# NOTE(review): 'unnamed: 0' (apparently the row index saved with the CSV) is
# still part of X - consider dropping it; confirm with the data owner.
X = df.drop(columns=[target_col])
y = df[target_col]

#  Feature Selection

### Feature selection using SelectKBest

In [None]:
# Univariate selection: keep the 40 features with the highest f_classif score.
selector = SelectKBest(f_classif, k = 40)
X_new = selector.fit_transform(X, y)

# Boolean mask over X's columns marking the selected features.
selected_mask = selector.get_support()
names = X.columns.values[selected_mask]
scores = selector.scores_[selected_mask]
ns_df = pd.DataFrame({'Feat_names': names, 'F_Scores': scores})

# Highest score first; ties are ordered alphabetically by feature name.
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)
ns_df_sorted.to_excel('multiKbest.xlsx')

  f = msb / msw


                     Feat_names      F_Scores
33         avg fwd segment size  88255.924949
8        fwd packet length mean  88255.924949
23            min packet length  85041.967924
7         fwd packet length min  84672.433032
32          average packet size  73849.406425
25           packet length mean  70927.227555
6         fwd packet length max  43467.815425
1                   source port  24050.149376
13                 flow bytes/s  23602.284422
39                      inbound  22047.667954
35            subflow fwd bytes  20207.309944
5   total length of fwd packets  20207.309944
38             act_data_pkt_fwd  19892.764067
3                      protocol  16227.677657
0                    unnamed: 0  15297.163455
22                fwd packets/s  14274.539267
14               flow packets/s  14053.508216
31                down/up ratio   8104.784129
29               urg flag count   6173.076123
30               cwe flag count   4658.683379
24            max packet length   

### **Feature Selection using RFE with Random Forest as its estimator**

In [None]:
# Recursive feature elimination down to 40 features, using a random forest
# as the estimator that ranks the features.
# random_state is pinned so the selected feature set is reproducible after a
# kernel restart; without it every run seeds the forest differently.
clf = RandomForestClassifier(random_state=0)
rfe = RFE(clf, n_features_to_select=40)
# Cast the class labels to int before fitting the classifier.
y = y.astype('int')
rfe.fit(X, y)

RFE(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                     class_weight=None, criterion='gini',
                                     max_depth=None, max_features='auto',
                                     max_leaf_nodes=None, max_samples=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=100, n_jobs=None,
                                     oob_score=False, random_state=None,
                                     verbose=0, warm_start=False),
    n_features_to_select=40, step=1, verbose=0)

In [None]:
# Selection mask and elimination ranking, one array per line.
print(rfe.support_, rfe.ranking_, sep='\n')

[ True  True  True  True  True  True  True  True False  True  True  True
 False False False False False  True  True  True  True  True  True  True
  True  True  True  True  True False False False False False False False
 False  True  True  True  True  True  True  True False False False False
 False False False False False False  True  True False False False False
 False False False  True  True  True False  True  True  True  True False
 False False False False False False False  True]
[ 1  1  1  1  1  1  1  1 17  1  1  1 13 11  8 21 20  1  1  1  1  1  1  1
  1  1  1  1  1  6  7  4  9 19 32 39 33  1  1  1  1  1  1  1  2 31 24 23
 30 10  3 16 40  5  1  1 14 38 34 41 35 37 36  1  1  1 12  1  1  1  1 18
 29 15 22 28 27 25 26  1]


In [None]:
# Table of the RFE-selected features together with their RFE ranking.
# The value column keeps the header 'F_Scores' although it holds rankings.
kept_mask = rfe.get_support()
names = X.columns.values[kept_mask]
scores = rfe.ranking_[kept_mask]
ns_df = pd.DataFrame({'Feat_names': names, 'F_Scores': scores})

# Sort by value (descending), then by feature name (ascending).
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)
ns_df_sorted.to_excel('multiRFE.xlsx')

                     Feat_names  F_Scores
37             act_data_pkt_fwd         1
30          average packet size         1
31         avg fwd segment size         1
23            bwd header length         1
22                bwd iat total         1
25                bwd packets/s         1
2              destination port         1
11                 flow bytes/s         1
4                 flow duration         1
15                 flow iat max         1
13                flow iat mean         1
16                 flow iat min         1
14                 flow iat std         1
12               flow packets/s         1
20                  fwd iat max         1
18                 fwd iat mean         1
21                  fwd iat min         1
19                  fwd iat std         1
17                fwd iat total         1
8         fwd packet length max         1
10       fwd packet length mean         1
9         fwd packet length min         1
24                fwd packets/s   

In [None]:
# Importances of the estimator that RFE fitted on the selected features,
# paired with the names of those selected features.
names = X.columns.values[rfe.get_support()]
scores = rfe.estimator_.feature_importances_
ns_df = pd.DataFrame({'Feat_names': names, 'F_Scores': scores})

# Most important feature first; ties ordered alphabetically by name.
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)
ns_df_sorted.to_excel('multiRFEscore.xlsx')

                     Feat_names  F_Scores
1                   source port  0.155529
0                    unnamed: 0  0.123280
31         avg fwd segment size  0.075087
26            min packet length  0.056841
10       fwd packet length mean  0.051492
9         fwd packet length min  0.050787
8         fwd packet length max  0.042727
30          average packet size  0.041894
28           packet length mean  0.038435
38         min_seg_size_forward  0.034779
27            max packet length  0.034671
2              destination port  0.029060
11                 flow bytes/s  0.027933
33            subflow fwd bytes  0.027761
7   total length of fwd packets  0.024003
4                 flow duration  0.015484
24                fwd packets/s  0.014224
14                 flow iat std  0.013881
39                      inbound  0.012040
19                  fwd iat std  0.011171
32          subflow fwd packets  0.010721
15                 flow iat max  0.010661
12               flow packets/s  0

In [None]:
# One row per feature in X: True when RFE kept the feature.
pd.DataFrame({'important': rfe.support_}, index=X.columns)

Unnamed: 0,important
unnamed: 0,True
source port,True
destination port,True
protocol,True
flow duration,True
...,...
idle mean,False
idle std,False
idle max,False
idle min,False


### **Feature Selection using Random forest**

In [None]:
# Fit a random forest on all features to read its feature importances.
# random_state is pinned so the importances are reproducible after a kernel
# restart; without it every run seeds the forest differently.
RF_model = RandomForestClassifier(random_state=0)
RF_model.fit(X,y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
# RF_model was fitted on every column of X, so feature_importances_ has one
# entry per column of X, in column order. The names must therefore be all of
# X's columns. Masking them with the RFE selection (as before) paired the
# names of the RFE-selected features with the importances of the first
# columns of X and silently truncated the table.
names = X.columns.values
scores = RF_model.feature_importances_
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'F_Scores'])
# Most important feature first; ties ordered alphabetically by name.
ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'], ascending = [False, True])
print(ns_df_sorted)
ns_df_sorted.to_excel('multiRF.xlsx')

                     Feat_names  F_Scores
1                   source port  0.138221
0                    unnamed: 0  0.110655
10       fwd packet length mean  0.043973
7   total length of fwd packets  0.041650
11                 flow bytes/s  0.036292
9         fwd packet length min  0.031901
2              destination port  0.030950
17                fwd iat total  0.026078
4                 flow duration  0.025687
21                  fwd iat min  0.023253
18                 fwd iat mean  0.014772
38         min_seg_size_forward  0.014216
25                bwd packets/s  0.013722
20                  fwd iat max  0.009461
19                  fwd iat std  0.008928
5             total fwd packets  0.008901
24                fwd packets/s  0.006678
6        total backward packets  0.006051
23            bwd header length  0.005349
29            packet length std  0.004931
39                      inbound  0.004733
26            min packet length  0.004210
37             act_data_pkt_fwd  0

### **Feature selection using RFE with Decision tree as estimator**

In [None]:
# Recursive feature elimination down to 40 features, using a decision tree
# as the estimator that ranks the features.
# random_state is pinned so the selected feature set is reproducible after a
# kernel restart.
DT_clf = DecisionTreeClassifier(random_state=0)
rfe = RFE(DT_clf, n_features_to_select=40)
# Cast the class labels to int before fitting the classifier.
y = y.astype('int')
rfe.fit(X, y)

RFE(estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                     criterion='gini', max_depth=None,
                                     max_features=None, max_leaf_nodes=None,
                                     min_impurity_decrease=0.0,
                                     min_impurity_split=None,
                                     min_samples_leaf=1, min_samples_split=2,
                                     min_weight_fraction_leaf=0.0,
                                     presort='deprecated', random_state=None,
                                     splitter='best'),
    n_features_to_select=40, step=1, verbose=0)

In [None]:
# Table of the features kept by the decision-tree RFE with their RFE ranking.
# The value column keeps the header 'F_Scores' although it holds rankings.
kept_mask = rfe.get_support()
names = X.columns.values[kept_mask]
scores = rfe.ranking_[kept_mask]
ns_df = pd.DataFrame({'Feat_names': names, 'F_Scores': scores})

# Sort by value (descending), then by feature name (ascending).
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)
ns_df_sorted.to_excel('multiRFE_DT.xlsx')

                     Feat_names  F_Scores
30               ack flag count         1
39                   active min         1
32          average packet size         1
33         avg fwd segment size         1
22                  bwd iat max         1
23                  bwd iat min         1
21                  bwd iat std         1
20                bwd iat total         1
9         bwd packet length min         1
25                bwd packets/s         1
2              destination port         1
31                down/up ratio         1
10                 flow bytes/s         1
3                 flow duration         1
13                 flow iat max         1
12                flow iat mean         1
14                 flow iat min         1
11               flow packets/s         1
18                  fwd iat max         1
16                 fwd iat mean         1
19                  fwd iat min         1
17                  fwd iat std         1
15                fwd iat total   

In [None]:
# Importances of the decision tree that RFE fitted on the selected features,
# paired with the names of those selected features.
names = X.columns.values[rfe.get_support()]
scores = rfe.estimator_.feature_importances_
ns_df = pd.DataFrame({'Feat_names': names, 'F_Scores': scores})

# Most important feature first; ties ordered alphabetically by name.
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'], ascending=[False, True])
print(ns_df_sorted)
ns_df_sorted.to_excel('multiRFE_DT_score.xlsx')

                     Feat_names  F_Scores
1                   source port  0.285845
8        fwd packet length mean  0.173971
7         fwd packet length min  0.172540
32          average packet size  0.144859
0                    unnamed: 0  0.111438
38         min_seg_size_forward  0.040632
9         bwd packet length min  0.019751
2              destination port  0.019667
37      init_win_bytes_backward  0.011144
10                 flow bytes/s  0.002832
22                  bwd iat max  0.002114
25                bwd packets/s  0.001482
31                down/up ratio  0.001402
34            subflow fwd bytes  0.001294
39                   active min  0.001248
35          subflow bwd packets  0.001218
12                flow iat mean  0.000751
5   total length of fwd packets  0.000735
24                fwd packets/s  0.000697
3                 flow duration  0.000624
14                 flow iat min  0.000562
15                fwd iat total  0.000559
27            max packet length  0

# **K-FOLD CROSS VALIDATION**

In [None]:
# Features kept by the random-forest feature selection (per the original
# author's note), followed by the target column 'label'.
colums_to_keep = ['flow packets/s', 'flow iat mean', 'flow bytes/s', 'fwd packet length min', 'init_win_bytes_backward', 'fwd iat std', 'flow duration',
 'inbound', 'average packet size', 'destination port', 'fwd packet length mean', 'min_seg_size_forward', 'fwd iat max',
 'packet length std', 'bwd iat max', 'bwd packet length max', 'packet length variance', 'total backward packets',
 'bwd iat mean','fwd iat total', 'bwd iat total', 'total length of fwd packets', 'flow iat max', 'max packet length',
 'flow iat std', 'fwd packets/s', 'fwd packet length max', 'bwd packets/s', 'bwd header length', 'packet length mean',
 'flow iat min', 'urg flag count', 'fwd iat mean', 'min packet length', 'source port', 'avg fwd segment size', 'subflow fwd packets',
 'init_win_bytes_forward', 'subflow bwd packets', 'subflow fwd bytes', 'label']

# Select by label instead of reindex(): a misspelled or missing column name
# now raises KeyError instead of silently producing an all-NaN column.
# .copy() makes df_filtered independent of df before it is modified below.
df_filtered = df.loc[:, colums_to_keep].copy()

df_filtered['label'] = df_filtered['label'].astype(int)
print(df_filtered.shape)
print(df_filtered.dtypes)

df_filtered.head()

(59304, 41)
flow packets/s                 float64
flow iat mean                  float64
flow bytes/s                   float64
fwd packet length min          float64
init_win_bytes_backward        float64
fwd iat std                    float64
flow duration                  float64
inbound                        float64
average packet size            float64
destination port               float64
fwd packet length mean         float64
min_seg_size_forward           float64
fwd iat max                    float64
packet length std              float64
bwd iat max                    float64
bwd packet length max          float64
packet length variance         float64
total backward packets         float64
bwd iat mean                   float64
fwd iat total                  float64
bwd iat total                  float64
total length of fwd packets    float64
flow iat max                   float64
max packet length              float64
flow iat std                   float64
fwd packets/s

Unnamed: 0,flow packets/s,flow iat mean,flow bytes/s,fwd packet length min,init_win_bytes_backward,fwd iat std,flow duration,inbound,average packet size,destination port,fwd packet length mean,min_seg_size_forward,fwd iat max,packet length std,bwd iat max,bwd packet length max,packet length variance,total backward packets,bwd iat mean,fwd iat total,bwd iat total,total length of fwd packets,flow iat max,max packet length,flow iat std,fwd packets/s,fwd packet length max,bwd packets/s,bwd header length,packet length mean,flow iat min,urg flag count,fwd iat mean,min packet length,source port,avg fwd segment size,subflow fwd packets,init_win_bytes_forward,subflow bwd packets,subflow fwd bytes,label
0,3413.689952,295.989583,1502024.0,440.0,-1.0,500.959301,28415.0,1.0,444.536082,60495.0,440.0,-1.0,3596.0,0.0,0.0,0.0,0.0,0.0,0.0,28415.0,0.0,42680.0,3596.0,440.0,500.959301,3413.689952,440.0,0.0,0.0,440.0,1.0,0.0,295.989583,440.0,634.0,440.0,97.0,-1.0,0.0,42680.0,1
1,1000000.0,2.0,440000000.0,440.0,-1.0,0.0,2.0,0.0,660.0,60495.0,440.0,-1.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,880.0,2.0,440.0,0.0,1000000.0,440.0,0.0,0.0,440.0,2.0,0.0,2.0,440.0,634.0,440.0,2.0,-1.0,0.0,880.0,1
2,4119.549321,243.964824,1812602.0,440.0,-1.0,578.101371,48549.0,1.0,442.2,46391.0,440.0,-1.0,5418.0,0.0,0.0,0.0,0.0,0.0,0.0,48549.0,0.0,88000.0,5418.0,440.0,578.101371,4119.549321,440.0,0.0,0.0,440.0,1.0,0.0,243.964824,440.0,634.0,440.0,200.0,-1.0,0.0,88000.0,1
3,4137.617146,242.899497,1820552.0,440.0,-1.0,485.292695,48337.0,1.0,442.2,11894.0,440.0,-1.0,3337.0,0.0,0.0,0.0,0.0,0.0,0.0,48337.0,0.0,88000.0,3337.0,440.0,485.292695,4137.617146,440.0,0.0,0.0,440.0,1.0,0.0,242.899497,440.0,634.0,440.0,200.0,-1.0,0.0,88000.0,1
4,6244.925998,160.934673,2747767.0,440.0,-1.0,196.891271,32026.0,1.0,442.2,27878.0,440.0,-1.0,1236.0,0.0,0.0,0.0,0.0,0.0,0.0,32026.0,0.0,88000.0,1236.0,440.0,196.891271,6244.925998,440.0,0.0,0.0,440.0,0.0,0.0,160.934673,440.0,634.0,440.0,200.0,-1.0,0.0,88000.0,1


In [None]:
# Feature matrix and target for the cross-validation cells, split by column
# name rather than by the hard-coded positions 0:40 / 40 so the split stays
# correct if the list of kept columns changes.
X = df_filtered.drop(columns=['label'])
y = df_filtered['label']

### Decision tree classification 

In [None]:
# 5-fold cross-validation of a decision tree on the selected features.
# Folds are shuffled with a fixed seed.
# NOTE(review): the saved unshuffled run shows per-fold accuracy between
# ~0.15 and ~0.94 together with undefined-metric warnings, which suggests the
# rows are grouped by class - confirm.
# `score` is sklearn.metrics.precision_recall_fscore_support.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
model = DecisionTreeClassifier(random_state=0)
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

for n, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # scikit-learn metrics take the true labels first.
    acc = accuracy_score(y_test, pred_values)
    # Macro average: unweighted mean of the per-class metrics.
    precision, recall, fscore, support = score(y_test, pred_values, average='macro')

    print ('Results for fold #', n)
    print ('Precision : ', precision)
    print ('Recall    : ', recall)
    print ('F-score   : ', fscore)
    print ('accuracy : ', acc, '\n')

    acc_score.append(acc)
    f1_sc.append(fscore)
    recall_sc.append(recall)
    precision_sc.append(precision)

# Mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 1
Precision :  0.21264405457201518
Recall    :  0.22958785309946736
F-score   :  0.22040390654711262
accuracy :  0.1701374251749431 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 2
Precision :  0.2417400487378371
Recall    :  0.33492748304295483
F-score   :  0.25014780267349196
accuracy :  0.24365567827333276 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 3
Precision :  0.4227524003491417
Recall    :  0.2980240270157963
F-score   :  0.3420182305355023
accuracy :  0.5432931456032375 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 4
Precision :  0.4990752644835516
Recall    :  0.441900000217158
F-score   :  0.46692979854715055
accuracy :  0.9392968552398617 

Results for fold # 5
Precision :  0.49873435088045853
Recall    :  0.46345069714462017
F-score   :  0.47976142034534713
accuracy :  0.15227655986509275 


Avg precision : 0.3749892238046008

Avg recall : 0.35357801210399936

Avg f1 score : 0.3518522317297209

Avg accuracy : 0.40973193283129355



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Random Forest classification

In [None]:
# 5-fold cross-validation of a random forest on the selected features.
# Folds are shuffled with a fixed seed.
# NOTE(review): the saved unshuffled run shows per-fold accuracy between
# ~0.16 and ~0.97 together with undefined-metric warnings, which suggests the
# rows are grouped by class - confirm.
# `score` is sklearn.metrics.precision_recall_fscore_support.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
model = RandomForestClassifier(random_state=0)
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

for n, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # scikit-learn metrics take the true labels first.
    acc = accuracy_score(y_test, pred_values)
    # Macro average: unweighted mean of the per-class metrics.
    precision, recall, fscore, support = score(y_test, pred_values, average='macro')

    print ('Results for fold #', n)
    print ('Precision : ', precision)
    print ('Recall    : ', recall)
    print ('F-score   : ', fscore)
    print ('accuracy : ', acc, '\n')

    acc_score.append(acc)
    f1_sc.append(fscore)
    recall_sc.append(recall)
    precision_sc.append(precision)

# Mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 1
Precision :  0.21816001125241907
Recall    :  0.23465045592705167
F-score   :  0.21458831850215138
accuracy :  0.17536464041817723 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 2
Precision :  0.2543006533611068
Recall    :  0.28358336391696765
F-score   :  0.24443779585133688
accuracy :  0.25402579883652304 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 3
Precision :  0.5993636884630786
Recall    :  0.514201475251137
F-score   :  0.5464212145801876
accuracy :  0.7851783154877329 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 4
Precision :  0.4998099007975496
Recall    :  0.46487368807757884
F-score   :  0.48051033117859393
accuracy :  0.9710816963156563 

Results for fold # 5
Precision :  0.49981454005934717
Recall    :  0.4819109461966605
F-score   :  0.49052321252238756
accuracy :  0.15573355817875212 


Avg precision : 0.41428975878670027

Avg recall : 0.3958439858738791

Avg f1 score : 0.39529617452693144

Avg accuracy : 0.4682768018473683



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Naive bayes classifier 

In [None]:
# 5-fold cross-validation of Gaussian naive Bayes on the selected features.
# Folds are shuffled with a fixed seed.
# NOTE(review): the saved unshuffled run shows per-fold accuracy between
# ~0.03 and ~0.55 together with undefined-metric warnings, which suggests the
# rows are grouped by class - confirm.
# `score` is sklearn.metrics.precision_recall_fscore_support.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
model = GaussianNB()
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

for n, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # scikit-learn metrics take the true labels first.
    acc = accuracy_score(y_test, pred_values)
    # Macro average: unweighted mean of the per-class metrics.
    precision, recall, fscore, support = score(y_test, pred_values, average='macro')

    print ('Results for fold #', n)
    print ('Precision : ', precision)
    print ('Recall    : ', recall)
    print ('F-score   : ', fscore)
    print ('accuracy : ', acc, '\n')

    acc_score.append(acc)
    f1_sc.append(fscore)
    recall_sc.append(recall)
    precision_sc.append(precision)

# Mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 1
Precision :  0.206415748605949
Recall    :  0.1554299935863473
F-score   :  0.13653732853499784
accuracy :  0.15841834583930528 

Results for fold # 2
Precision :  0.10818857744440058
Recall    :  0.13805625524769102
F-score   :  0.12131105208794453
accuracy :  0.5545063653992075 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 3
Precision :  0.26785714285714285
Recall    :  0.10754169688179839
F-score   :  0.12543815340343786
accuracy :  0.02689486552567237 

Results for fold # 4
Precision :  0.23386356736062042
Recall    :  0.07507552283562488
F-score   :  0.10163839292025391
accuracy :  0.11592614450720849 

Results for fold # 5
Precision :  0.25813547295453304
Recall    :  0.07879669381419302
F-score   :  0.11133691775959119
accuracy :  0.031197301854974706 


Avg precision : 0.2148921018445292

Avg recall : 0.11098003247313093

Avg f1 score : 0.11925236894124507

Avg accuracy : 0.17738860462527367



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### KNN (k-nearest neighbors)

In [None]:
# 5-fold cross-validation of a 5-nearest-neighbours classifier on the
# selected features. Folds are shuffled with a fixed seed.
# NOTE(review): the saved unshuffled run shows per-fold accuracy between
# ~0.15 and ~0.93 together with undefined-metric warnings, which suggests the
# rows are grouped by class - confirm.
# NOTE(review): the features are not scaled before the distance computation;
# consider standardising them - confirm with the author.
# `score` is sklearn.metrics.precision_recall_fscore_support.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
model = KNeighborsClassifier(n_neighbors = 5)
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

for n, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # scikit-learn metrics take the true labels first.
    acc = accuracy_score(y_test, pred_values)
    # Macro average: unweighted mean of the per-class metrics.
    precision, recall, fscore, support = score(y_test, pred_values, average='macro')

    print ('Results for fold #', n)
    print ('Precision : ', precision)
    print ('Recall    : ', recall)
    print ('F-score   : ', fscore)
    print ('accuracy : ', acc, '\n')

    acc_score.append(acc)
    f1_sc.append(fscore)
    recall_sc.append(recall)
    precision_sc.append(precision)

# Mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 1
Precision :  0.10184625485766155
Recall    :  0.2369970162572154
F-score   :  0.12805115099918815
accuracy :  0.17991737627518758 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 2
Precision :  0.22834203513178042
Recall    :  0.29225677930595456
F-score   :  0.21911840628505877
accuracy :  0.43225697664615126 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 3
Precision :  0.39319847415461545
Recall    :  0.3167881558066942
F-score   :  0.3237699320389939
accuracy :  0.6082117865272743 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 4
Precision :  0.4068367110675751
Recall    :  0.380199446799247
F-score   :  0.3925846335111088
accuracy :  0.9319618919146784 

Results for fold # 5
Precision :  0.2738035892323031
Recall    :  0.2582015654148739
F-score   :  0.26543840228331145
accuracy :  0.14839797639123103 


Avg precision : 0.28080541288878713

Avg recall : 0.2968885927167971

Avg f1 score : 0.2657925050235322

Avg accuracy : 0.4601492015509045



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Neural network

In [None]:
# 5-fold cross-validation of a multi-layer perceptron on the selected
# features. Folds are shuffled with a fixed seed, and the network's
# random_state is pinned so the run is reproducible.
# NOTE(review): the saved unshuffled run shows per-fold accuracy between
# ~0.07 and ~0.93 together with undefined-metric warnings, which suggests the
# rows are grouped by class - confirm.
# NOTE(review): the features are not scaled before training; consider
# standardising them - confirm with the author.
# `score` is sklearn.metrics.precision_recall_fscore_support.
k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=0)
model = MLPClassifier(random_state=0)
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

for n, (train_index, test_index) in enumerate(kf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # scikit-learn metrics take the true labels first.
    acc = accuracy_score(y_test, pred_values)
    # Macro average: unweighted mean of the per-class metrics.
    precision, recall, fscore, support = score(y_test, pred_values, average='macro')

    print ('Results for fold #', n)
    print ('Precision : ', precision)
    print ('Recall    : ', recall)
    print ('F-score   : ', fscore)
    print ('accuracy : ', acc, '\n')

    acc_score.append(acc)
    f1_sc.append(fscore)
    recall_sc.append(recall)
    precision_sc.append(precision)

# Mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 1
Precision :  0.12592047128129602
Recall    :  0.14007208387942333
F-score   :  0.1326198231735691
accuracy :  0.0720849844026642 



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 2
Precision :  0.12353908668730651
Recall    :  0.3166624138810921
F-score   :  0.17771208097409433
accuracy :  0.29752971924795546 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 3
Precision :  0.4037225308840929
Recall    :  0.35221877427611137
F-score   :  0.3745270730240656
accuracy :  0.9324677514543461 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 4
Precision :  0.41303094622392106
Recall    :  0.35011875743613396
F-score   :  0.3747676235903087
accuracy :  0.9066689149312874 

Results for fold # 5
Precision :  0.32411838318410774
Recall    :  0.31576847517032053
F-score   :  0.31985995205735773
accuracy :  0.15261382799325462 


Avg precision : 0.2780662836521448

Avg recall : 0.29496810092861625

Avg f1 score : 0.27589731056387906

Avg accuracy : 0.4722730396059015



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Stratified K-FOLD CROSS VALIDATION 

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
# Display names of the seven traffic classes: benign traffic plus six DrDoS
# attack types.
# NOTE(review): if this list is passed as classification_report(target_names=...),
# it presumably has to follow the sorted order of the labels in y — confirm.
target_N = [
    'BENIGN',
    'DrDoS_DNS',
    'DrDoS_LDAP',
    'DrDoS_MSSQL',
    'DrDoS_NetBIOS',
    'DrDoS_NTP',
    'DrDoS_SNMP',
]



### Decision tree classification

In [None]:
# Stratified k-fold evaluation of a decision tree.
# X and y come from earlier cells and are indexed positionally via .iloc;
# only the first 40 columns of X are used as features.
k = 5
# shuffle stays at its default (False), so the folds are already deterministic
# and identical to the ones used by the other classifiers in this section.
# random_state must therefore NOT be passed: scikit-learn >= 0.24 raises a
# ValueError for random_state without shuffle=True (it never had any effect).
skf = StratifiedKFold(n_splits=k)
# Seed the estimator instead, so repeated runs report the same scores.
model = DecisionTreeClassifier(random_state=0)

# Per-fold metric histories, appended in fold order.
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

n = 1  # 1-based fold counter, used only in the printed header
for train_index, test_index in skf.split(X, y):
  X_train, X_test = X.iloc[train_index, 0:40], X.iloc[test_index, 0:40]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model.fit(X_train, y_train)
  pred_values = model.predict(X_test)

  # scikit-learn metrics take (y_true, y_pred), in that order.
  acc = accuracy_score(y_test, pred_values)
  # Uncomment for a per-class breakdown of this fold:
  #print(classification_report(y_test, pred_values, digits=3, target_names= target_N))
  # Macro average: every class contributes equally, regardless of its support.
  precision, recall, fscore, support = score(y_test, pred_values, average='macro')

  print ('Results for fold #', n)
  print ('Precision : ', precision)
  print ('Recall    : ', recall)
  print ('F-score   : ', fscore)
  print ('accuracy : ', acc, '\n')
  n = n + 1

  acc_score.append(acc)
  f1_sc.append(fscore)
  recall_sc.append(recall)
  precision_sc.append(precision)


# Arithmetic mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))



Results for fold # 1
Precision :  0.8622890945155115
Recall    :  0.841194701021796
F-score   :  0.8162461259697851
accuracy :  0.8307056740578366 

Results for fold # 2
Precision :  0.9136631216442196
Recall    :  0.9008305926080117
F-score   :  0.9062433604359658
accuracy :  0.9061630553916196 

Results for fold # 3
Precision :  0.8712255046667844
Recall    :  0.8608435455627751
F-score   :  0.8600355288835824
accuracy :  0.8500126464884917 

Results for fold # 4
Precision :  0.8820139967376116
Recall    :  0.8833786267492396
F-score   :  0.8816568424423528
accuracy :  0.8813759379478965 

Results for fold # 5
Precision :  0.8901366701999492
Recall    :  0.88381243642677
F-score   :  0.8863819959579297
accuracy :  0.8844856661045531 


Avg precision : 0.8838656775528152

Avg recall : 0.8740119804737185

Avg f1 score : 0.870112770737923

Avg accuracy : 0.8705485959980794



### Random Forest classification

In [None]:
# Stratified k-fold evaluation of a random forest.
# X and y come from earlier cells and are indexed positionally via .iloc;
# only the first 40 columns of X are used as features.
k = 5
# No shuffling: the folds are deterministic, so no random_state is needed here.
skf = StratifiedKFold(n_splits=k)
# The forest is stochastic; a fixed seed makes the reported scores reproducible
# from one run to the next.
model = RandomForestClassifier(random_state=0)

# Per-fold metric histories, appended in fold order.
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

n = 1  # 1-based fold counter, used only in the printed header
for train_index, test_index in skf.split(X, y):
  X_train, X_test = X.iloc[train_index, 0:40], X.iloc[test_index, 0:40]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model.fit(X_train, y_train)
  pred_values = model.predict(X_test)

  # scikit-learn metrics take (y_true, y_pred), in that order.
  acc = accuracy_score(y_test, pred_values)
  # Uncomment for a per-class breakdown of this fold:
  #print(classification_report(y_test, pred_values, digits=3, target_names= target_N))
  # Macro average: every class contributes equally, regardless of its support.
  precision, recall, fscore, support = score(y_test, pred_values, average='macro')

  print ('Results for fold #', n)
  print ('Precision : ', precision)
  print ('Recall    : ', recall)
  print ('F-score   : ', fscore)
  print ('accuracy : ', acc, '\n')
  n = n + 1

  acc_score.append(acc)
  f1_sc.append(fscore)
  recall_sc.append(recall)
  precision_sc.append(precision)


# Arithmetic mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))

Results for fold # 1
Precision :  0.905545508401116
Recall    :  0.8832301713410361
F-score   :  0.861729732086407
accuracy :  0.8694882387657027 

Results for fold # 2
Precision :  0.926505144611727
Recall    :  0.9230307901861053
F-score   :  0.9247072140863787
accuracy :  0.9224348705842678 

Results for fold # 3
Precision :  0.88605888778984
Recall    :  0.8814978064840434
F-score   :  0.8778543662288856
accuracy :  0.8671275609139195 

Results for fold # 4
Precision :  0.9007187161007495
Recall    :  0.9111207532888121
F-score   :  0.9039296247324359
accuracy :  0.9014416996880533 

Results for fold # 5
Precision :  0.8981415626208709
Recall    :  0.902000998303313
F-score   :  0.8991137207340861
accuracy :  0.8947723440134907 


Avg precision : 0.9033939639048606

Avg recall : 0.900176103920662

Avg f1 score : 0.8934669315736385

Avg accuracy : 0.8910529427930868



### Naive Bayes classifier

In [None]:
# Gaussian Naive Bayes scored with stratified k-fold cross-validation.
# Reads X and y from earlier cells; rows are picked positionally with .iloc
# and only the first 40 columns of X are used.
k = 5
model = GaussianNB()
skf = StratifiedKFold(n_splits=k, random_state=None)

# One list per metric, filled in fold order.
acc_score, f1_sc, precision_sc, recall_sc = [], [], [], []

n = 1
for train_index, test_index in skf.split(X, y):
  #print("TRAIN:", train_index, "TEST:", test_index)
  X_train = X.iloc[train_index, 0:40]
  X_test = X.iloc[test_index, 0:40]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model.fit(X_train, y_train)
  pred_values = model.predict(X_test)

  # Plain accuracy, then macro-averaged precision / recall / F-score.
  acc = accuracy_score(pred_values, y_test)
  #print(classification_report(y_test, pred_values, digits=3, target_names= target_N))
  precision, recall, fscore, support = score(y_test, pred_values, average='macro')

  print('Results for fold #', n)
  print('Precision : ', precision)
  print('Recall    : ', recall)
  print('F-score   : ', fscore)
  print('accuracy : ', acc, '\n')
  n += 1

  # Record this fold's metrics.
  precision_sc.append(precision)
  recall_sc.append(recall)
  f1_sc.append(fscore)
  acc_score.append(acc)

# Mean of each metric across the k folds.
avg_precision_sc = sum(precision_sc) / k
avg_recall_sc = sum(recall_sc) / k
avg_f1_sc = sum(f1_sc) / k
avg_acc_score = sum(acc_score) / k

print('\nAvg precision : {}\n'.format(avg_precision_sc))
print('Avg recall : {}\n'.format(avg_recall_sc))
print('Avg f1 score : {}\n'.format(avg_f1_sc))
print('Avg accuracy : {}\n'.format(avg_acc_score))

Results for fold # 1
Precision :  0.40205503262329945
Recall    :  0.3469166889696577
F-score   :  0.24476114544307107
accuracy :  0.35983475255037517 

Results for fold # 2
Precision :  0.45478535321606234
Recall    :  0.34541103111408666
F-score   :  0.24479332687009595
accuracy :  0.3570525250822022 

Results for fold # 3
Precision :  0.36115854711159023
Recall    :  0.34315413604711625
F-score   :  0.23536614641836437
accuracy :  0.3562094258494225 

Results for fold # 4
Precision :  0.32672790862021944
Recall    :  0.3415927018594595
F-score   :  0.23437250821841996
accuracy :  0.3542702976140292 

Results for fold # 5
Precision :  0.31827717068980954
Recall    :  0.34186457869654713
F-score   :  0.23389301948378627
accuracy :  0.3526981450252951 


Avg precision : 0.37260080245219623

Avg recall : 0.34378782733737345

Avg f1 score : 0.2386372292867475

Avg accuracy : 0.3560130292242648



### KNN (k-nearest neighbors)

In [None]:
# k-nearest-neighbours (5 neighbours) scored with stratified k-fold
# cross-validation. Note that `k` below is the number of folds, not the
# number of neighbours.
# Reads X and y from earlier cells; rows are picked positionally with .iloc
# and only the first 40 columns of X are used.
k = 5
model = KNeighborsClassifier(n_neighbors=5)
skf = StratifiedKFold(n_splits=k, random_state=None)

# One list per metric, filled in fold order.
acc_score, f1_sc, precision_sc, recall_sc = [], [], [], []

n = 1
for train_index, test_index in skf.split(X, y):
  #print("TRAIN:", train_index, "TEST:", test_index)
  X_train = X.iloc[train_index, 0:40]
  X_test = X.iloc[test_index, 0:40]
  y_train = y.iloc[train_index]
  y_test = y.iloc[test_index]

  model.fit(X_train, y_train)
  pred_values = model.predict(X_test)

  # Plain accuracy, then macro-averaged precision / recall / F-score.
  acc = accuracy_score(pred_values, y_test)
  #print(classification_report(y_test, pred_values, digits=3, target_names= target_N))
  precision, recall, fscore, support = score(y_test, pred_values, average='macro')

  print('Results for fold #', n)
  print('Precision : ', precision)
  print('Recall    : ', recall)
  print('F-score   : ', fscore)
  print('accuracy : ', acc, '\n')
  n += 1

  # Record this fold's metrics.
  precision_sc.append(precision)
  recall_sc.append(recall)
  f1_sc.append(fscore)
  acc_score.append(acc)

# Mean of each metric across the k folds.
avg_precision_sc = sum(precision_sc) / k
avg_recall_sc = sum(recall_sc) / k
avg_f1_sc = sum(f1_sc) / k
avg_acc_score = sum(acc_score) / k

print('\nAvg precision : {}\n'.format(avg_precision_sc))
print('Avg recall : {}\n'.format(avg_recall_sc))
print('Avg f1 score : {}\n'.format(avg_f1_sc))
print('Avg accuracy : {}\n'.format(avg_acc_score))

Results for fold # 1
Precision :  0.8450808372984209
Recall    :  0.8538414940860788
F-score   :  0.8465080412115397
accuracy :  0.8457128404013152 

Results for fold # 2
Precision :  0.8579891389570155
Recall    :  0.8540745388471446
F-score   :  0.8543047584410777
accuracy :  0.8535536632661664 

Results for fold # 3
Precision :  0.8608343235887956
Recall    :  0.8611033252910522
F-score   :  0.8581575659360073
accuracy :  0.8572632998903971 

Results for fold # 4
Precision :  0.8382806726888962
Recall    :  0.845449169776354
F-score   :  0.8406150576361323
accuracy :  0.8343310007587893 

Results for fold # 5
Precision :  0.8292776278727433
Recall    :  0.8315363957454778
F-score   :  0.8277341537968762
accuracy :  0.8264755480607082 


Avg precision : 0.8462925200811743

Avg recall : 0.8492009847492215

Avg f1 score : 0.8454639154043267

Avg accuracy : 0.8434672704754753



### Neural network

In [None]:
# Stratified k-fold evaluation of a multi-layer perceptron.
# X and y come from earlier cells and are indexed positionally via .iloc;
# only the first 40 columns of X are used as features.
k = 5
# No shuffling: the folds are deterministic, so no random_state is needed here.
skf = StratifiedKFold(n_splits=k)
# Weight initialisation and mini-batch order are random; a fixed seed makes
# the reported scores reproducible from one run to the next.
model = MLPClassifier(random_state=0)

# Per-fold metric histories, appended in fold order.
acc_score = []
f1_sc = []
precision_sc = []
recall_sc = []

n = 1  # 1-based fold counter, used only in the printed header
for train_index, test_index in skf.split(X, y):
  X_train, X_test = X.iloc[train_index, 0:40], X.iloc[test_index, 0:40]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model.fit(X_train, y_train)
  pred_values = model.predict(X_test)

  # scikit-learn metrics take (y_true, y_pred), in that order.
  acc = accuracy_score(y_test, pred_values)
  # Uncomment for a per-class breakdown of this fold:
  #print(classification_report(y_test, pred_values, digits=3, target_names= target_N))
  # Macro average: every class contributes equally, regardless of its support.
  # In some folds the network never predicts a class, leaving that class's
  # precision undefined. zero_division=0 scores it as 0 — the same value the
  # default substitutes — without emitting an UndefinedMetricWarning per fold.
  precision, recall, fscore, support = score(
      y_test, pred_values, average='macro', zero_division=0)

  print ('Results for fold #', n)
  print ('Precision : ', precision)
  print ('Recall    : ', recall)
  print ('F-score   : ', fscore)
  print ('accuracy : ', acc, '\n')
  n = n + 1

  acc_score.append(acc)
  f1_sc.append(fscore)
  recall_sc.append(recall)
  precision_sc.append(precision)


# Arithmetic mean of each metric over the k folds.
avg_acc_score = sum(acc_score)/k
avg_f1_sc = sum(f1_sc)/k
avg_recall_sc = sum(recall_sc)/k
avg_precision_sc = sum(precision_sc)/k

print('\nAvg precision : {}\n'.format(avg_precision_sc))

print('Avg recall : {}\n'.format(avg_recall_sc))

print('Avg f1 score : {}\n'.format(avg_f1_sc))

print('Avg accuracy : {}\n'.format(avg_acc_score))



Results for fold # 1
Precision :  0.7439627246726246
Recall    :  0.7893208455311893
F-score   :  0.743588020276513
accuracy :  0.7718573476098137 





Results for fold # 2
Precision :  0.7492229416104443
Recall    :  0.7845740506434694
F-score   :  0.7515022713324007
accuracy :  0.7885507124188517 



  _warn_prf(average, modifier, msg_start, len(result))


Results for fold # 3
Precision :  0.672000725184878
Recall    :  0.6691925434328532
F-score   :  0.5942137477087096
accuracy :  0.6385633589073434 





Results for fold # 4
Precision :  0.8375992962497304
Recall    :  0.7816210622863892
F-score   :  0.7462527691135085
accuracy :  0.7856841750274007 

Results for fold # 5
Precision :  0.7513702321587371
Recall    :  0.7601504280240585
F-score   :  0.730519516758136
accuracy :  0.756239460370995 


Avg precision : 0.7508311839752829

Avg recall : 0.7569717859835918

Avg f1 score : 0.7132152650378536

Avg accuracy : 0.7481790108668809



