In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore, ttest_ind, mannwhitneyu
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
from sklearn.exceptions import DataConversionWarning,ConvergenceWarning

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [3]:
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from scipy.cluster.hierarchy import linkage, cophenet, dendrogram
from scipy.spatial.distance import pdist

In [4]:
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
from sklearn.feature_selection import RFE
# https://machinelearningmastery.com/smote-oversampling-for-imbalanced-classification/
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from numpy import where
from sklearn.utils import resample
import eli5
from eli5.sklearn import PermutationImportance



In [5]:
train = pd.read_csv(r'C:\Users\anura\Desktop\Capstone Project\Fraud Detection\Training_Test\UNSW_NB15_training-set.csv')
train.head(2)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0


In [6]:
dfc = train.copy()

In [7]:
test = pd.read_csv(r'C:\Users\anura\Desktop\Capstone Project\Fraud Detection\Training_Test\UNSW_NB15_testing-set.csv')
test.head(2)

Unnamed: 0,id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,1.1e-05,udp,-,INT,2,0,496,0,90909.0902,...,1,2,0,0,0,1,2,0,Normal,0
1,2,8e-06,udp,-,INT,2,0,1762,0,125000.0003,...,1,2,0,0,0,1,2,0,Normal,0


In [8]:
train.shape, test.shape

((175341, 45), (82332, 45))

In [9]:
train.drop('id', axis = 1, inplace = True)
test.drop('id', axis = 1, inplace = True)
dfc.drop('id', axis = 1, inplace = True)

In [10]:
df_cat = train.select_dtypes(include = 'object')
df_num = train.select_dtypes(exclude = 'object')

In [11]:
for i in df_cat.columns:
    print(i, df_cat[i].nunique())

proto 133
service 13
state 9
attack_cat 10


In [12]:
warnings.filterwarnings('ignore')

In [13]:
for i in ['proto', 'service', 'state']:
    dic = dict(df_cat.groupby(i)['attack_cat'].count()/df_cat.shape[0])
    df_cat[i] = df_cat[i].map(dic)
df_cat

Unnamed: 0,proto,service,state,attack_cat
0,0.455946,0.537056,0.443849,Normal
1,0.455946,0.537056,0.443849,Normal
2,0.455946,0.537056,0.443849,Normal
3,0.455946,0.019550,0.443849,Normal
4,0.455946,0.537056,0.443849,Normal
...,...,...,...,...
175336,0.360914,0.269726,0.469229,Generic
175337,0.455946,0.537056,0.443849,Shellcode
175338,0.360914,0.269726,0.469229,Generic
175339,0.360914,0.269726,0.469229,Generic


In [14]:
df_cat['attack_cat'].value_counts()

Normal            56000
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: attack_cat, dtype: int64

In [15]:
atk_lab = {'Normal' : 0, 'Generic' : 1, 'Exploits' : 2, 'Fuzzers' : 3, 'DoS' : 4, 'Reconnaissance' : 5,
          'Analysis' : 6, 'Backdoor' : 7, 'Shellcode' : 8, 'Worms' : 9}
df_cat['attack_label'] = df_cat['attack_cat'].map(atk_lab)

In [16]:
for i in ['proto', 'service', 'state']:
    dic = dict(test.groupby(i)['attack_cat'].count()/test.shape[0])
    test[i] = test[i].map(dic)
test

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,0.000011,0.357309,0.572718,0.414942,2,0,496,0,90909.090200,254,...,1,2,0,0,0,1,2,0,Normal,0
1,0.000008,0.357309,0.572718,0.414942,2,0,1762,0,125000.000300,254,...,1,2,0,0,0,1,2,0,Normal,0
2,0.000005,0.357309,0.572718,0.414942,2,0,1068,0,200000.005100,254,...,1,3,0,0,0,1,3,0,Normal,0
3,0.000006,0.357309,0.572718,0.414942,2,0,900,0,166666.660800,254,...,1,3,0,0,0,2,3,0,Normal,0
4,0.000010,0.357309,0.572718,0.414942,2,0,2126,0,100000.002500,254,...,1,3,0,0,0,2,3,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82327,0.000005,0.357309,0.572718,0.414942,2,0,104,0,200000.005100,254,...,1,2,0,0,0,2,1,0,Normal,0
82328,1.106101,0.523430,0.572718,0.477809,20,8,18062,354,24.410067,254,...,1,1,0,0,0,3,2,0,Normal,0
82329,0.000000,0.011988,0.572718,0.414942,1,0,46,0,0.000000,0,...,1,1,0,0,0,1,1,1,Normal,0
82330,0.000000,0.011988,0.572718,0.414942,1,0,46,0,0.000000,0,...,1,1,0,0,0,1,1,1,Normal,0


In [17]:
test['attack_label'] = test['attack_cat'].map(atk_lab)
test.head(2)

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label,attack_label
0,1.1e-05,0.357309,0.572718,0.414942,2,0,496,0,90909.0902,254,...,2,0,0,0,1,2,0,Normal,0,0
1,8e-06,0.357309,0.572718,0.414942,2,0,1762,0,125000.0003,254,...,2,0,0,0,1,2,0,Normal,0,0


In [18]:
df = pd.concat([df_num, df_cat], axis = 1)
df.head(2)

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label,proto,service,state,attack_cat,attack_label
0,0.121478,6,4,258,172,74.08749,252,254,14158.94238,8495.365234,...,0,1,1,0,0,0.455946,0.537056,0.443849,Normal,0
1,0.649902,14,38,734,42014,78.473372,62,252,8395.112305,503571.3125,...,0,1,6,0,0,0.455946,0.537056,0.443849,Normal,0


## -----------------------------------------------------------------------------

In [19]:
print(atk_lab)

{'Normal': 0, 'Generic': 1, 'Exploits': 2, 'Fuzzers': 3, 'DoS': 4, 'Reconnaissance': 5, 'Analysis': 6, 'Backdoor': 7, 'Shellcode': 8, 'Worms': 9}


In [20]:
df['attack_cat'].value_counts()

Normal            56000
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: attack_cat, dtype: int64

In [21]:
df['attack_label'] = df['attack_label'].replace({4 : 3, 8 : 2, 9 : 2, 7 : 5, 6 : 5})
test['attack_label'] = test['attack_label'].replace({4 : 3, 8 : 2, 9 : 2, 7 : 5, 6 : 5})

In [22]:
df['attack_label'] = df['attack_label'].replace({5:4})
test['attack_label'] = test['attack_label'].replace({5:4})

In [23]:
df['attack_label'].value_counts()

0    56000
1    40000
2    34656
3    30448
4    14237
Name: attack_label, dtype: int64

In [24]:
Xtrain = df.drop(['attack_cat', 'attack_label', 'label'], axis = 1)
Ytrain = df['attack_label']
Xtest = test.drop(['attack_cat', 'attack_label', 'label'], axis = 1)
Ytest = test['attack_label']

In [25]:
Xtr_cols = Xtrain.columns
Xtest = Xtest.loc[:, Xtr_cols]

In [26]:
Xtrain.shape, Xtest.shape

((175341, 42), (82332, 42))

## -------------------------------------------------------------------------------------------

In [27]:
xgb = XGBClassifier(random_state = 0)
xgb.fit(Xtrain, Ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
ypred_train = xgb.predict(Xtrain)
ypred_test = xgb.predict(Xtest)

In [29]:
print('Train')
print(classification_report(Ytrain, ypred_train))
print('_'*70)
print('Test')
print(classification_report(Ytest, ypred_test))

Train
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     56000
           1       1.00      0.99      0.99     40000
           2       0.68      0.90      0.77     34656
           3       0.72      0.60      0.66     30448
           4       0.95      0.63      0.76     14237

    accuracy                           0.86    175341
   macro avg       0.86      0.81      0.83    175341
weighted avg       0.87      0.86      0.86    175341

______________________________________________________________________
Test
              precision    recall  f1-score   support

           0       0.97      0.76      0.85     37000
           1       1.00      0.97      0.98     18871
           2       0.62      0.83      0.71     11554
           3       0.35      0.46      0.40     10151
           4       0.53      0.67      0.60      4756

    accuracy                           0.78     82332
   macro avg       0.69      0.74      0.71     8

In [30]:
pd.DataFrame({'Features' : Xtrain.columns,
              'importance' : xgb.feature_importances_}).sort_values(by = 'importance')

Unnamed: 0,Features,importance
34,ct_ftp_cmd,0.0
19,dwin,0.0
17,stcpb,0.000536
18,dtcpb,0.000559
22,ackdat,0.000619
29,ct_dst_ltm,0.000829
36,ct_src_ltm,0.000844
13,dinpkt,0.000917
12,sinpkt,0.000941
33,is_ftp_login,0.000955


In [31]:
over = SMOTE(sampling_strategy = {4 : 18000})
X, y = over.fit_resample(Xtrain, Ytrain)
y.value_counts()

0    56000
1    40000
2    34656
3    30448
4    18000
Name: attack_label, dtype: int64

In [32]:
under = RandomUnderSampler(sampling_strategy = {0 : 50000, 1 : 35000, 2 : 30000, 3 : 26000})
X, y = under.fit_resample(X, y)
y.value_counts()

0    50000
1    35000
2    30000
3    26000
4    18000
Name: attack_label, dtype: int64

In [33]:
df_smpl = pd.concat([X,y], axis = 1)

In [34]:
df_smpl = resample(df_smpl, n_samples = 50000)

In [35]:
Xs = df_smpl.drop('attack_label', axis = 1)
Ys = df_smpl['attack_label']

In [36]:
xgb = XGBClassifier(random_state = 0)
xgb.fit(Xs, Ys)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [37]:
ypred_train = xgb.predict(Xs)
ypred_test = xgb.predict(Xtest)

In [38]:
print('Train')
print(classification_report(Ys, ypred_train))
print('_'*70)
print('Test')
print(classification_report(Ytest, ypred_test))

Train
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     15792
           1       1.00      0.99      0.99     11099
           2       0.72      0.90      0.80      9432
           3       0.76      0.71      0.73      8200
           4       0.96      0.69      0.80      5477

    accuracy                           0.89     50000
   macro avg       0.88      0.85      0.86     50000
weighted avg       0.90      0.89      0.89     50000

______________________________________________________________________
Test
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     37000
           1       1.00      0.97      0.98     18871
           2       0.68      0.73      0.71     11554
           3       0.35      0.48      0.40     10151
           4       0.44      0.75      0.55      4756

    accuracy                           0.77     82332
   macro avg       0.68      0.74      0.70     8

In [39]:
pd.DataFrame({'Features' : Xs.columns,
              'importance' : xgb.feature_importances_}).sort_values(by = 'importance')

Unnamed: 0,Features,importance
34,ct_ftp_cmd,0.0
19,dwin,0.0
33,is_ftp_login,0.000308
29,ct_dst_ltm,0.000999
18,dtcpb,0.001095
17,stcpb,0.001171
36,ct_src_ltm,0.00122
0,dur,0.001283
22,ackdat,0.001367
12,sinpkt,0.001416


In [51]:
def modl(modl_nm, X_tr, y_tr, X_ts, y_ts):
    modl_nm.fit(X_tr, y_tr)
    print('Train model')
    ypred_train = modl_nm.predict(X_tr)
    ypred_test = modl_nm.predict(X_ts)
    print(classification_report(y_tr, ypred_train))
    print('_'*70)
    print('Test model')
    print(classification_report(y_ts, ypred_test))
    print('_'*70)
    display(pd.DataFrame({'Features' : X_tr.columns,
              'importance' : modl_nm.feature_importances_}).sort_values(by = 'importance').head(10))
    display(pd.DataFrame({'Features' : X_tr.columns,
              'importance' : modl_nm.feature_importances_}).sort_values(by = 'importance').tail(4))

In [52]:
inp = Xs.drop(['ct_ftp_cmd', 'dwin'], axis = 1)
inpt = Xtest.drop(['ct_ftp_cmd', 'dwin'], axis = 1)

In [54]:
xgb = XGBClassifier(random_state = 0, n_jobs = -1)
modl(xgb, inp, Ys, inpt, Ytest)

Train model
              precision    recall  f1-score   support

           0       0.98      0.97      0.98     15594
           1       1.00      0.99      1.00     11232
           2       0.70      0.92      0.80      9439
           3       0.79      0.68      0.73      8137
           4       0.96      0.71      0.82      5598

    accuracy                           0.89     50000
   macro avg       0.89      0.85      0.86     50000
weighted avg       0.90      0.89      0.89     50000

______________________________________________________________________
Test model
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     37000
           1       1.00      0.97      0.98     18871
           2       0.66      0.75      0.70     11554
           3       0.34      0.45      0.39     10151
           4       0.44      0.76      0.56      4756

    accuracy                           0.77     82332
   macro avg       0.68      0.74    

Unnamed: 0,Features,importance
16,swin,0.0
28,ct_dst_ltm,0.001096
17,stcpb,0.001173
18,dtcpb,0.001315
34,ct_src_ltm,0.001342
21,ackdat,0.001415
32,is_ftp_login,0.001425
12,sinpkt,0.001481
20,synack,0.001694
15,djit,0.001738


Unnamed: 0,Features,importance
36,is_sm_ips_ports,0.105836
7,dttl,0.117358
6,sttl,0.238178
30,ct_dst_sport_ltm,0.310003


In [55]:
inp = inp.drop(['swin', 'ct_dst_ltm', 'stcpb', 'dtcpb', 'is_ftp_login', 'ct_src_ltm',
               'sinpkt', 'ackdat'], axis = 1)
inpt = inpt.drop(['swin', 'ct_dst_ltm', 'stcpb', 'dtcpb', 'is_ftp_login', 'ct_src_ltm',
                 'sinpkt', 'ackdat'], axis = 1)

In [56]:
xgb = XGBClassifier(random_state = 0, n_jobs = -1)
modl(xgb, inp, Ys, inpt, Ytest)

Train model
              precision    recall  f1-score   support

           0       0.98      0.96      0.97     15594
           1       1.00      0.99      1.00     11232
           2       0.69      0.93      0.79      9439
           3       0.79      0.65      0.71      8137
           4       0.95      0.70      0.81      5598

    accuracy                           0.88     50000
   macro avg       0.88      0.85      0.86     50000
weighted avg       0.90      0.88      0.88     50000

______________________________________________________________________
Test model
              precision    recall  f1-score   support

           0       0.96      0.76      0.85     37000
           1       1.00      0.97      0.98     18871
           2       0.65      0.77      0.70     11554
           3       0.34      0.44      0.38     10151
           4       0.45      0.76      0.56      4756

    accuracy                           0.77     82332
   macro avg       0.68      0.74    

Unnamed: 0,Features,importance
23,ct_src_dport_ltm,0.001532
16,synack,0.001673
14,djit,0.001679
15,tcprtt,0.001724
0,dur,0.001837
8,sload,0.001843
12,dinpkt,0.002068
9,dload,0.002213
1,spkts,0.002311
13,sjit,0.002439


Unnamed: 0,Features,importance
28,is_sm_ips_ports,0.093243
7,dttl,0.131776
6,sttl,0.265289
24,ct_dst_sport_ltm,0.283554


In [57]:
inp = inp.drop(['ct_src_dport_ltm', 'synack', 'djit', 'tcprtt', 'dur', 'dinpkt',
               'sload'], axis = 1)
inpt = inpt.drop(['ct_src_dport_ltm', 'synack', 'djit', 'tcprtt', 'dur', 'dinpkt',
                 'sload'], axis = 1)

In [58]:
xgb = XGBClassifier(random_state = 0, n_jobs = -1)
modl(xgb, inp, Ys, inpt, Ytest)

Train model
              precision    recall  f1-score   support

           0       0.97      0.95      0.96     15594
           1       1.00      0.99      1.00     11232
           2       0.68      0.93      0.79      9439
           3       0.77      0.63      0.69      8137
           4       0.94      0.69      0.80      5598

    accuracy                           0.88     50000
   macro avg       0.87      0.84      0.85     50000
weighted avg       0.89      0.88      0.87     50000

______________________________________________________________________
Test model
              precision    recall  f1-score   support

           0       0.97      0.77      0.86     37000
           1       1.00      0.97      0.98     18871
           2       0.66      0.77      0.71     11554
           3       0.36      0.47      0.41     10151
           4       0.45      0.75      0.57      4756

    accuracy                           0.77     82332
   macro avg       0.69      0.74    

Unnamed: 0,Features,importance
4,rate,0.002205
10,sjit,0.002352
15,ct_srv_src,0.002494
14,response_body_len,0.002696
7,dload,0.002697
13,trans_depth,0.003088
0,spkts,0.003764
9,dloss,0.004712
1,dpkts,0.004744
24,state,0.006216


Unnamed: 0,Features,importance
21,is_sm_ips_ports,0.113706
17,ct_dst_sport_ltm,0.187643
6,dttl,0.190956
5,sttl,0.283428


In [59]:
inp.shape

(50000, 25)

In [60]:
inp = inp.drop(['rate', 'sjit', 'ct_srv_src', 'response_body_len',
               'dload', 'trans_depth', 'spkts'], axis = 1)
inpt = inpt.drop(['rate', 'sjit', 'ct_srv_src', 'response_body_len',
                 'dload', 'trans_depth', 'spkts'], axis = 1)

In [61]:
xgb = XGBClassifier(random_state = 0, n_jobs = -1)
modl(xgb, inp, Ys, inpt, Ytest)

Train model
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     15594
           1       1.00      0.99      0.99     11232
           2       0.67      0.91      0.77      9439
           3       0.73      0.60      0.66      8137
           4       0.92      0.68      0.78      5598

    accuracy                           0.86     50000
   macro avg       0.86      0.82      0.83     50000
weighted avg       0.87      0.86      0.86     50000

______________________________________________________________________
Test model
              precision    recall  f1-score   support

           0       0.97      0.77      0.86     37000
           1       1.00      0.97      0.98     18871
           2       0.64      0.77      0.70     11554
           3       0.36      0.44      0.39     10151
           4       0.44      0.78      0.56      4756

    accuracy                           0.77     82332
   macro avg       0.68      0.74    

Unnamed: 0,Features,importance
0,dpkts,0.004292
6,dloss,0.004665
17,state,0.005789
12,ct_flw_http_mthd,0.006093
11,ct_dst_src_ltm,0.006332
5,sloss,0.008994
2,dbytes,0.009035
7,smean,0.010513
8,dmean,0.012077
1,sbytes,0.012217


Unnamed: 0,Features,importance
4,dttl,0.130993
14,is_sm_ips_ports,0.171212
10,ct_dst_sport_ltm,0.193692
3,sttl,0.305247


In [66]:
perm = PermutationImportance(xgb, random_state = 0, scoring = 'f1_weighted', n_iter = 10).fit(inp, Ys)
eli5.show_weights(perm, feature_names = inp.columns.tolist())

Weight,Feature
0.2258  ± 0.0022,sttl
0.2123  ± 0.0026,sbytes
0.0555  ± 0.0016,smean
0.0404  ± 0.0015,dbytes
0.0352  ± 0.0009,service
0.0331  ± 0.0021,ct_srv_dst
0.0313  ± 0.0013,proto
0.0294  ± 0.0019,ct_dst_src_ltm
0.0232  ± 0.0016,ct_dst_sport_ltm
0.0175  ± 0.0020,dmean


In [67]:
perm = PermutationImportance(xgb, random_state = 0, scoring = 'f1_weighted', n_iter = 10).fit(inpt, Ytest)
eli5.show_weights(perm, feature_names = inp.columns.tolist())

Weight,Feature
0.1919  ± 0.0031,sttl
0.1893  ± 0.0019,sbytes
0.0569  ± 0.0009,service
0.0360  ± 0.0015,ct_dst_src_ltm
0.0316  ± 0.0011,smean
0.0314  ± 0.0013,dbytes
0.0222  ± 0.0011,ct_dst_sport_ltm
0.0177  ± 0.0013,ct_srv_dst
0.0123  ± 0.0006,dmean
0.0064  ± 0.0007,dloss


In [40]:
def modl(modl_nm, X_tr, y_tr, X_ts, y_ts):
    modl_nm.fit(X_tr, y_tr)
    print('Train model')
    ypred_train = modl_nm.predict(X_tr)
    ypred_test = modl_nm.predict(X_ts)
    print(classification_report(y_tr, ypred_train))
    print('_'*70)
    print('Test model')
    print(classification_report(y_ts, ypred_test))

In [41]:
kf = KFold(n_splits = 5, shuffle = True, random_state = 0)

In [None]:
for i in range(42, 30, -1):
    print('Features :', i)
    xgb = XGBClassifier(random_state = 0)
    back_mod = sfs(estimator = xgb, k_features = i, forward = 'False',
                   cv = kf, scoring = 'f1_weighted', n_jobs = -1)
    back_mod.fit(Xs, Ys)
    feat_back = list(back_mod.k_feature_names_)
    print(feat_back)
    modl(xgb, Xs[feat_back], Ys, Xtest[feat_back], Ytest)
    print('#'*70)

Features : 42
['dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'proto', 'service', 'state']
Train model
              precision    recall  f1-score   support

           0       0.98      0.97      0.97     15792
           1       1.00      0.99      0.99     11099
           2       0.72      0.90      0.80      9432
           3       0.76      0.71      0.73      8200
           4       0.96      0.69      0.80      5477

    accuracy                           0.89     50000
   macro avg       0.88      0.85      0.86     50000
weighted avg       0.90      0.89      0.89     5000