# CASE STUDY - NETWORKING
# NETWORK INTRUSION DETECTION

In [101]:
#Packages related to general operating system & warnings
import os 
import warnings
warnings.filterwarnings('ignore')

#Packages related to data importing, manipulation, exploratory data analysis, data understanding
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import pandas_profiling
import scipy.stats as stats

#Packages related to data visualizaiton
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Setting plot sizes and type of plot
plt.rc("font", size=14)
plt.rcParams['axes.grid'] = True
plt.figure(figsize=(6,3))
plt.gray()

from matplotlib.backends.backend_pdf import PdfPages

#Modules related to split the data & gridsearch
from sklearn.model_selection import train_test_split, GridSearchCV

#Module related to calculation of metrics
from sklearn import metrics

#Module related to VIF 
from statsmodels.stats.outliers_influence import variance_inflation_factor

#Modules related to preprocessing (Imputation of missings, standardiszation, new features creation, converting categorical to numerical)
from sklearn.impute import MissingIndicator, SimpleImputer
#from sklearn.preprocessing import Imputer, PolynomialFeatures, KBinsDiscretizer, FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder

#Moudles related to feature selection
from sklearn.feature_selection import RFE, RFECV, SelectKBest, chi2, SelectPercentile, f_classif, mutual_info_classif, f_regression, VarianceThreshold, SelectFromModel, mutual_info_classif, mutual_info_regression, SelectFpr, SelectFdr, SelectFwe
import copy

#Modules related to pipe line creation for faster processing
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
#from sklearn.features.transformers import DataFrameSelector

#Modules related to saving python objects permanantly
from sklearn.externals import joblib

#Dumping model into current directory: joblib.dump(model_xg,"my_model.pkl") 
#Loading model: my_model_loaded=joblib.load("my_model.pkl")

#Modules related key techniques of supervised learning 
import statsmodels.formula.api as smf
import statsmodels.tsa as tsa

from sklearn.linear_model import LogisticRegression, LinearRegression, ElasticNet, Lasso, Ridge
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz, export
from sklearn.ensemble import BaggingClassifier, BaggingRegressor,RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier,GradientBoostingRegressor, AdaBoostClassifier, AdaBoostRegressor 
from xgboost import XGBClassifier, XGBRegressor
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from patsy import dmatrices

from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor

<Figure size 432x216 with 0 Axes>

### Importing Data

In [76]:
train=pd.read_csv('NSL_Dataset/Train.txt')
test=pd.read_csv('NSL_Dataset/Test.txt')

In [77]:
extra_row_tr = list(train.columns)
extra_row_te= list(test.columns)

In [78]:
columns_list=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land", "wrong_fragment","urgent","hot","num_failed_logins","logged_in", "num_compromised","root_shell","su_attempted","num_root","num_file_creations", "num_shells","num_access_files","num_outbound_cmds","is_host_login", "is_guest_login","count","srv_count","serror_rate", "srv_serror_rate", "rerror_rate","srv_rerror_rate","same_srv_rate", "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count","dst_host_same_srv_rate", "dst_host_diff_srv_rate","dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate", "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack", "last_flag"]
train.columns=columns_list
test.columns=columns_list
# train.loc[train.shape[0]] = (extra_row_tr)
# test.loc[test.shape[0]] = (extra_row_te)

In [79]:
train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
1,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
2,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
3,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
4,0,tcp,private,REJ,0,0,0,0,0,0,...,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.00,neptune,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20
125968,8,udp,private,SF,105,145,0,0,0,0,...,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal,21
125969,0,tcp,smtp,SF,2231,384,0,0,0,0,...,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal,18
125970,0,tcp,klogin,S0,0,0,0,0,0,0,...,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20


In [80]:
test

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune,21
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal,21
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint,15
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan,11
4,0,tcp,http,SF,267,14515,0,0,0,0,...,1.00,0.00,0.01,0.03,0.01,0.0,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22539,0,tcp,http,SF,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back,15
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal,21


### Creating Y-Variable as per problem statement

In [81]:
train['binomial_attack'] = np.where(train.attack == 'normal', 0 , 1)
train['multinomial_attack'] = 4
train['multinomial_attack'] = np.where(train.attack == 'normal', 0, train['multinomial_attack'])
train['multinomial_attack'] = np.where(((train.attack=='back') | (train.attack=='land') | (train.attack=='pod') | (train.attack=='neptune') | 
         (train.attack=='smurf') | (train.attack=='teardrop') | (train.attack=='apache2') | (train.attack=='udpstorm') | 
         (train.attack=='processtable') | (train.attack=='worm')), 1, train['multinomial_attack'])
train['multinomial_attack'] = np.where((train.attack=='satan') | (train.attack=='ipsweep') | (train.attack=='nmap') | (train.attack=='portsweep') | 
          (train.attack=='mscan') | (train.attack=='saint'), 2, train['multinomial_attack'])
train['multinomial_attack'] = np.where((train.attack=='guess_passwd') | (train.attack=='ftp_write') | (train.attack=='imap') | (train.attack=='phf') | 
          (train.attack=='multihop') | (train.attack=='warezmaster') | (train.attack=='warezclient') | (train.attack=='spy') | 
          (train.attack=='xlock') | (train.attack=='xsnoop') | (train.attack=='snmpguess') | (train.attack=='snmpgetattack') | 
          (train.attack=='httptunnel') | (train.attack=='sendmail') | (train.attack=='named'), 3, train['multinomial_attack'])
print(train['multinomial_attack'].value_counts())
print("="*25)
print(train['binomial_attack'].value_counts())

0    67342
1    45927
2    11656
3      995
4       52
Name: multinomial_attack, dtype: int64
0    67342
1    58630
Name: binomial_attack, dtype: int64


In [82]:
test['binomial_attack'] = np.where(test.attack == 'normal', 0 , 1)
test['multinomial_attack'] = 4
test['multinomial_attack'] = np.where(test.attack == 'normal', 0, test['multinomial_attack'])
test['multinomial_attack'] = np.where(((test.attack=='back') | (test.attack=='land') | (test.attack=='pod') | (test.attack=='neptune') | 
         (test.attack=='smurf') | (test.attack=='teardrop') | (test.attack=='apache2') | (test.attack=='udpstorm') | 
         (test.attack=='processtable') | (test.attack=='worm')), 1, test['multinomial_attack'])
test['multinomial_attack'] = np.where((test.attack=='satan') | (test.attack=='ipsweep') | (test.attack=='nmap') | (test.attack=='portsweep') | 
          (test.attack=='mscan') | (test.attack=='saint'), 2, test['multinomial_attack'])
test['multinomial_attack'] = np.where((test.attack=='guess_passwd') | (test.attack=='ftp_write') | (test.attack=='imap') | (test.attack=='phf') | 
          (test.attack=='multihop') | (test.attack=='warezmaster') | (test.attack=='warezclient') | (test.attack=='spy') | 
          (test.attack=='xlock') | (test.attack=='xsnoop') | (test.attack=='snmpguess') | (test.attack=='snmpgetattack') | 
          (test.attack=='httptunnel') | (test.attack=='sendmail') | (test.attack=='named'), 3, test['multinomial_attack'])
print(test['multinomial_attack'].value_counts())
print("="*25)
print(test['binomial_attack'].value_counts())

0    9711
1    7166
3    2885
2    2421
4     360
Name: multinomial_attack, dtype: int64
1    12832
0     9711
Name: binomial_attack, dtype: int64


In [83]:
train = train.drop(["attack"], axis=1)
test = test.drop(["attack"], axis=1)

In [84]:
print(sorted(columns_list))

['attack', 'count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count', 'dst_host_diff_srv_rate', 'dst_host_rerror_rate', 'dst_host_same_src_port_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_count', 'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate', 'duration', 'flag', 'hot', 'is_guest_login', 'is_host_login', 'land', 'last_flag', 'logged_in', 'num_access_files', 'num_compromised', 'num_failed_logins', 'num_file_creations', 'num_outbound_cmds', 'num_root', 'num_shells', 'protocol_type', 'rerror_rate', 'root_shell', 'same_srv_rate', 'serror_rate', 'service', 'src_bytes', 'srv_count', 'srv_diff_host_rate', 'srv_rerror_rate', 'srv_serror_rate', 'su_attempted', 'urgent', 'wrong_fragment']


In [85]:
cat_vars_cols = ["protocol_type", "service", "flag", "land", "logged_in", "root_shell", "su_attempted", "is_host_login", "is_guest_login", "binomial_attack", "multinomial_attack"]
num_vars_cols = list(train.columns.difference(cat_vars_cols))
print(cat_vars_cols)
print(num_vars_cols)

['protocol_type', 'service', 'flag', 'land', 'logged_in', 'root_shell', 'su_attempted', 'is_host_login', 'is_guest_login', 'binomial_attack', 'multinomial_attack']
['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count', 'dst_host_diff_srv_rate', 'dst_host_rerror_rate', 'dst_host_same_src_port_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_count', 'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate', 'duration', 'hot', 'last_flag', 'num_access_files', 'num_compromised', 'num_failed_logins', 'num_file_creations', 'num_outbound_cmds', 'num_root', 'num_shells', 'rerror_rate', 'same_srv_rate', 'serror_rate', 'src_bytes', 'srv_count', 'srv_diff_host_rate', 'srv_rerror_rate', 'srv_serror_rate', 'urgent', 'wrong_fragment']


In [86]:
num_train = train[num_vars_cols]
cat_train = train[cat_vars_cols]

### Data Audit

In [87]:
def var_summary(x):
    return pd.Series([x.count(), x.isnull().sum(), x.sum(), x.mean(), x.median(),  x.std(), x.var(), x.min(), x.dropna().quantile(0.01), x.dropna().quantile(0.05),x.dropna().quantile(0.10),x.dropna().quantile(0.25),x.dropna().quantile(0.50),x.dropna().quantile(0.75), x.dropna().quantile(0.90),x.dropna().quantile(0.95), x.dropna().quantile(0.99),x.max()], 
                  index=['N', 'NMISS', 'SUM', 'MEAN','MEDIAN', 'STD', 'VAR','MIN', 'P1' , 'P5' ,'P10' ,'P25' ,'P50' ,'P75' ,'P90' ,'P95' ,'P99' ,'MAX'])

def cat_summary(x):
    Mode = x.value_counts().sort_values(ascending = False).index[0]
    return pd.Series([x.count(), x.isnull().sum(), Mode], 
                  index=['N', 'NMISS', 'Mode'])

In [88]:
num_audit = num_train.apply(var_summary).T
num_audit.to_csv("num_audit.csv")
num_audit

Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
count,125972.0,0.0,10595280.0,84.108207,14.0,114.5088,13112.27,0.0,1.0,1.0,1.0,2.0,14.0,143.0,256.0,286.0,511.0,511.0
diff_srv_rate,125972.0,0.0,7942.93,0.063053,0.0,0.180315,0.03251351,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.07,0.29,1.0,1.0
dst_bytes,125972.0,0.0,2491634000.0,19779.271433,0.0,4021285.0,16170730000000.0,0.0,0.0,0.0,0.0,0.0,0.0,516.0,3375.9,8314.0,25519.0,1309937000.0
dst_host_count,125972.0,0.0,22945700.0,182.1492,255.0,99.20657,9841.943,0.0,1.0,3.0,11.0,82.0,255.0,255.0,255.0,255.0,255.0,255.0
dst_host_diff_srv_rate,125972.0,0.0,10449.57,0.082952,0.02,0.1889225,0.03569171,0.0,0.0,0.0,0.0,0.0,0.02,0.07,0.1,0.56,1.0,1.0
dst_host_rerror_rate,125972.0,0.0,14969.55,0.118832,0.0,0.3065586,0.09397818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,1.0,1.0,1.0
dst_host_same_src_port_rate,125972.0,0.0,18691.56,0.148379,0.0,0.3089984,0.09547998,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.84,1.0,1.0,1.0
dst_host_same_srv_rate,125972.0,0.0,65662.21,0.521244,0.51,0.4489501,0.2015562,0.0,0.0,0.0,0.01,0.05,0.51,1.0,1.0,1.0,1.0,1.0
dst_host_serror_rate,125972.0,0.0,35833.33,0.284455,0.0,0.4447851,0.1978338,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0
dst_host_srv_count,125972.0,0.0,14569130.0,115.653725,63.0,110.7029,12255.13,0.0,1.0,1.0,2.0,10.0,63.0,255.0,255.0,255.0,255.0,255.0


In [89]:
cat_audit = cat_train.apply(cat_summary).T
cat_audit.to_csv("cat_audit.csv")
cat_audit

Unnamed: 0,N,NMISS,Mode
protocol_type,125972,0,tcp
service,125972,0,http
flag,125972,0,SF
land,125972,0,0
logged_in,125972,0,0
root_shell,125972,0,0
su_attempted,125972,0,0
is_host_login,125972,0,0
is_guest_login,125972,0,0
binomial_attack,125972,0,0


In [90]:
num_test = test[num_vars_cols]
cat_test = test[cat_vars_cols]

In [91]:
num_audit = num_test.apply(var_summary).T
num_audit.to_csv("num_audit_test.csv")
num_audit

Unnamed: 0,N,NMISS,SUM,MEAN,MEDIAN,STD,VAR,MIN,P1,P5,P10,P25,P50,P75,P90,P95,P99,MAX
count,22543.0,0.0,1781386.0,79.021692,8.0,128.538218,16522.07,0.0,1.0,1.0,1.0,1.0,8.0,123.0,256.0,400.9,511.0,511.0
diff_srv_rate,22543.0,0.0,2120.74,0.094075,0.0,0.259143,0.06715523,0.0,0.0,0.0,0.0,0.0,0.0,0.06,0.09,1.0,1.0,1.0
dst_bytes,22543.0,0.0,46350890.0,2056.110012,46.0,21219.763847,450278400.0,0.0,0.0,0.0,0.0,0.0,46.0,601.0,4099.4,8314.0,26032.76,1345927.0
dst_host_count,22543.0,0.0,4370337.0,193.866699,255.0,94.036867,8842.932,0.0,1.0,6.0,20.0,121.0,255.0,255.0,255.0,255.0,255.0,255.0
dst_host_diff_srv_rate,22543.0,0.0,2041.07,0.090541,0.01,0.220722,0.04871805,0.0,0.0,0.0,0.0,0.0,0.01,0.06,0.15,0.75,1.0,1.0
dst_host_rerror_rate,22543.0,0.0,5260.43,0.233351,0.0,0.387204,0.1499267,0.0,0.0,0.0,0.0,0.0,0.0,0.36,1.0,1.0,1.0,1.0
dst_host_same_src_port_rate,22543.0,0.0,2981.69,0.132267,0.0,0.306274,0.09380364,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.85,1.0,1.0,1.0
dst_host_same_srv_rate,22543.0,0.0,13722.98,0.608747,0.92,0.435681,0.1898182,0.0,0.0,0.0,0.02,0.07,0.92,1.0,1.0,1.0,1.0,1.0
dst_host_serror_rate,22543.0,0.0,2205.11,0.097818,0.0,0.273144,0.07460766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.35,1.0,1.0,1.0
dst_host_srv_count,22543.0,0.0,3173070.0,140.756332,168.0,111.783059,12495.45,0.0,1.0,1.0,3.0,15.0,168.0,255.0,255.0,255.0,255.0,255.0


In [92]:
cat_audit = cat_test.apply(cat_summary).T
cat_audit.to_csv("cat_audit_test.csv")
cat_audit

Unnamed: 0,N,NMISS,Mode
protocol_type,22543,0,tcp
service,22543,0,http
flag,22543,0,SF
land,22543,0,0
logged_in,22543,0,0
root_shell,22543,0,0
su_attempted,22543,0,0
is_host_login,22543,0,0
is_guest_login,22543,0,0
binomial_attack,22543,0,1


### Handling Outlier

In [93]:
def outlier_capping(x):
    x = x.clip(upper=x.quantile(0.99))
    x = x.clip(lower=x.quantile(0.01))
    return x

In [94]:
num_train = num_train.apply(outlier_capping)
num_test = num_test.apply(outlier_capping)

In [95]:
train_final = pd.concat([num_train, cat_train],axis=1)
test_final = pd.concat([num_test, cat_test], axis=1)

### Creating dummy variables

In [96]:
def create_dummies( df, colname ):
    col_dummies = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    df = pd.concat([df, col_dummies], axis=1)
    df.drop( colname, axis = 1, inplace = True )
    return df

In [97]:
dummy_cols = ["protocol_type", "service", "flag"]
for colname in dummy_cols:
    train_final = create_dummies(train_final, colname)
    test_final = create_dummies(test_final, colname)

In [102]:
train_ml = copy.deepcopy(train_final)
train_final.to_csv("Filtered_train_data.csv")
train_final.head()

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,13,0.15,0,255,0.6,0.0,0.88,0.0,0.0,1,...,0,0,0,0,0,0,0,0,1,0
1,123,0.07,0,255,0.05,0.0,0.0,0.1,1.0,26,...,0,0,0,0,1,0,0,0,0,0
2,5,0.0,8153,30,0.0,0.0,0.03,1.0,0.03,255,...,0,0,0,0,0,0,0,0,1,0
3,30,0.0,420,255,0.0,0.0,0.0,1.0,0.0,255,...,0,0,0,0,0,0,0,0,1,0
4,121,0.06,0,255,0.07,1.0,0.0,0.07,0.0,19,...,1,0,0,0,0,0,0,0,0,0


In [103]:
test_ml = copy.deepcopy(test_final)
test_final.to_csv("Filtered_test_data.csv")
test_final.head()

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,136,0.06,0.0,255,0.06,1.0,0.0,0.0,0.0,1,...,1,0,0,0,0,0,0,0,0,0
1,1,0.0,0.0,134,0.04,0.0,0.61,0.61,0.0,86,...,0,0,0,0,0,0,0,0,1,0
2,1,0.0,0.0,3,0.0,0.0,1.0,1.0,0.0,57,...,0,0,0,0,0,0,0,0,1,0
3,1,0.0,15.0,29,0.17,0.83,0.03,0.31,0.0,86,...,0,1,0,0,0,0,0,0,0,0
4,4,0.0,14515.0,155,0.0,0.0,0.01,1.0,0.01,255,...,0,0,0,0,0,0,0,0,1,0


## Variable reduction
### RFE

In [104]:

X = train_final[train_final.columns.difference(['binomial_attack', 'multinomial_attack'])]
y = train_final[['binomial_attack']]



In [27]:
rfe = RFE(RandomForestClassifier(), 25)
rfe = rfe.fit(X, y)

In [28]:
X.columns

Index(['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count',
       'dst_host_diff_srv_rate', 'dst_host_rerror_rate',
       'dst_host_same_src_port_rate', 'dst_host_same_srv_rate',
       'dst_host_serror_rate', 'dst_host_srv_count',
       ...
       'service_vmnet', 'service_whois', 'src_bytes', 'srv_count',
       'srv_diff_host_rate', 'srv_rerror_rate', 'srv_serror_rate',
       'su_attempted', 'urgent', 'wrong_fragment'],
      dtype='object', length=120)

In [29]:
rfe_var = X.columns[rfe.get_support()]
rfe_var

Index(['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count',
       'dst_host_diff_srv_rate', 'dst_host_rerror_rate',
       'dst_host_same_src_port_rate', 'dst_host_same_srv_rate',
       'dst_host_serror_rate', 'dst_host_srv_count',
       'dst_host_srv_diff_host_rate', 'dst_host_srv_rerror_rate',
       'dst_host_srv_serror_rate', 'flag_SF', 'last_flag', 'logged_in',
       'protocol_type_tcp', 'same_srv_rate', 'service_eco_i', 'service_ecr_i',
       'service_http', 'service_private', 'src_bytes', 'srv_count',
       'srv_serror_rate'],
      dtype='object')

### SelectKbest

In [30]:

SKB = SelectKBest(f_classif, k=25).fit(X, y )

In [31]:
skb_var = X.columns[SKB.get_support()]
skb_var

Index(['count', 'diff_srv_rate', 'dst_bytes', 'dst_host_count',
       'dst_host_diff_srv_rate', 'dst_host_rerror_rate',
       'dst_host_same_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_count',
       'dst_host_srv_rerror_rate', 'dst_host_srv_serror_rate', 'flag_S0',
       'flag_SF', 'last_flag', 'logged_in', 'protocol_type_udp', 'rerror_rate',
       'same_srv_rate', 'serror_rate', 'service_domain_u', 'service_http',
       'service_private', 'service_smtp', 'srv_rerror_rate',
       'srv_serror_rate'],
      dtype='object')

In [32]:
superset = set(list(rfe_var) + list(skb_var))
len(superset)

32

### WOE and IV

In [33]:
#Information value calculation
def calculate_woe_iv(dataset, feature, target):
    lst = []
    for i in range(dataset[feature].nunique()):
        val = list(dataset[feature].unique())[i]
        lst.append({
            'Value': val,
            'All': dataset[dataset[feature] == val].count()[feature],
            'Good': dataset[(dataset[feature] == val) & (dataset[target] == 0)].count()[feature],
            'Bad': dataset[(dataset[feature] == val) & (dataset[target] == 1)].count()[feature]
        })
        
    dset = pd.DataFrame(lst)
    dset['Distr_Good'] = dset['Good'] / dset['Good'].sum()
    dset['Distr_Bad'] = dset['Bad'] / dset['Bad'].sum()
    dset['WoE'] = np.log(dset['Distr_Good'] / dset['Distr_Bad'])
    dset = dset.replace({'WoE': {np.inf: 0, -np.inf: 0}})
    dset['IV'] = (dset['Distr_Good'] - dset['Distr_Bad']) * dset['WoE']
    iv = dset['IV'].sum()
    
    dset = dset.sort_values(by='WoE')
    
    return dset, iv

In [34]:
for col in superset:
    print('WoE and IV for column: {}'.format(col))
    df, iv = calculate_woe_iv(train_final, col, 'binomial_attack')
    print(df)
    print('IV score: {:.2f}'.format(iv))
    print('\n')

WoE and IV for column: protocol_type_tcp
   Value     All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
1      1  102688  53599  49089    0.795922   0.837268 -0.050642  0.002094
0      0   23284  13743   9541    0.204078   0.162732  0.226394  0.009360
IV score: 0.01


WoE and IV for column: dst_host_srv_serror_rate
    Value    All  Good    Bad  Distr_Good  Distr_Bad       WoE        IV
1    1.00  34256    33  34223    0.000490   0.583711 -7.082683  4.130772
15   0.96     44     2     42    0.000030   0.000716 -3.183060  0.002186
26   0.98     53     5     48    0.000074   0.000819 -2.400301  0.001787
8    0.97     56     6     50    0.000089   0.000853 -2.258801  0.001725
44   0.89     11     2      9    0.000030   0.000154 -1.642615  0.000203
..    ...    ...   ...    ...         ...        ...       ...       ...
37   0.15     11    10      1    0.000148   0.000017  2.164047  0.000284
5    0.09     57    52      5    0.000772   0.000085  2.203268  0.001513
33   0.11     4

    Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
0      15   3990    230   3760    0.003415   0.064131 -2.932633  0.178057
9      11   1815    220   1595    0.003267   0.027205 -2.119539  0.050737
6      16   2393    307   2086    0.004559   0.035579 -2.054693  0.063737
7      12    729    121    608    0.001797   0.010370 -1.752922  0.015028
1      19  10284   1773   8511    0.026328   0.145165 -1.707224  0.202880
10     13    451    111    340    0.001648   0.005799 -1.257953  0.005221
3      18  20667   7076  13591    0.105076   0.231810 -0.791237  0.100277
4      20  19338   6885  12453    0.102239   0.212400 -0.731154  0.080544
5      17   3074   1133   1941    0.016825   0.033106 -0.676872  0.011020
8      14    674    322    352    0.004782   0.006004 -0.227617  0.000278
2      21  62557  49164  13393    0.730064   0.228433  1.161892  0.582842
IV score: 1.29


WoE and IV for column: logged_in
   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE

   Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
1      0  51028   3950  47078    0.058656   0.802968 -2.616628  1.947587
0      1  74944  63392  11552    0.941344   0.197032  1.563941  1.164060
IV score: 3.11


WoE and IV for column: service_eco_i
   Value     All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
1      1    4586    497   4089     0.00738   0.069742 -2.246003  0.140066
0      0  121386  66845  54541     0.99262   0.930258  0.064886  0.004046
IV score: 0.14


WoE and IV for column: dst_host_same_src_port_rate
    Value    All   Good    Bad  Distr_Good  Distr_Bad       WoE        IV
4    1.00  10307   2529   7778    0.037555   0.132662 -1.262013  0.120027
77   0.66     57     17     40    0.000252   0.000682 -0.994204  0.000427
71   0.65     79     25     54    0.000371   0.000921 -0.908646  0.000500
47   0.63     65     22     43    0.000327   0.000733 -0.808695  0.000329
1    0.00  63023  22433  40590    0.333120   0.692308 -0.731526  0.

### VIF

In [35]:
#Final List
Final_list = [
#               'same_srv_rate', 
              'dst_host_count', 
#               'protocol_type_tcp', 
              'dst_bytes', 
              'dst_host_rerror_rate', 
              'srv_count', 
              'dst_host_srv_diff_host_rate', 
#               'srv_rerror_rate', 
              'dst_host_same_src_port_rate', 
#               'serror_rate', 
              'service_http', 
              'service_private', 
#               'service_ecr_i', 
#               'flag_S0', 
#               'srv_serror_rate', 
              'logged_in', 
              'service_domain_u', 
              'dst_host_serror_rate', 
#               'dst_host_srv_rerror_rate', 
#               'service_eco_i', 
#               'service_smtp', 
#               'dst_host_same_srv_rate', 
#               'hot', 
#               'protocol_type_udp', 
              'dst_host_srv_count', 
              'dst_host_diff_srv_rate', 
              'diff_srv_rate', 
              'count', 
#               'last_flag', 
#               'dst_host_srv_serror_rate', 
#               'rerror_rate', 
#               'flag_SF', 
              'src_bytes'
             ]

In [36]:
# run the dmatrices
print(len(Final_list))
formula = "binomial_attack ~" + '+'.join(Final_list)
y, X_new = dmatrices(formula_like=formula, data =train_final, return_type = 'dataframe')

16


In [37]:
print(X_new.shape[1])


17


In [38]:
### VIF Calculation for variables
vif = pd.DataFrame()
vif["VIF_Factor"] = [variance_inflation_factor(X_new.values, i) for i in range(X_new.shape[1])]
vif["features"] = X_new.columns

In [39]:
vif.sort_values(by='VIF_Factor',ascending=False)

Unnamed: 0,VIF_Factor,features
0,29.431896,Intercept
12,5.544388,dst_host_srv_count
9,4.954534,logged_in
11,4.945978,dst_host_serror_rate
7,4.219763,service_http
15,3.321808,count
4,2.9367,srv_count
3,2.602483,dst_host_rerror_rate
10,2.357077,service_domain_u
6,2.199393,dst_host_same_src_port_rate


In [40]:
train_final1 = pd.concat([X_new, y], axis=1)

In [41]:
eqn = 'binomial_attack~ '+'+'.join(Final_list)
# TODO

In [42]:
#Implementation Model building
#Logistic Regression

logit_model = smf.logit(formula = eqn, data=train_final1).fit()

Optimization terminated successfully.
         Current function value: 0.115048
         Iterations 11


In [43]:
print(logit_model.summary())

                           Logit Regression Results                           
Dep. Variable:        binomial_attack   No. Observations:               125972
Model:                          Logit   Df Residuals:                   125955
Method:                           MLE   Df Model:                           16
Date:                Sat, 30 Jan 2021   Pseudo R-squ.:                  0.8334
Time:                        18:55:35   Log-Likelihood:                -14493.
converged:                       True   LL-Null:                       -87016.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      -3.2771      0.063    -51.926      0.000      -3.401      -3.153
dst_host_count                  0.0038      0.000     17.534      0.000       0.

In [44]:
train_final1['pred_prob'] =logit_model.predict(train_final1)

In [45]:
test_final['pred_prob'] =logit_model.predict(test_final)

In [46]:
train_Gini = 2*metrics.roc_auc_score(train_final1.binomial_attack, train_final1.pred_prob)-1
print(train_Gini)

0.9837919439610769


In [47]:
test_Gini = 2*metrics.roc_auc_score(test_final.binomial_attack, test_final.pred_prob)-1
print(test_Gini)

0.8386447189101696


In [48]:
train_final.binomial_attack.count()

125972

In [49]:
roc_df = pd.DataFrame()
for cut_off in np.linspace(0,1):
    #roc_df_temp['cut_off'] = cut_off
    train_final1['y_pred'] = np.where(train_final1.pred_prob>cut_off, 1, 0)
    train_final1['TP'] = np.where(((train_final1.binomial_attack ==1) & (train_final1.y_pred==1)), 1, 0)
    train_final1['TN'] = np.where(((train_final1.binomial_attack ==0) & (train_final1.y_pred==0)), 1, 0)
    train_final1['FP'] = np.where(((train_final1.binomial_attack ==0) & (train_final1.y_pred==1)), 1, 0)
    train_final1['FN'] = np.where(((train_final1.binomial_attack ==1) & (train_final1.y_pred==0)), 1, 0)
    sensitivity = train_final1.TP.sum()/train_final1.binomial_attack.sum()
    specificity = train_final1.TN.sum()/(1-train_final1.binomial_attack).sum()
    accuracy = (train_final1.TN.sum()+train_final1.TP.sum())/train_final1.binomial_attack.count()
    roc_like_table = pd.DataFrame([cut_off, sensitivity, specificity, accuracy]).T
    roc_like_table.columns = ['cutoff', 'sensitivity', 'specificity', 'accuracy']
    roc_df = pd.concat([roc_df, roc_like_table], axis=0)
    

In [50]:
roc_df['total'] = roc_df.sensitivity + roc_df.specificity

In [51]:
roc_df[roc_df.total == roc_df.total.max()]

Unnamed: 0,cutoff,sensitivity,specificity,accuracy,total
0,0.408163,0.96118,0.955199,0.957983,1.916379


In [65]:
train_final1['y_pred'] = np.where(train_final1.pred_prob>0.408, 1, 0)
test_final['y_pred'] = np.where(test_final.pred_prob>0.408, 1, 0)

In [66]:
print(metrics.classification_report(train_final1.binomial_attack, train_final1.y_pred))

              precision    recall  f1-score   support

         0.0       0.97      0.96      0.96     67342
         1.0       0.95      0.96      0.96     58630

    accuracy                           0.96    125972
   macro avg       0.96      0.96      0.96    125972
weighted avg       0.96      0.96      0.96    125972



In [67]:
print(metrics.classification_report(test_final.binomial_attack, test_final.y_pred))

              precision    recall  f1-score   support

           0       0.64      0.91      0.75      9711
           1       0.90      0.60      0.72     12832

    accuracy                           0.74     22543
   macro avg       0.77      0.76      0.74     22543
weighted avg       0.79      0.74      0.73     22543



In [68]:
#Decile Analysis for train data
#Decile Analysis
train_final1['Deciles']=pd.qcut(train_final1['pred_prob'],10, labels=False)
test_final['Deciles']=pd.qcut(test_final['pred_prob'],10, labels=False)

In [69]:
train_final1.head()

Unnamed: 0,Intercept,dst_host_count,dst_bytes,dst_host_rerror_rate,srv_count,dst_host_srv_diff_host_rate,dst_host_same_src_port_rate,service_http,service_private,logged_in,...,src_bytes,binomial_attack,pred_prob,y_pred,TP,TN,FP,FN,Deciles,goods
0,1.0,255.0,0.0,0.0,1.0,0.0,0.88,0.0,0.0,0.0,...,146.0,0.0,0.351892,0,0,1,0,0,5,1.0
1,1.0,255.0,0.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.9998,1,0,0,0,1,8,0.0
2,1.0,30.0,8153.0,0.0,5.0,0.04,0.03,1.0,0.0,1.0,...,232.0,0.0,0.001666,0,0,1,0,0,1,1.0
3,1.0,255.0,420.0,0.0,32.0,0.0,0.0,1.0,0.0,1.0,...,199.0,0.0,0.002032,0,0,1,0,0,2,1.0
4,1.0,255.0,0.0,1.0,19.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.99693,1,0,0,0,1,7,0.0


In [70]:
train_final1['goods'] = 1-train_final1.binomial_attack
test_final['goods'] = 1-test_final.binomial_attack

In [71]:
decile_results_train = train_final1.groupby(['Deciles']).agg(min_prob = ('pred_prob', 'min'),
                              max_prob = ('pred_prob', 'max'),
                              No_bads = ('binomial_attack', 'sum'),
                              No_goods = ('goods', 'sum'), 
                              total = ('binomial_attack', 'count'))

decile_results_test = test_final.groupby(['Deciles']).agg(min_prob = ('pred_prob', 'min'),
                              max_prob = ('pred_prob', 'max'),
                              No_bads = ('binomial_attack', 'sum'),
                              No_goods = ('goods', 'sum'), 
                              total = ('binomial_attack', 'count'),)

In [72]:
decile_results_train

Unnamed: 0_level_0,min_prob,max_prob,No_bads,No_goods,total
Deciles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,7e-06,0.001144,1.0,12597.0,12598
1,0.001144,0.001686,1.0,12596.0,12597
2,0.001686,0.002523,4.0,12593.0,12597
3,0.002523,0.015115,131.0,12466.0,12597
4,0.015118,0.2813,1043.0,11556.0,12599
5,0.281417,0.898043,7672.0,4923.0,12595
6,0.898049,0.989319,12064.0,533.0,12597
7,0.98932,0.999086,12544.0,53.0,12597
8,0.999086,0.999842,12587.0,10.0,12597
9,0.999842,1.0,12583.0,15.0,12598


In [73]:
decile_results_test

Unnamed: 0_level_0,min_prob,max_prob,No_bads,No_goods,total
Deciles,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,5e-05,0.00124,0,2255,2255
1,0.00124,0.001753,1,2253,2254
2,0.001753,0.003109,4,2250,2254
3,0.003113,0.014491,1088,1166,2254
4,0.014495,0.10971,1789,466,2255
5,0.109795,0.315334,1841,413,2254
6,0.315452,0.91037,1991,263,2254
7,0.910412,0.997073,1995,259,2254
8,0.997073,0.999824,1890,364,2254
9,0.999824,1.0,2233,22,2255


In [74]:
decile_results_train.to_csv('decile_results_train.csv')
decile_results_test.to_csv('decile_results_test.csv')

### Multiclass classification

### Machine Learning techniques

In [105]:
train_ml.head()

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,13,0.15,0,255,0.6,0.0,0.88,0.0,0.0,1,...,0,0,0,0,0,0,0,0,1,0
1,123,0.07,0,255,0.05,0.0,0.0,0.1,1.0,26,...,0,0,0,0,1,0,0,0,0,0
2,5,0.0,8153,30,0.0,0.0,0.03,1.0,0.03,255,...,0,0,0,0,0,0,0,0,1,0
3,30,0.0,420,255,0.0,0.0,0.0,1.0,0.0,255,...,0,0,0,0,0,0,0,0,1,0
4,121,0.06,0,255,0.07,1.0,0.0,0.07,0.0,19,...,1,0,0,0,0,0,0,0,0,0


In [106]:
test_ml.head()
missing_cols = set(train_ml.columns) - set(test_ml.columns)
print(missing_cols)

{'service_http_8001', 'service_harvest', 'service_red_i', 'service_urh_i', 'service_aol', 'service_http_2784'}


In [107]:
for col in missing_cols:
    test_ml[col] = 0
test_ml.head()

Unnamed: 0,count,diff_srv_rate,dst_bytes,dst_host_count,dst_host_diff_srv_rate,dst_host_rerror_rate,dst_host_same_src_port_rate,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_count,...,flag_S2,flag_S3,flag_SF,flag_SH,service_http_8001,service_harvest,service_red_i,service_urh_i,service_aol,service_http_2784
0,136,0.06,0.0,255,0.06,1.0,0.0,0.0,0.0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,0.0,134,0.04,0.0,0.61,0.61,0.0,86,...,0,0,1,0,0,0,0,0,0,0
2,1,0.0,0.0,3,0.0,0.0,1.0,1.0,0.0,57,...,0,0,1,0,0,0,0,0,0,0
3,1,0.0,15.0,29,0.17,0.83,0.03,0.31,0.0,86,...,0,0,0,0,0,0,0,0,0,0
4,4,0.0,14515.0,155,0.0,0.0,0.01,1.0,0.01,255,...,0,0,1,0,0,0,0,0,0,0


In [108]:
X_train = train_ml[train_ml.columns.difference(['binomial_attack', 'multinomial_attack'])]
y_train = train_ml.multinomial_attack

X_test = test_ml[test_ml.columns.difference(['binomial_attack', 'multinomial_attack'])]
y_test = test_ml.multinomial_attack

In [110]:
### XGB Classifier

In [111]:
xgb = XGBClassifier(n_estimators=100,
                    max_depth=15, 
                    gamma = 7)
eval_metric = ["mlogloss"]
xgb.fit(X_train, y_train, eval_metric=eval_metric, verbose=True)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=7, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=True,
              validate_parameters=1, verbosity=None)

In [112]:
print(xgb.score(X_train, y_train))
print(xgb.score(X_test, y_test))

0.9993569999682469
0.7971432373685845


In [113]:
xgb_train_pred = xgb.predict(X_train)
print(metrics.classification_report(y_train, xgb_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     67342
           1       1.00      1.00      1.00     45927
           2       1.00      1.00      1.00     11656
           3       0.99      0.98      0.98       995
           4       0.88      0.69      0.77        52

    accuracy                           1.00    125972
   macro avg       0.97      0.93      0.95    125972
weighted avg       1.00      1.00      1.00    125972



In [114]:
xgb_test_pred = xgb.predict(X_test)
print(metrics.classification_report(y_test, xgb_test_pred))

              precision    recall  f1-score   support

           0       0.73      0.96      0.83      9711
           1       0.94      0.87      0.90      7166
           2       0.75      0.75      0.75      2421
           3       0.97      0.20      0.33      2885
           4       0.76      0.07      0.13       360

    accuracy                           0.80     22543
   macro avg       0.83      0.57      0.59     22543
weighted avg       0.83      0.80      0.77     22543



### Random Forest

In [115]:
# Fitting Random Forest Classification to the Training set
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [116]:
rf_train_pred = classifier.predict(X_train)
print(metrics.classification_report(y_train, rf_train_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     67342
           1       1.00      1.00      1.00     45927
           2       1.00      1.00      1.00     11656
           3       1.00      1.00      1.00       995
           4       1.00      1.00      1.00        52

    accuracy                           1.00    125972
   macro avg       1.00      1.00      1.00    125972
weighted avg       1.00      1.00      1.00    125972



In [117]:
rf_test_pred = classifier.predict(X_test)
print(metrics.classification_report(y_test, rf_test_pred))

              precision    recall  f1-score   support

           0       0.66      0.97      0.79      9711
           1       0.96      0.81      0.88      7166
           2       0.82      0.62      0.71      2421
           3       0.97      0.11      0.20      2885
           4       0.64      0.04      0.08       360

    accuracy                           0.76     22543
   macro avg       0.81      0.51      0.53     22543
weighted avg       0.81      0.76      0.72     22543

