In [None]:
# Mount Google Drive (Colab only); the dataset files are read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Importing required libraries

In [None]:
import pandas as pd
import numpy as np

Loading train and test data to a dataframe

In [None]:
# Column names assigned to the data files when they are read:
# 41 feature columns followed by the label ("target") and "severity".
feature_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","target","severity"]

In [None]:
# Read the comma-separated train and test files, naming the columns from
# feature_names, and drop the 'severity' column straight away.
df_train = pd.read_csv(
    "/content/drive/Shareddrives/kdd dataset/KDDTrain+.txt",
    sep=",",
    names=feature_names,
).drop(columns=['severity'])
df_test = pd.read_csv(
    "/content/drive/Shareddrives/kdd dataset/KDDTest+.txt",
    sep=",",
    names=feature_names,
).drop(columns=['severity'])

In [None]:
# Number of distinct labels minus one (the 'normal' class) = number of attack types.
for message, frame in (
    ("number of attackes in training set:", df_train),
    ("number of attacks in test set", df_test),
):
    print(message)
    print(frame['target'].nunique() - 1)

number of attackes in training set:
22
number of attacks in test set
37


Normalization for numeric features

In [None]:
# Names of all numeric-dtype columns of the training frame at this point.
numeric_cols=df_train.select_dtypes(include='number').columns

In [None]:
numeric_cols

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the numeric features. The scaler is fitted on the training data only
# and the same transform is then applied to both splits, so the test features end up
# on the same scale as the training features (test statistics are never used to fit).
scaler = StandardScaler().fit(df_train[numeric_cols])
df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])
df_test[numeric_cols] = scaler.transform(df_test[numeric_cols])

Principal Component Analysis

There are 41 features in total for determining whether a particular piece of traffic is normal or an attack. Instead of using all 41 features, we can train the model on only the most prominent ones. We use PCA to identify the most prominent features.

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Rank the numeric features by their contribution to the leading principal components.
N_TOP_PCS = 7  # number of leading principal components taken into account

attrList = numeric_cols

n_components = len(attrList)
pca = PCA(n_components=n_components)
reduced = pca.fit_transform(df_train[attrList])
eigenvalues = pca.explained_variance_

# Eigenvalues (explained variance) of the retained components, and their sum.
top_eigenvalues = eigenvalues[:N_TOP_PCS]
es = top_eigenvalues.sum()

# Squared loadings: one row per retained component, one column per feature.
cos2 = pca.components_[:N_TOP_PCS] ** 2
# Per-component total of the squared loadings, used to express cos2 as a percentage.
cos2sum = cos2.sum(axis=1)
contribution_per = cos2 * 100 / cos2sum[:, np.newaxis]

# Cumulative contribution of each feature: the eigenvalue-weighted average of its
# percentage contribution across the retained components.
cc = ((contribution_per * top_eigenvalues[:, np.newaxis]).sum(axis=0) / es).tolist()

# Sort ascending by contribution: result1 holds the scores, result2 the feature names.
sortpairs = sorted(zip(cc, attrList))
result1, result2 = [list(t) for t in zip(*sortpairs)]

for i in range(len(result1)):
  print('{:20}:{:20}'.format(result2[i], result1[i]))

num_outbound_cmds   :                 0.0
is_host_login       :0.0002761880603804862
dst_bytes           :0.010479295088850578
src_bytes           :0.025275926474919646
num_failed_logins   : 0.08947476339906206
land                : 0.10630447341316875
urgent              : 0.17257237673161802
num_shells          : 0.22241472258444805
wrong_fragment      :  0.2381798663125313
num_file_creations  :  0.5393449082649621
diff_srv_rate       :   1.136906588533838
duration            :  1.2068842315943156
srv_diff_host_rate  :   2.004176980836101
dst_host_diff_srv_rate:    2.27906516191657
num_access_files    :  2.3557876102382593
root_shell          :   2.537414222555326
dst_host_count      :   2.896640664541487
dst_host_srv_diff_host_rate:   2.929099762373895
su_attempted        :   3.208252529146551
dst_host_same_src_port_rate:  3.4374038893620753
count               :  3.5082464019042074
logged_in           :  3.5466540320795317
dst_host_srv_count  :    3.77980883908939
dst_host_same_srv

In [None]:
print(result2)

['num_outbound_cmds', 'is_host_login', 'dst_bytes', 'src_bytes', 'num_failed_logins', 'land', 'urgent', 'num_shells', 'wrong_fragment', 'num_file_creations', 'diff_srv_rate', 'duration', 'srv_diff_host_rate', 'dst_host_diff_srv_rate', 'num_access_files', 'root_shell', 'dst_host_count', 'dst_host_srv_diff_host_rate', 'su_attempted', 'dst_host_same_src_port_rate', 'count', 'logged_in', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'srv_count', 'same_srv_rate', 'dst_host_rerror_rate', 'num_compromised', 'num_root', 'hot', 'is_guest_login', 'dst_host_srv_rerror_rate', 'rerror_rate', 'srv_rerror_rate', 'serror_rate', 'srv_serror_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate']


Considering top 20 features for training the model

In [None]:
# Hard-coded feature subset: these 20 names are the last 20 entries of the
# ascending-sorted `result2` list printed above, i.e. the highest-ranked features.
selected_features=['su_attempted', 'dst_host_same_src_port_rate', 'count', 'logged_in', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'srv_count', 'same_srv_rate', 'dst_host_rerror_rate', 'num_compromised', 'num_root', 'hot', 'is_guest_login', 'dst_host_srv_rerror_rate', 'rerror_rate', 'srv_rerror_rate', 'serror_rate', 'srv_serror_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate']

Three categorical features are present in the dataset (`protocol_type`, `service` and `flag`). Machine learning models cannot be trained directly on categorical values, so these features are one-hot encoded.

In [None]:
from sklearn.preprocessing import OneHotEncoder 


In [None]:
# Categorical columns encoded with pd.get_dummies below; the third categorical
# column, 'service', is handled separately with a OneHotEncoder.
cat_features=['protocol_type','flag']

# Training-set slice containing only those categorical columns.
cat_df=df_train[cat_features]

In [None]:
# One-hot encode the 'service' column. The encoder is fitted on the training data;
# handle_unknown='ignore' means categories unseen during fitting do not raise when
# the encoder is applied to other data later.
encoder = OneHotEncoder(handle_unknown='ignore')
service_matrix = encoder.fit_transform(df_train[['service']])
encoder_df = pd.DataFrame(service_matrix.toarray())

In [None]:
# Encode the test 'service' column with the encoder already fitted on the training data.
encoder_df_test=pd.DataFrame(encoder.transform(df_test[['service']]).toarray())

In [None]:
# Build the encoded categorical frame from df_train every time instead of from the
# previous value of cat_df, so re-running this cell gives the same result rather than
# failing because the categorical columns have already been replaced by dummies.
cat_df = pd.get_dummies(df_train[cat_features], columns=cat_features).join(encoder_df)
cat_df.head()

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,...,60,61,62,63,64,65,66,67,68,69
0,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,0,0,0,0,0,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Separate encoding for the test dataset

In [None]:
# Encode the categorical columns of the test set and append the 'service' columns.
cat_df_test = pd.get_dummies(df_test[cat_features], columns=cat_features)
cat_df_test = cat_df_test.join(encoder_df_test)
# get_dummies derives its columns from the values present in the frame it is given,
# so force the test frame into the training layout: same columns, same order.
# Training columns missing from test are filled with 0; columns that exist only in
# test are dropped.
cat_df_test = cat_df_test.reindex(columns=cat_df.columns, fill_value=0)
cat_df_test.head()

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,flag_OTH,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,...,60,61,62,63,64,65,66,67,68,69
0,0,1,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0,0,1,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1,0,0,0,1,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dimensions of categorical features of test data set after performing one hot encoding

In [None]:
cat_df_test.shape

(22544, 84)

Binary classification labeling


In [None]:
# Collapse the labels to two classes: 'normal' -> 'usual', everything else -> 'unusual'.
is_normal = df_train['target'].eq('normal')
binary_target = is_normal.map({True: 'usual', False: 'unusual'}).to_frame()
binary_target['target'].value_counts()

usual      67343
unusual    58630
Name: target, dtype: int64

In [None]:
# Fixed label codes. A "first label seen gets 0" scheme depends on row order, so
# the codes are pinned explicitly: 'usual' -> 0, 'unusual' -> 1.
label = {'usual': 0, 'unusual': 1}

def tonumeric(df):
   """Replace the 'target' entry of a single row with its integer code.

   Kept for backward compatibility with row-wise `DataFrame.apply(..., axis=1)` use.
   Looks the label up in the module-level `label` dict; a label that is not in the
   dict yet is assigned the next free integer.

   Parameters: df - one row (a mapping/Series with a 'target' entry).
   Returns: the same row with 'target' replaced by its integer code.
   """
   i = df['target']
   if i not in label:
     label[i] = len(label)
   df['target']=label[i]
   return df

binaryTarget_df = df_train.copy()
# Vectorised lookup instead of a Python-level pass over every row of the frame.
binaryTarget_df['target'] = binary_target['target'].map(label)
y_train = binaryTarget_df['target']

Saving the file

In [None]:
# Assemble the binary-classification training table (selected numeric features,
# encoded categorical columns, then the 0/1 target) and write it out without the index.
train_features = df_train[selected_features].join(cat_df)
binaryTarget_df = train_features.assign(target=y_train)
binaryTarget_df.to_csv(
    "/content/drive/Shareddrives/kdd dataset/binaryTraget_df.csv",
    index=False,
)

In [None]:
binaryTarget_df.columns

Index([               'su_attempted', 'dst_host_same_src_port_rate',
                             'count',                   'logged_in',
                'dst_host_srv_count',      'dst_host_same_srv_rate',
                         'srv_count',               'same_srv_rate',
              'dst_host_rerror_rate',             'num_compromised',
       ...
                                  61,                            62,
                                  63,                            64,
                                  65,                            66,
                                  67,                            68,
                                  69,                      'target'],
      dtype='object', length=105)

In [None]:
binaryTarget_df['target'].value_counts()

0    67343
1    58630
Name: target, dtype: int64

Multi Class classification labeling
Multi class classifier detects multiple attacks in network. 

In [None]:
df_train['target'].value_counts()

normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: target, dtype: int64

Grouping target to four attacks

In [None]:
# Attack names belonging to each of the four attack categories.
ATTACK_CATEGORIES = {
    'DOS': ['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],
    'R2L': ['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],
    'probe': ['ipsweep','mscan','nmap','portsweep','saint','satan'],
    'U2R': ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['target'] mutates an intermediate Series, which pandas warns about and which
# leaves the frame unchanged when Copy-on-Write is enabled.
for category, attacks in ATTACK_CATEGORIES.items():
    df_train['target'] = df_train['target'].replace(attacks, category)

In [None]:
df_train['target'].value_counts()

normal    67343
DOS       45927
probe     11656
R2L         995
U2R          52
Name: target, dtype: int64

Saving with target names

In [None]:
# Save the grouped (string) training labels, without the index column.
df_train['target'].to_csv("/content/drive/Shareddrives/kdd dataset/multiClassWithLabel.csv",index=False)

Grouping test target to four attacks

In [None]:
# Attack names belonging to each of the four attack categories.
ATTACK_CATEGORIES = {
    'DOS': ['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],
    'R2L': ['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail','snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],
    'probe': ['ipsweep','mscan','nmap','portsweep','saint','satan'],
    'U2R': ['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],
}

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['target'] mutates an intermediate Series, which pandas warns about and which
# leaves the frame unchanged when Copy-on-Write is enabled.
for category, attacks in ATTACK_CATEGORIES.items():
    df_test['target'] = df_test['target'].replace(attacks, category)

In [None]:
df_test['target'].value_counts()

normal    9711
DOS       7460
R2L       2885
probe     2421
U2R         67
Name: target, dtype: int64

In [None]:
from sklearn import preprocessing

Numeric conversion for `target` 
normal-3, DOS-0,R2L-1,probe-4,U2R-2



In [None]:
# Encoder used to turn the grouped string labels into integer codes.
le = preprocessing.LabelEncoder()

In [None]:
# Fit the encoder on the training labels and reuse it for the test labels.
# Fitting on the test split would let any test-only label silently add a class and
# shift the integer codes; fitted on train, such a label raises an error in
# transform() instead. With the five grouped classes present in both splits the codes
# are: DOS-0, R2L-1, U2R-2, normal-3, probe-4.
le.fit(df_train['target'])
df_train['target_encoded'] = le.transform(df_train['target'])
df_test['target_encoded'] = le.transform(df_test['target'])

In [None]:
df_train['target_encoded'].value_counts()

3    67343
0    45927
4    11656
1      995
2       52
Name: target_encoded, dtype: int64

In [None]:
df_test['target_encoded'].value_counts()

3    9711
0    7460
1    2885
4    2421
2      67
Name: target_encoded, dtype: int64

In [None]:
# Save the grouped (string) test labels, without the index column.
df_test['target'].to_csv("/content/drive/Shareddrives/kdd dataset/multiClassWithLabel_test.csv",index=False)

In [None]:
# Save the integer-encoded test labels, without the index column.
df_test['target_encoded'].to_csv("/content/drive/Shareddrives/kdd dataset/multiClassWithLabel_test_encoded.csv",index=False)

In [None]:
# Save the integer-encoded training labels, without the index column.
df_train['target_encoded'].to_csv("/content/drive/Shareddrives/kdd dataset/multiClassWithLabel_train_encoded.csv",index=False)

# **One hot encoding label**

In [None]:
# Multi-class training table: all numeric features, the encoded categorical columns,
# and one indicator column per grouped target class.
ohe_target = pd.get_dummies(df_train['target'])
multiclass_df = (
    df_train[numeric_cols]
    .join(cat_df)
    .join(ohe_target)
)
multiclass_df.head()

Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,65,66,67,68,69,DOS,R2L,U2R,normal,probe
0,-0.110249,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
1,-0.110249,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
2,-0.110249,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,-0.809262,-0.011664,...,0.0,0.0,0.0,0.0,0.0,1,0,0,0,0
3,-0.110249,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0
4,-0.110249,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,-0.027023,1.235694,-0.011664,...,0.0,0.0,0.0,0.0,0.0,0,0,0,1,0


In [None]:
multiclass_df.shape

(125973, 127)

Saving files

In [None]:
# Write the multi-class training table to Drive, without the index column.
multiclass_df.to_csv("/content/drive/Shareddrives/kdd dataset/multiclass_df.csv",index=False)

Processing test dataset

In [None]:
# Test feature table: the selected numeric features joined with the encoded
# categorical columns of the test set.
processed_test_df=df_test[selected_features].join(cat_df_test)

In [None]:
df_test['target'].value_counts()

normal    9711
DOS       7460
R2L       2885
probe     2421
U2R         67
Name: target, dtype: int64

In [None]:
# Collapse the test labels to two classes: 'normal' -> 'usual', everything else -> 'unusual'.
is_normal_test = df_test['target'].eq('normal')
binary_target_test = is_normal_test.map({True: 'usual', False: 'unusual'}).to_frame()
binary_target_test['target'].value_counts()

unusual    12833
usual       9711
Name: target, dtype: int64

In [None]:
# Integer form of the binary test labels with a fixed mapping: 'usual' -> 0, anything else -> 1.
target_test=pd.DataFrame(binary_target_test['target'].map(lambda x: 0 if x=='usual' else 1))

In [None]:
target_test.value_counts()

target
1         12833
0          9711
dtype: int64

In [None]:
# Fixed label codes. A "first label seen gets 0" scheme started from an empty dict
# depends on the row order of the test file, so nothing guarantees it agrees with the
# codes used for the training data. Pin the codes explicitly: 'usual' -> 0,
# 'unusual' -> 1.
label = {'usual': 0, 'unusual': 1}

def tonumeric(df):
   """Replace the 'target' entry of a single row with its integer code.

   Kept for backward compatibility with row-wise `DataFrame.apply(..., axis=1)` use.
   Looks the label up in the module-level `label` dict; a label that is not in the
   dict yet is assigned the next free integer.

   Parameters: df - one row (a mapping/Series with a 'target' entry).
   Returns: the same row with 'target' replaced by its integer code.
   """
   i = df['target']
   if i not in label:
     label[i] = len(label)
   df['target']=label[i]
   return df

binaryTarget_test_df = processed_test_df.copy()
# Vectorised lookup instead of a Python-level pass over every row of the frame.
binaryTarget_test_df['target'] = binary_target_test['target'].map(label)

binary encoding for test data set

In [None]:
target_test['target'].value_counts()

1    12833
0     9711
Name: target, dtype: int64

In [None]:
# Attach the 0/1 binary labels (target_test) to the test feature table.
processed_test_df['target']=target_test

In [None]:
# Write the test feature table (now including 'target') to Drive, without the index column.
processed_test_df.to_csv("/content/drive/Shareddrives/kdd dataset/processed_test_df.csv",index=False)

In [None]:
# Write the binary-classification test table to Drive, without the index column.
binaryTarget_test_df.to_csv("/content/drive/Shareddrives/kdd dataset/binaryTarget_test_df.csv",index=False)