In [None]:
# Forcing autoreload of modules so restart is not required
%load_ext autoreload
%autoreload 2

import os

os.chdir("../../")


print("Current Working Directory:  ", os.getcwd())

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
train = pd.read_csv("./data/staging/NSL_KDD/train.csv")
test = pd.read_csv("./data/staging/NSL_KDD/test.csv")

In [4]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [5]:
print(train['41'].unique())
print(test['41'].unique())

['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep' 'teardrop' 'nmap'
 'satan' 'smurf' 'pod' 'back' 'guess_passwd' 'ftp_write' 'multihop'
 'rootkit' 'buffer_overflow' 'imap' 'warezmaster' 'phf' 'land'
 'loadmodule' 'spy' 'perl']
['neptune' 'normal' 'saint' 'mscan' 'guess_passwd' 'smurf' 'apache2'
 'satan' 'buffer_overflow' 'back' 'warezmaster' 'snmpgetattack'
 'processtable' 'pod' 'httptunnel' 'nmap' 'ps' 'snmpguess' 'ipsweep'
 'mailbomb' 'portsweep' 'multihop' 'named' 'sendmail' 'loadmodule' 'xterm'
 'worm' 'teardrop' 'rootkit' 'xlock' 'perl' 'land' 'xsnoop' 'sqlattack'
 'ftp_write' 'imap' 'udpstorm' 'phf']


In [6]:
print(len(train))
print(len(test))

125973
22544


In [7]:
valid_labels = ['normal', 'neptune', 'smurf', 'teardrop', 'pod', 'land', 'back']

In [8]:
train = train[train['41'].isin(valid_labels)]
test = test[test['41'].isin(valid_labels)]

In [9]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [10]:
print(len(train))
print(len(test))

113270
15452


In [11]:
print(train['41'].unique())
print(test['41'].unique())

['normal' 'neptune' 'teardrop' 'smurf' 'pod' 'back' 'land']
['neptune' 'normal' 'smurf' 'back' 'pod' 'teardrop' 'land']


In [12]:
dos_attacks = ['neptune', 'smurf', 'teardrop', 'pod', 'land', 'back']

train['label'] = train['41'].map(lambda x: 0 if x == 'normal' else 1 if x in dos_attacks else None)
test['label'] = test['41'].map(lambda x: 0 if x == 'normal' else 1 if x in dos_attacks else None)

In [13]:
train.tail(500)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,label
125437,0,tcp,http,SF,251,272,0,0,0,0,...,0.01,0.00,0.00,0.00,0.0,0.00,0.0,normal,21,0
125439,0,tcp,csnet_ns,S0,0,0,0,0,0,0,...,0.05,0.00,0.00,1.00,1.0,0.00,0.0,neptune,19,1
125441,31,tcp,ftp,SF,1205,3380,0,0,0,24,...,0.02,0.00,0.00,0.00,0.0,0.00,0.0,normal,19,0
125442,0,tcp,ftp,RSTO,0,0,0,0,0,0,...,0.07,0.00,0.00,0.00,0.0,1.00,1.0,neptune,20,1
125443,0,tcp,http,SF,234,305,0,0,0,0,...,0.00,0.00,0.01,0.00,0.0,0.00,0.0,normal,21,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,0.06,0.00,0.00,1.00,1.0,0.00,0.0,neptune,20,1
125969,8,udp,private,SF,105,145,0,0,0,0,...,0.01,0.01,0.00,0.00,0.0,0.00,0.0,normal,21,0
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,0.06,0.00,0.00,0.72,0.0,0.01,0.0,normal,18,0
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,0.05,0.00,0.00,1.00,1.0,0.00,0.0,neptune,20,1


In [14]:
print(len(train))
print(len(test))

113270
15452


In [19]:
print(train.shape)
print(test.shape)

(113270, 44)
(15452, 44)


In [20]:
extra_cols_test = set(test.columns) - set(train.columns)
missing_cols_train = set(train.columns) - set(test.columns)

print("Columns in test but not in train:", extra_cols_test)
print("Columns in train but not in test:", missing_cols_train)


Columns in test but not in train: set()
Columns in train but not in test: set()


In [None]:
test = test[train.columns]

In [22]:
print(train.shape)
print(test.shape)

(113270, 44)
(15452, 44)


In [23]:
extra_cols_test = set(test.columns) - set(train.columns)
missing_cols_train = set(train.columns) - set(test.columns)

print("Columns in test but not in train:", extra_cols_test)
print("Columns in train but not in test:", missing_cols_train)


Columns in test but not in train: set()
Columns in train but not in test: set()


In [24]:
X_train, Y_train = train.drop(columns=['label']), train['label']
X_test, Y_test = test.drop(columns=['label']), test['label']

In [25]:
type(Y_train)

pandas.core.series.Series

In [26]:
label_encoder_train = LabelEncoder()
Y_train = label_encoder_train.fit_transform(Y_train)


Y_test = label_encoder_train.transform(Y_test)

In [27]:
Y_train

array([0, 0, 1, ..., 0, 1, 0], shape=(113270,))

In [28]:
Y_test

array([1, 1, 0, ..., 0, 1, 0], shape=(15452,))

In [29]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [30]:
print(train.dtypes.value_counts())
print(test.dtypes.value_counts())

int64      25
float64    15
object      4
Name: count, dtype: int64
int64      25
float64    15
object      4
Name: count, dtype: int64


In [31]:
print(train.select_dtypes(exclude=np.number).columns)
print(test.select_dtypes(exclude=np.number).columns)

Index(['1', '2', '3', '41'], dtype='object')
Index(['1', '2', '3', '41'], dtype='object')


In [32]:
categories_col = ['1', '2', '3','41']
num_col_train = list(set(X_train.columns) - set(categories_col))
num_col_test = list(set(X_test.columns) - set(categories_col))

In [33]:
print(train.dtypes.value_counts())


int64      25
float64    15
object      4
Name: count, dtype: int64


In [34]:
scaler = StandardScaler()
scaler.fit(X_train[num_col_train])

In [None]:
X_train[num_col_train] = scaler.transform(X_train[num_col_train])
X_test[num_col_test] = scaler.transform(X_test[num_col_test])

In [36]:
X_test.columns

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24',
       '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36',
       '37', '38', '39', '40', '41', '42'],
      dtype='object')

In [37]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,-0.099319,tcp,ftp_data,SF,-0.024169,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,...,-0.815526,-0.194792,0.333144,-0.283192,-0.679073,-0.666814,-0.149019,-0.318156,normal,0.054241
1,-0.099319,udp,other,SF,-0.025239,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,...,-1.197734,5.154412,3.382718,-0.283192,-0.679073,-0.666814,-0.328182,-0.318156,normal,-2.915818
2,-0.099319,tcp,private,S0,-0.025692,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,...,-0.972906,-0.0071,-0.397035,-0.283192,1.500671,1.516041,-0.328182,-0.318156,neptune,-0.539771
3,-0.099319,tcp,http,SF,-0.024973,0.109066,-0.014858,-0.094413,-0.006644,-0.085596,...,1.050552,-0.476329,-0.26818,0.419381,-0.613681,-0.644986,-0.328182,-0.281967,normal,0.648253
4,-0.099319,tcp,http,SF,-0.025075,-0.043996,-0.014858,-0.094413,-0.006644,-0.085596,...,1.050552,-0.476329,-0.397035,-0.283192,-0.679073,-0.666814,-0.328182,-0.318156,normal,0.648253


In [38]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,-0.099319,tcp,private,REJ,-0.025692,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,...,-1.107803,0.086745,-0.397035,-0.283192,-0.679073,-0.666814,3.255073,3.300722,neptune,0.648253
1,-0.099319,tcp,private,REJ,-0.025692,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,...,-1.197734,0.086745,-0.397035,-0.283192,-0.679073,-0.666814,3.255073,3.300722,neptune,0.648253
2,-0.097338,tcp,ftp_data,SF,0.014568,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,...,0.17372,-0.100946,2.223021,0.068094,-0.679073,-0.666814,-0.328182,-0.318156,normal,0.648253
5,-0.099319,tcp,http,SF,-0.024864,0.23499,-0.014858,-0.094413,-0.006644,-0.085596,...,1.050552,-0.476329,-0.354084,0.243737,-0.657276,-0.666814,-0.328182,-0.318156,normal,0.648253
6,-0.099319,tcp,smtp,SF,-0.022523,-0.044649,-0.014858,-0.094413,-0.006644,-0.085596,...,-0.950423,6.280561,-0.397035,-0.283192,-0.679073,-0.666814,2.251762,-0.173401,normal,0.648253


In [39]:
X_train.drop('41',axis=1,inplace=True)
X_test.drop('41',axis=1,inplace=True)

In [None]:
service_ohe = OneHotEncoder(handle_unknown="ignore")
proto_ohe = OneHotEncoder(handle_unknown="ignore")
flag_ohe = OneHotEncoder(handle_unknown="ignore")

ohe_service_train = service_ohe.fit(X_train[['2']])
ohe_proto_train = proto_ohe.fit(X_train[['1']])
ohe_flag_train = flag_ohe.fit(X_train[['3']])

X_train_service = pd.DataFrame(ohe_service_train.transform(X_train[['2']]).todense(), 
							   columns=ohe_service_train.get_feature_names_out(['2']))
X_train_proto = pd.DataFrame(ohe_proto_train.transform(X_train[['1']]).todense(), 
							 columns=ohe_proto_train.get_feature_names_out(['1']))
X_train_state = pd.DataFrame(ohe_flag_train.transform(X_train[['3']]).todense(), 
							 columns=ohe_flag_train.get_feature_names_out(['3']))

X_train = X_train.drop(['2', '1', '3'], axis=1)
X_train = pd.concat([X_train, X_train_service, X_train_proto, X_train_state], axis=1)

X_test_service = pd.DataFrame(ohe_service_train.transform(X_test[['2']]).todense(), 
							  columns=ohe_service_train.get_feature_names_out(['2']))
X_test_proto = pd.DataFrame(ohe_proto_train.transform(X_test[['1']]).todense(), 
							columns=ohe_proto_train.get_feature_names_out(['1']))
X_test_state = pd.DataFrame(ohe_flag_train.transform(X_test[['3']]).todense(), 
							columns=ohe_flag_train.get_feature_names_out(['3']))

X_test = X_test.drop(['2', '1', '3'], axis=1)
X_test = pd.concat([X_test, X_test_service, X_test_proto, X_test_state], axis=1)

In [41]:
X_train.head()

Unnamed: 0,0,4,5,6,7,8,9,10,11,12,...,3_OTH,3_REJ,3_RSTO,3_RSTR,3_S0,3_S1,3_S2,3_S3,3_SF,3_SH
0,-0.099319,-0.024169,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,-0.870243,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.099319,-0.025239,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,-0.870243,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.099319,-0.025692,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,-0.870243,-0.012249,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,-0.099319,-0.024973,0.109066,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,1.149104,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.099319,-0.025075,-0.043996,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,1.149104,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [42]:
X_test.head()

Unnamed: 0,0,4,5,6,7,8,9,10,11,12,...,3_OTH,3_REJ,3_RSTO,3_RSTR,3_S0,3_S1,3_S2,3_S3,3_SF,3_SH
0,-0.099319,-0.025692,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,-0.870243,-0.012249,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.099319,-0.025692,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,-0.870243,-0.012249,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.097338,0.014568,-0.052309,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,-0.870243,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,-0.099319,-0.024864,0.23499,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,1.149104,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6,-0.099319,-0.022523,-0.044649,-0.014858,-0.094413,-0.006644,-0.085596,-0.021517,1.149104,-0.012249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
x_train = set(X_train.columns.tolist())
x_test = set(X_test.columns.tolist())

diff_train = x_train - x_test
diff_test = x_test - x_train

print("Columns in X_train but not in X_test:", diff_train)
print("Columns in X_test but not in X_train:", diff_test)

print(X_train.shape, X_test.shape)

Columns in X_train but not in X_test: set()
Columns in X_test but not in X_train: set()
(124672, 117) (20330, 117)


In [44]:
Y_train = pd.DataFrame(Y_train, columns=['class'])
Y_test = pd.DataFrame(Y_test, columns=['class'])

In [45]:
Y_train['class'].unique()

array([0, 1])

In [46]:
Y_test['class'].unique()

array([1, 0])

In [None]:
X_train = X_train.dropna()
Y_train = Y_train.loc[X_train.index]

X_test = X_test.dropna()
Y_test = Y_test.loc[X_test.index]

In [None]:
x_train = set(X_train.columns.tolist())
x_test = set(X_test.columns.tolist())

diff_train = x_train - x_test
diff_test = x_test - x_train

print("Columns in X_train but not in X_test:", diff_train)
print("Columns in X_test but not in X_train:", diff_test)

print(X_train.shape, X_test.shape)

Columns in X_train but not in X_test: set()
Columns in X_test but not in X_train: set()
(101868, 117) (10574, 117)


In [51]:
X_train.to_csv('./data/processed/NSL_KDD/binary/X_train.csv', index=False)
Y_train.to_csv('./data/processed/NSL_KDD/binary/Y_train.csv', index=False)

In [52]:
X_test.to_csv('./data/processed/NSL_KDD/binary/X_test.csv', index=False)
Y_test.to_csv('./data/processed/NSL_KDD/binary/Y_test.csv', index=False)