In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from random import randint
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from sklearn.metrics import confusion_matrix
import itertools
import matplotlib.pyplot as plt
import seaborn as sns

## Enabling the GPU usage for tensorflow

In [2]:
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print('Amount of GPUs :', len(gpu_devices))
tf.config.experimental.set_memory_growth(gpu_devices[0], True)

Amount of GPUs : 1


## Importing the datasets
To import the datasets, we need to follow along with the relative folder path.

In [3]:
# Importing train dataset
train_data = pd.read_csv('datasets/kdd19/kddtrain.csv')
# Importing test dataset
test_data = pd.read_csv('datasets/kdd19/kddtest.csv')

## Importing columns to the datasets
Original NSL-KDD datasets do not contain any columns name. Therefore, we need to add the names of the columns.

In [4]:
train_data.columns = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack", "last_flag"]
test_data.columns = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","attack", "last_flag"]

## Pre-analysising how many data is contained
- Train dataframe contains the 43 columns which means there are 43 different categories of data fragments. For the data amount, it contains over 125972 rows. Therefore, it contain 125972 data tranfers.
- Columns of the test dataframe is the same as train dataframe but test dataframe contains data tranfers fewer than train datafram, 22542 rows.

In [5]:
pd.set_option("display.max_columns", None)
# train_data.head(3)
test_data.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag
0,0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,136,1,0.0,0.0,1.0,1.0,0.01,0.06,0.0,255,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune,21
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,134,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal,21
2,0,icmp,eco_i,SF,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint,15


In [None]:
train_data.describe()

In [6]:
# dropping the attack columns for TEST DATA
test_data.drop(['attack'], axis = 1, inplace = True)

test_data.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag
0,0,tcp,private,REJ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,136,1,0.0,0.0,1.0,1.0,0.01,0.06,0.0,255,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,21
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,134,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,21
2,0,icmp,eco_i,SF,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,15


## Checking dirty data for both datasets
In this steps, we will check the dirty datasets and check what kind of data types are containing in the dataframes.info() function also shows how many memory usage is took in RAM.

In [None]:
# For train data
print('If True, it contains nulls: '+ str(train_data.isnull().values.any())) # this is checking entire dataframe
#train_data.isnull().sum() # if there is NaN, check the sum of NaN
print(train_data.info())

In [None]:
# For test data
print('If True, it contains nulls: '+ str(test_data.isnull().values.any())) # this is checking entire dataframe
#test_data.isnull().sum() # if there is NaN, check the sum of NaN
print(test_data.info())

## Data Preprocessing
After checking the dirty, we will continue data preprocessing to fit in the model of neural network without having any troubles.

### Data Preprocessing for the object data
In this step, we will going to change different types of attack columns to just normal and irregular. <br>
First of all, we need to mapping the attack types to which is irregular or nomral.

In [7]:
train_data['attack'].unique()
map_attacks = {'ipsweep': 'irregular','satan': 'irregular','nmap': 'irregular','portsweep': 'irregular','saint': 'irregular',
'mscan': 'irregular','teardrop': 'irregular','pod': 'irregular','land': 'irregular','back': 'irregular','neptune': 'irregular',
'smurf': 'irregular','mailbomb': 'irregular','udpstorm': 'irregular','apache2': 'irregular','processtable': 'irregular',
'perl': 'irregular','loadmodule': 'irregular','rootkit': 'irregular','buffer_overflow': 'irregular','xterm': 'irregular',
'ps': 'irregular','sqlattack': 'irregular','httptunnel': 'irregular','ftp_write': 'irregular','phf': 'irregular',
'guess_passwd': 'irregular','warezmaster': 'irregular','warezclient': 'irregular','imap': 'irregular','spy': 'irregular',
'multihop': 'irregular','named': 'irregular','snmpguess': 'irregular','worm': 'irregular','snmpgetattack': 'irregular',
'xsnoop': 'irregular','xlock': 'irregular','sendmail': 'irregular','normal': 'normal'}

In [8]:
# adding and renaming the attack columns to traffic and apply data
# For TRAIN DATA
train_data['traffic'] = train_data['attack'].apply(lambda v: map_attacks[v])

# Predisplay data
train_data.head(3)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack,last_flag,traffic
0,0,udp,other,SF,146,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13,1,0.0,0.0,0.0,0.0,0.08,0.15,0.0,255,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15,normal
1,0,tcp,private,S0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,6,1.0,1.0,0.0,0.0,0.05,0.07,0.0,255,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19,irregular
2,0,tcp,http,SF,232,8153,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,5,5,0.2,0.2,0.0,0.0,1.0,0.0,0.0,30,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21,normal


In [None]:
test_data.head(3)

Dropout the unecessary column "attack" because there was new field 'traffic'

In [9]:
# For TRAIN DATA
train_data.drop(['attack'], axis=1, inplace=True)

In [None]:
# display attack modified data
train_data.head(3)
# print(test_data.head(3))

In [None]:
train_data.describe()

In [None]:
train_data.info()

### Changing Numerical Values
Before changing categorical value, the numerical values need to change first. <br>
Therefore, we need to scale the numerical values 

In [10]:
# importing standard scaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler
standard_scaler = StandardScaler()

# extracting numerical value from dataframe
num_cols_train = train_data.select_dtypes(include=['float64','int64']).columns
num_cols_test = test_data.select_dtypes(include=['float64','int64']).columns

# extracting object value from dataframe
obj_cols_train = train_data.select_dtypes(include=['object']).copy()
obj_cols_test = test_data.select_dtypes(include=['object']).copy()

# scaling numeric data 
scaled_num_train = standard_scaler.fit_transform(train_data.select_dtypes(include=['float64','int64']))
scaled_num_test = standard_scaler.fit_transform(test_data.select_dtypes(include=['float64','int64']))

# turn into dataframe
pd_scaled_num_train = pd.DataFrame(scaled_num_train, columns=num_cols_train)
pd_scaled_num_test = pd.DataFrame(scaled_num_test, columns=num_cols_test)

# print(obj_cols_train.shape)
# print(obj_cols_test.shape)

# merging both numeric data and object data
train_data = pd.concat([obj_cols_train, pd_scaled_num_train], axis=1)
test_data = pd.concat([obj_cols_test, pd_scaled_num_test], axis=1)

In [11]:
print(train_data.shape)
train_data.head(3)

(125972, 43)


Unnamed: 0,protocol_type,service,flag,traffic,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag
0,udp,other,SF,normal,-0.11025,-0.007737,-0.004919,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,-0.809267,-0.011664,-0.036652,-0.024437,-0.012385,-0.02618,-0.01861,-0.041221,0.0,-0.002818,-0.097531,-0.620987,-0.368112,-0.637213,-0.631933,-0.374364,-0.374433,-1.32142,0.482196,-0.374561,0.734337,-1.035693,-1.161035,2.736839,2.367729,-0.289105,-0.639535,-0.624874,-0.387635,-0.376389,-1.965547
1,tcp,private,S0,irregular,-0.11025,-0.007762,-0.004919,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,-0.809267,-0.011664,-0.036652,-0.024437,-0.012385,-0.02618,-0.01861,-0.041221,0.0,-0.002818,-0.097531,0.339641,-0.299275,1.602655,1.605095,-0.374364,-0.374433,-1.38966,0.038526,-0.374561,0.734337,-0.809862,-0.938292,-0.174419,-0.480194,-0.289105,1.60875,1.618946,-0.387635,-0.376389,-0.219968
2,tcp,http,SF,normal,-0.11025,-0.007723,-0.002891,-0.014089,-0.089487,-0.007736,-0.095076,-0.027023,1.235686,-0.011664,-0.036652,-0.024437,-0.012385,-0.02618,-0.01861,-0.041221,0.0,-0.002818,-0.097531,-0.690851,-0.313042,-0.189239,-0.184527,-0.374364,-0.374433,0.771288,-0.349685,-0.374561,-1.533667,1.258746,1.066393,-0.439079,-0.383106,0.066249,-0.572087,-0.602436,-0.387635,-0.345086,0.652822


In [12]:
print(test_data.shape)
test_data.head(3)

(22542, 42)


Unnamed: 0,protocol_type,service,flag,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,root_shell,su_attempted,num_root,num_file_creations,num_shells,num_access_files,num_outbound_cmds,is_host_login,is_guest_login,count,srv_count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,diff_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,last_flag
0,tcp,private,REJ,-0.155541,-0.021989,-0.0969,-0.017625,-0.059107,-0.01946,-0.113526,-0.144005,-0.890443,-0.016494,-0.049456,-0.012638,-0.01426,-0.012912,-0.024022,-0.05232,0.0,-0.022096,-0.171079,0.443282,-0.338255,-0.348485,-0.347407,1.830265,1.837729,-1.770871,-0.131433,-0.386982,0.650134,-1.250339,-1.397343,-0.138291,-0.431809,-0.229991,-0.358135,-0.352767,1.979974,1.929264,0.698331
1,tcp,ftp_data,SF,-0.15412,0.005471,-0.0969,-0.017625,-0.059107,-0.01946,-0.113526,-0.144005,-0.890443,-0.016494,-0.049456,-0.012638,-0.01426,-0.012912,-0.024022,-0.05232,0.0,-0.022096,-0.171079,-0.60699,-0.338255,-0.348485,-0.347407,-0.573045,-0.565019,0.629389,-0.362976,-0.386982,-0.636607,-0.489913,0.002818,-0.228911,1.56002,0.004213,-0.358135,-0.352767,-0.602648,-0.565448,0.698331
2,icmp,eco_i,SF,-0.155541,-0.021947,-0.0969,-0.017625,-0.059107,-0.01946,-0.113526,-0.144005,-0.890443,-0.016494,-0.049456,-0.012638,-0.01426,-0.012912,-0.024022,-0.05232,0.0,-0.022096,-0.171079,-0.60699,0.380328,-0.348485,-0.347407,-0.573045,-0.565019,0.629389,-0.362976,3.557025,-2.029691,-0.749353,0.898002,-0.410152,2.833484,3.048867,-0.358135,-0.352767,-0.602648,-0.565448,-0.706934


### Finding missing values in test data

In test datafram, some data are missing ('harvest', 'http_2784', 'urh_i', 'tftp_u', 'aol', 'red_i', 'http_8001').

In [20]:
print("---------- For Train data ----------")
print("Protocol types are " + str(train_data['protocol_type'].unique()) + ". And total: " + str(train_data['protocol_type'].nunique()))
print("Services are " + str(train_data['service'].unique()) + ". And total: " + str(train_data['service'].nunique()))
print("Flags are " + str(train_data['flag'].unique()) + ". And total: " + str(train_data['flag'].nunique()))
print()
print("---------- For Test data ----------")
print("Protocol types are " + str(test_data['protocol_type'].unique()) + ". And total: " + str(test_data['protocol_type'].nunique()))
print("Services are " + str(test_data['service'].unique()) + ". And total: " + str(test_data['service'].nunique()))
print("Flags are " + str(test_data['flag'].unique()) + ". And total: " + str(test_data['flag'].nunique()))

---------- For Train data ----------
Protocol types are ['udp' 'tcp' 'icmp']. And total: 3
Services are ['other' 'private' 'http' 'remote_job' 'ftp_data' 'name' 'netbios_ns'
 'eco_i' 'mtp' 'telnet' 'finger' 'domain_u' 'supdup' 'uucp_path' 'Z39_50'
 'smtp' 'csnet_ns' 'uucp' 'netbios_dgm' 'urp_i' 'auth' 'domain' 'ftp'
 'bgp' 'ldap' 'ecr_i' 'gopher' 'vmnet' 'systat' 'http_443' 'efs' 'whois'
 'imap4' 'iso_tsap' 'echo' 'klogin' 'link' 'sunrpc' 'login' 'kshell'
 'sql_net' 'time' 'hostnames' 'exec' 'ntp_u' 'discard' 'nntp' 'courier'
 'ctf' 'ssh' 'daytime' 'shell' 'netstat' 'pop_3' 'nnsp' 'IRC' 'pop_2'
 'printer' 'tim_i' 'pm_dump' 'red_i' 'netbios_ssn' 'rje' 'X11' 'urh_i'
 'http_8001' 'aol' 'http_2784' 'tftp_u' 'harvest']. And total: 70
Flags are ['SF' 'S0' 'REJ' 'RSTR' 'SH' 'RSTO' 'S1' 'RSTOS0' 'S3' 'S2' 'OTH']. And total: 11

---------- For Test data ----------
Protocol types are ['tcp' 'icmp' 'udp']. And total: 3
Services are ['private' 'ftp_data' 'eco_i' 'telnet' 'http' 'smtp' 'ftp' 'ldap'

In [18]:
train_service = train_data['service'].tolist()
test_service = test_data['service'].tolist()
difference=list(set(train_service) - set(test_service))
difference

['harvest', 'http_2784', 'urh_i', 'tftp_u', 'aol', 'red_i', 'http_8001']

In [28]:
# after get dummies
for col in difference:
    obj_cols_test[col] = 0

obj_cols_test.head(3)

Unnamed: 0,protocol_type,service,flag,harvest,http_2784,urh_i,tftp_u,aol,red_i,http_8001
0,tcp,private,REJ,0,0,0,0,0,0,0
1,tcp,ftp_data,SF,0,0,0,0,0,0,0
2,icmp,eco_i,SF,0,0,0,0,0,0,0


### Diplaying Dimensions and columns

In [None]:
print(train_data.shape)
train_data.head(3)

In [None]:
print(test_data.shape)
test_data.head(3)

In [None]:
# shifting to last columns 'traffic'
target_column = train_data.pop('traffic')
train_data.insert(123, 'traffic', target_column)
train_data.head(3)

In [None]:
# now we have only one object which is 'traffic'
train_data.info()
test_data.info()

Now change the traffic object data to <b>normal = 0 and irregular =  1</b> values.

In [None]:
map_traffic = {'normal': '0','irregular': '1'}
train_data['traffic'] = train_data['traffic'].apply(lambda v: map_traffic[v])
train_data.head(3)

### Cleaing unecessary columns 
num_outbound_cmds are all 0 data in both train and test data. Therefore, drop out all the data of num_outbound_cmds

In [None]:
# print(train_data['num_outbound_cmds'].value_counts())
# print(test_data['num_outbound_cmds'].value_counts())
# train_data.shape
test_data.shape

In [None]:
train_data.drop(['num_outbound_cmds'], axis=1, inplace=True)
test_data.drop(['num_outbound_cmds'], axis=1, inplace=True)

In [None]:
# correlation
# corr = train_data.corr()
# plt.figure(figsize=(15,12))
# sns.heatmap(corr)
# plt.show()

In [None]:
print(train_data.shape)
print(test_data.shape)