# Intrusion Detection System

- In this project we will be trying to evaluate the performance of Shallow Neural Networks on the `CSE-CIC-IDS2018 on AWS` dataset
- Here we are working on a curated smaller dataset due to computational limitations
- We have created a smaller subset of the overall dataset with a total 1.2 lakhs+ records

In [1]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

### Getting to know our data

- After importing our data we will seperate feature columns to form the X and Y labels
- Then we will use `sklearn` library functions to split data into training and testing sets

In [2]:
df=pd.read_csv('./Final_Dataset.csv')
df.head()

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,21,6,14-02-2018 10:33,19,1,1,0,0,0,0,...,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FTP-BruteForce
1,21,6,14-02-2018 10:33,3,1,1,0,0,0,0,...,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FTP-BruteForce
2,21,6,14-02-2018 10:33,3,1,1,0,0,0,0,...,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FTP-BruteForce
3,21,6,14-02-2018 10:33,2,1,1,0,0,0,0,...,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FTP-BruteForce
4,21,6,14-02-2018 10:33,2,1,1,0,0,0,0,...,40,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,FTP-BruteForce


### Droping the Timestamp column

- As we will not be performing the prediction with RNN, the timestamp feature serves no purpose in the current model
- Timestamp is generally used to analyse patterns in `sequence of data` or `stream of tokens`
- Here as we are fixing ourselves to Shallow Neural Networks, we are not utilizing the `Timestamp` feature

In [3]:
data=df.drop(['Timestamp'],axis=1)
data

Unnamed: 0,Dst Port,Protocol,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,21,6,19,1,1,0,0,0,0,0.000000,...,40,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,FTP-BruteForce
1,21,6,3,1,1,0,0,0,0,0.000000,...,40,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,FTP-BruteForce
2,21,6,3,1,1,0,0,0,0,0.000000,...,40,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,FTP-BruteForce
3,21,6,2,1,1,0,0,0,0,0.000000,...,40,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,FTP-BruteForce
4,21,6,2,1,1,0,0,0,0,0.000000,...,40,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,FTP-BruteForce
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123367,3389,6,119959166,110,110,0,5610,0,0,0.000000,...,20,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,Benign
123368,3389,17,119993757,9,10,344,516,130,12,38.222222,...,8,237932.25,44112.46359,299482.0,164835.0,14800000.0,110176.4623,14900000.0,14600000.0,Benign
123369,445,6,335905,3,1,0,0,0,0,0.000000,...,20,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,Benign
123370,3389,6,2145983,8,7,1128,1581,661,0,141.000000,...,20,0.00,0.00000,0.0,0.0,0.0,0.0000,0.0,0.0,Benign


In [4]:
X=data.iloc[:,0:-1]
Y=data.iloc[:,-1].astype('category').cat.codes

In [63]:
Y[4]

10

### Cleaning Data

- Cleaning data is one of the most essential steps for proper functioning of the model
- This includes replacing NaN and Infinity values with other meaning full values
- The values used to replace these are chosen based on application to preserve data integrity
- Here we will be replacing them with `0` as we want the `inf` and `nan` values to be omitted during the calculation of weights and biases

In [5]:
x=pd.DataFrame(X.isna().sum())
x[15:]

Unnamed: 0,0
Flow Byts/s,80
Flow Pkts/s,0
Flow IAT Mean,0
Flow IAT Std,0
Flow IAT Max,0
...,...
Active Min,0
Idle Mean,0
Idle Std,0
Idle Max,0


In [6]:
x=pd.DataFrame(X.isin([np.inf]).sum())
x[15:]

Unnamed: 0,0
Flow Byts/s,50
Flow Pkts/s,130
Flow IAT Mean,0
Flow IAT Std,0
Flow IAT Max,0
...,...
Active Min,0
Idle Mean,0
Idle Std,0
Idle Max,0


In [7]:
X.fillna(value=0,inplace=True)
X.replace([np.inf],0,inplace=True)

In [8]:
x=pd.DataFrame(X.isna().sum())
x[15:]

Unnamed: 0,0
Flow Byts/s,0
Flow Pkts/s,0
Flow IAT Mean,0
Flow IAT Std,0
Flow IAT Max,0
...,...
Active Min,0
Idle Mean,0
Idle Std,0
Idle Max,0


In [9]:
x=pd.DataFrame(X.isin([np.inf]).sum())
x[15:]

Unnamed: 0,0
Flow Byts/s,0
Flow Pkts/s,0
Flow IAT Mean,0
Flow IAT Std,0
Flow IAT Max,0
...,...
Active Min,0
Idle Mean,0
Idle Std,0
Idle Max,0


### Train test split

- Now that our data is clean, we will perform the train-cv-test split
- Unlike normal applications, we divide our data into 3 parts instead of 2 parts
- This is done to ensure that the hyper parameter tuning can be kept isolated from the test data

In [10]:
X=np.array(X)
Y=np.array(Y)

In [11]:
X_train,X_,Y_train,Y_=train_test_split(X,Y,test_size=0.4,random_state=1)
X_cv,X_test,Y_cv,Y_test=train_test_split(X_,Y_,test_size=0.5,random_state=1)
print(X_train.shape,Y_train.shape)
print(X_cv.shape,Y_cv.shape)
print(X_test.shape,Y_test.shape)

(74023, 78) (74023,)
(24674, 78) (24674,)
(24675, 78) (24675,)


### Model Acrhitecture

- Dense layer with 1024 units and activation `relu`
- Dense layer with 1024 units and activation `relu`
- Dense layer with 128 units and activation `relu`
- Dense layer with 10 units and activation `softmax`

### Compile metrics

- Used metrics are:
    - Accuracy
- Optimizer used:
    - Adam
- Cost Function used:
    - SparseCategoricalCrossEntropy

In [39]:
model=tf.keras.models.Sequential([
    tf.keras.layers.Dense(units=1024,activation='relu',kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dense(units=1024,activation='relu',kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dense(units=128,activation='relu',kernel_regularizer=tf.keras.regularizers.L2(0.001)),
    tf.keras.layers.Dense(units=14,activation='linear',kernel_regularizer=tf.keras.regularizers.L2(0.001))
])

model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=['accuracy']
)

### Normalization

- Normalization is process in which we rescale our data for faster computation
- Here we will be using z-score-normalization by implementing `StandardScaler` on the train data 

In [29]:
scaler=StandardScaler()
scaler.fit_transform(X_train)
X_train_scaled=scaler.transform(X_train)
X_cv_scaled=scaler.transform(X_cv)
X_test_scaled=scaler.transform(X_test)

### Callbacks

- Callbacks can be utilized for premature termination of training
- Based on conditions, we can set the `self.model.stop_training` boolean to `True`
- The conditions can be checked based on real time data from the `logs` parameter that hold the values of loss and other metrics that were configured into the model during the compile statement

In [38]:
class myCallback(tf.keras.callbacks.Callback):
    def __init__(self):
        self.prev_loss=0
    def on_epoch_end(self,epoch,logs={}):
        if(epoch>0 and logs.get('loss')>self.prev_loss):
            print('\nLearning rate updated')
            self.model.optimizer.lr.assign(self.model.optimizer.lr.read_value()*0.1)
        if(logs.get('accuracy')>0.94):
            print('\nRequired Accuracy Met!')
            self.model.stop_training=True
        self.prev_loss=logs.get('loss')

callbacks=myCallback()

Fit data to the model

In [40]:
tf.random.set_seed(1)
model.fit(X_train_scaled,Y_train,epochs=20,callbacks=[callbacks])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x2788b2cf850>

Evaluate the metrics for each set

In [41]:
model.evaluate(X_train_scaled,Y_train)
model.evaluate(X_cv_scaled,Y_cv)
model.evaluate(X_test_scaled,Y_test)



[0.27602505683898926, 0.8900101184844971]

In [42]:
y_predict=tf.nn.softmax(model.predict(X_test_scaled))
y_predict.shape



TensorShape([24675, 14])

In [43]:
y_predict=np.argmax(y_predict,axis=1)
y_predict.shape

(24675,)

In [58]:
pd.DataFrame(Y_test[np.where(y_predict==Y_test)]).value_counts()

13    3105
7     3016
4     2963
1     2881
6     2868
8     2698
9     1443
10    1271
0      676
11     627
5      312
2       74
3       27
Name: count, dtype: int64

In [60]:
pd.DataFrame(y_predict[np.where(y_predict!=Y_test)]).value_counts()

8     1736
0      395
10     354
11     168
1       12
2       12
6       10
4        9
7        7
9        6
13       4
5        1
Name: count, dtype: int64

In [65]:
pd.DataFrame(Y_test[np.where(y_predict!=Y_test)]).value_counts()

10    1736
11     409
8      354
0      106
2       49
3       28
12      12
6        9
7        9
1        1
5        1
Name: count, dtype: int64

In [57]:
X_test_scaled.shape

(24675, 78)