# 1. Preprocessing of NSL-KDD dataset 
## The files KDDTrain+.txt and KDDTest+.txt can be downloaded from this [link](https://drive.google.com/drive/folders/1VozlSOkxCxDyhNF4osTUYr3JWnjT1yxM?usp=sharing)

---




## 1.1 Loading the dataset

In [1]:
# importing required libraries for Part 1: Data Preprocessing
import numpy as np
import pandas as pd
# importing required libraries for normalizing data
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

<img src = "https://drive.google.com/uc?id=172J1lUfNWg_PVgssue63ndJeZHewsRri">


In [2]:
# Loading the data
def Load(fileName, description = False):
  """Read an NSL-KDD file into a DataFrame with named columns.

  The file is read without a header row; the 43 column names (the
  features followed by "label" and "difficulty_level") are supplied here.

  Args:
    fileName: path or file-like object accepted by pandas.read_csv.
    description: when True, print the summary statistics of the data.

  Returns:
    The loaded pandas DataFrame.
  """
  # Manually setting up the features
  col_names = ["duration","protocol_type","service","flag","src_bytes",
    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
    "logged_in","num_compromised","root_shell","su_attempted","num_root",
    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
    "is_host_login","is_guest_login","count","srv_count","serror_rate",
    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty_level"]
  # importing dataset
  data = pd.read_csv(fileName, header=None, names=col_names)
  if description:
    # describe() only builds the summary table; inside a function it must be
    # printed explicitly, otherwise the result is silently thrown away.
    print(data.describe())
  return data

## 1.2 Dataset features

In [3]:
# Load the training file with the description flag enabled,
# then preview the first rows as the cell output.
data = Load("KDDTrain+.txt", True)
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,label,difficulty_level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


## 1.3 Normalisation

In [4]:
# Normalisation of data
def Normalisation(data):
  """Return a copy of `data` whose numeric columns are standardised.

  Each numeric column is fitted and transformed on its own with a
  scikit-learn StandardScaler; non-numeric columns are copied unchanged
  and the input frame itself is not modified.
  """
  scaled = data.copy()
  scaler = StandardScaler()
  for name in data.select_dtypes(include='number').columns:
    # The scaler expects a 2-D input, hence the single-column reshape
    column = np.array(scaled[name]).reshape(-1, 1)
    scaled[name] = scaler.fit_transform(column)
  return scaled

## 1.4 Encoding the categorical data

In [5]:
# Categorical to numeric data
def Numericalization(data, cat_col):
  """One-hot encode the given categorical columns of `data` in place.

  For every column in `cat_col`, one int64 indicator column per category is
  inserted at position 1, named "<column><category>", and the source column
  is then dropped. The same (mutated) frame is returned.
  """
  for feature in cat_col:
    one_hot = pd.get_dummies(data[feature])
    for category in one_hot.columns:
      indicator = one_hot[category].astype('int64')
      # Always inserting at position 1 leaves the categories in reverse order
      data.insert(1, str(feature) + category, indicator)
    data.drop(columns=feature, inplace=True)
  return data

## 1.5 Feature selection

In [6]:
# Feature Selection : Dropping attributes
def featureSelect(data, colToDrop):
  """Remove the columns listed in `colToDrop` from `data` in place.

  Returns the same frame object so that calls can be chained.
  """
  data.drop(columns=colToDrop, inplace=True)
  return data

## 1.6 Transforming the problem into binary classification

In [7]:
# Map the labels to two classes: 0 for "normal", 1 for every attack type
def replaceLabel(data):
  """Turn the "label" column of `data` into a binary target, in place.

  Rows labelled "normal" become 0; every row whose label is then different
  from 0 becomes 1 (intrusion). The same frame is returned.
  """
  normal_rows = data["label"].eq("normal")
  data.loc[normal_rows, "label"] = 0
  # Re-read the column: anything that is not 0 by now counts as an intrusion
  intrusion_rows = data["label"].ne(0)
  data.loc[intrusion_rows, "label"] = 1
  return data

## 1.7 Preprocessing function

In [8]:
def preProcessing(fileNameTrain, fileNameTest):
  """Run the whole preprocessing pipeline on the train and test files.

  Steps: load both files, stack them (train rows first), drop the unused
  columns, one-hot encode the categorical columns, standardise the numeric
  columns and finally binarise the label.

  Args:
    fileNameTrain: path of the training file.
    fileNameTest: path of the test file.

  Returns:
    A single DataFrame holding the processed train rows followed by the
    processed test rows.
  """
  dataTrain = Load(fileNameTrain, False)
  dataTest = Load(fileNameTest, False)
  # Row labels of both files are kept as-is by concat, so they repeat
  # in the combined frame.
  data = pd.concat([dataTrain, dataTest], axis=0)
  colToDrop = ['difficulty_level', 'num_outbound_cmds','duration', 'src_bytes', 'dst_bytes','su_attempted','num_root', 'num_file_creations', 'num_shells', 'num_access_files']
  df = featureSelect(data, colToDrop)
  cat_col = ['protocol_type','service','flag']
  df = Numericalization(df, cat_col)
  # Feed the pipeline variable forward instead of relying on `data` having
  # been mutated in place by the previous steps.
  # NOTE(review): the scaler is fitted on train and test rows together.
  df = Normalisation(df)
  # Labels are binarised last, so they are still strings while the numeric
  # columns are being standardised.
  df = replaceLabel(df)
  return df

In [9]:
# Preprocess both files into one frame (train rows first, then test rows)
data = preProcessing('KDDTrain+.txt','KDDTest+.txt')
#data


## 1.8 Splitting the dataset into training and test

In [10]:
def splitData(data, trainSize):
  """Split `data` positionally into (first `trainSize` rows, remaining rows)."""
  head, tail = data[:trainSize], data[trainSize:]
  return head, tail


In [11]:
# Rows before position 126144 form the training frame, the rest the inference frame
df, dt = splitData(data, 126144)
print("shape of training dataset ", df.shape)
print("shape of inference dataset ", dt.shape)

shape of training dataset  (126144, 114)
shape of inference dataset  (22373, 114)


# 2. Seq2Seq LSTM NIDS

## 2.1 Eliminating rows of the dataset

In [12]:
# Importing the packages
import tensorflow as tf

import tensorflow.keras as keras
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from numpy import array
from numpy import array_equal


In [13]:
def eliminateRows(data, rowsToKeep):
  """Return a copy of `data` limited to its first `rowsToKeep` rows.

  The cut is positional, so it works whatever the index labels are
  (non-zero-based, repeated, ...). Dropping by label instead removes the
  wrong rows, or raises KeyError, on such frames.

  Args:
    data: source DataFrame; it is left untouched.
    rowsToKeep: number of leading rows to keep. A value larger than the
      frame length keeps every row.

  Returns:
    A new DataFrame holding the kept rows.
  """
  # .copy() so that later in-place edits never touch the caller's frame
  return data.iloc[:rowsToKeep].copy()



```
# Ce texte est au format code
```

## 2.2 Getting the label Y

In [14]:
def LabelCol(data):
  """Separate the "label" column from the features.

  The label values are copied into a new one-column frame with a fresh
  index, and the column is then removed from `data` in place.

  Returns:
    (data, Y): the mutated feature frame and the label frame.
  """
  Y = pd.DataFrame({'label': data['label'].values})
  data.drop(columns='label', inplace=True)
  return data, Y

## 2.3 Transforming the dataset to an LSTM form

> Bloc en retrait


## Input data to LSTM: (sequences, time steps, features)
## Train data shape : (126144, 1, 113) 
## Test data shape : (22272, 1, 113)

In [15]:
# Input data to LSTM: (sequences, time steps, features), e.g. (126144, 1, 113) for the training set
import tensorflow as tf
import numpy as np
def sequences(df, n, length, features):
  """Cut the first `n` rows of `df` into consecutive windows of `length` rows.

  Returns a numpy array of shape (number of windows, length, features).
  `n` is expected to be a multiple of `length`; otherwise the last window
  is shorter and the final reshape cannot succeed.
  """
  windows = [df[start:start + length] for start in range(0, n, length)]
  return np.array(windows).reshape((len(windows), length, features))

## 2.4 One hot encoding of label Y

In [16]:
# Preparing Y for seq2seq 
def prepareY(data, Y, rowsPerSeq):
  """One-hot encode the labels and arrange them as sequences.

  Args:
    data: feature frame; only its row count is used.
    Y: labels to encode into two classes.
    rowsPerSeq: number of rows per sequence.

  Returns:
    (x2, y): two separately built arrays with the same content. No
    shifting is applied to x2.
  """
  one_hot = to_categorical([Y], num_classes=2)
  # Flatten to one encoded row per sample before cutting into sequences
  flat = one_hot.reshape(data.shape[0], 2)
  n_rows, n_classes = flat.shape
  y = sequences(flat, n_rows, rowsPerSeq, n_classes)
  x2 = sequences(flat, n_rows, rowsPerSeq, n_classes)
  return x2, y

## 2.5 Preparing data for LSTM cells

In [17]:
# Preparing training data for seq2seq LSTM
def dataSeq(df, rowsToKeep, rowsPerSeq):
  """Turn a preprocessed frame into the arrays used by the seq2seq model.

  Args:
    df: preprocessed frame that still contains the "label" column.
    rowsToKeep: number of rows retained before building sequences.
    rowsPerSeq: number of rows grouped into one sequence.

  Returns:
    (x1, x2, y): feature sequences, label sequences and target sequences.
  """
  trimmed = eliminateRows(df, rowsToKeep)
  # LabelCol strips the label column from the trimmed frame and returns it
  features, labels = LabelCol(trimmed)
  x2, y = prepareY(features, labels, rowsPerSeq)
  n_rows, n_features = features.shape
  x1 = sequences(features, n_rows, rowsPerSeq, n_features)
  return x1, x2, y

In [18]:
# Build the training arrays from the training frame: 126144 rows kept, 1 row per sequence
x1, x2, y = dataSeq(df, 126144, 1)
print(x1.shape, x2.shape, y.shape)



(126144, 1, 113) (126144, 1, 2) (126144, 1, 2)


In [19]:
# Build the test arrays from the inference frame: 22272 rows kept, 1 row per sequence
xt1, xt2, yt = dataSeq(dt, 22272, 1)
print(xt1.shape, xt2.shape, yt.shape)


(22272, 1, 113) (22272, 1, 2) (22272, 1, 2)


## 2.5 Evaluation metrics

> Bloc en retrait



## 2.6 Creating the model
<img src = "https://drive.google.com/uc?id=1T47KP3a0-14XzJV_NCuXPGokJ_TBqNlO" height = "400" width = "800" > 



In [22]:
!pip install keras-metrics
import keras_metrics

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-metrics
  Downloading keras_metrics-1.1.0-py2.py3-none-any.whl (5.6 kB)
Installing collected packages: keras-metrics
Successfully installed keras-metrics-1.1.0


In [23]:
from tensorflow import keras
from tensorflow.keras.layers import Lambda
from tensorflow.keras import backend as K
import numpy as np



# Settings read as globals by create_hard_coded_decoder_input_model.
# Time steps per encoder input sequence (also the number of decoding steps)
n_timesteps_in = 1
# Features per time step of the encoder input
n_features = 113
# Width of the decoder input vector and of the softmax output
out_features = 2
# Units in every encoder and decoder LSTM layer
numberOfLSTMunits = 256

def create_hard_coded_decoder_input_model(batch_size):
  """Build and compile the encoder-decoder LSTM classifier.

  A two-layer LSTM encoder reads the input sequence. Its final states
  initialise a two-layer LSTM decoder whose first input is a hard-coded
  constant vector, so the model takes the encoder input only.

  Reads the module-level settings n_timesteps_in, n_features,
  out_features and numberOfLSTMunits.

  Args:
    batch_size: leading dimension of the constant decoder start input.
      NOTE(review): the constant is built with this fixed size, so batches
      fed to the model presumably have to contain exactly this many rows —
      confirm.

  Returns:
    The compiled Keras Model mapping the encoder input to the softmax
    outputs of all decoding steps, concatenated along axis 1.
  """
  # Encoder: two stacked LSTMs, each exposing its final hidden and cell state
  encoder_inputs = Input(shape=(n_timesteps_in, n_features), name='encoder_inputs')
  encoder_lstm = LSTM(numberOfLSTMunits, return_sequences=True, return_state=True, name='encoder_lstm')
  encoder_outputs, state_h1, state_c1 = encoder_lstm(encoder_inputs)
  encoder_lstm2 = LSTM(numberOfLSTMunits, return_sequences=True, return_state=True, name='encoder_lstm2')
  _, state_h2, state_c2 = encoder_lstm2(encoder_outputs)

  # Decoder layers: one token in and one token out per step
  decoder_lstm = LSTM(numberOfLSTMunits, return_sequences=True, return_state=True, name='decoder_lstm')
  decoder_lstm2 = LSTM(numberOfLSTMunits, return_sequences=True, return_state=True, name='decoder_lstm2')
  decoder_dense = Dense(out_features, activation='softmax', name='decoder_dense')

  # Hard-coded first decoder input: all zeros except -1 in the first slot
  decoder_input_data = np.zeros((batch_size, 1, out_features))
  decoder_input_data[:, 0, 0] = -1
  inputs = decoder_input_data

  # Each decoder layer starts from the states of the matching encoder layer
  states1 = [state_h1, state_c1]
  states2 = [state_h2, state_c2]
  all_outputs = []
  for _ in range(n_timesteps_in):
      # Run the decoder on one time step
      outputs, dh1, dc1 = decoder_lstm(inputs, initial_state=states1)
      final, dh2, dc2 = decoder_lstm2(outputs, initial_state=states2)
      outputs = decoder_dense(final)
      # Store the current prediction (all predictions are concatenated later)
      all_outputs.append(outputs)
      # Reinject the prediction as the next input and carry the decoder
      # states forward, so a later step continues from the previous one
      # instead of restarting from the encoder states.
      inputs = outputs
      states1 = [dh1, dc1]
      states2 = [dh2, dc2]
  decoder_outputs = Lambda(lambda x: K.concatenate(x, axis=1))(all_outputs)

  # Define and compile model
  model = Model(encoder_inputs, decoder_outputs, name='model_encoder_decoder')
  model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=["accuracy", keras_metrics.precision(), keras_metrics.recall(), tf.keras.metrics.AUC()])
  return model

## 2.7 Summary of the model

In [24]:
# The same batch size is used to build the model and, later, to fit and evaluate it
batch_size = 192
model_encoder_decoder=create_hard_coded_decoder_input_model(batch_size=batch_size)
# Print the layer-by-layer overview of the model
model_encoder_decoder.summary()

Model: "model_encoder_decoder"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, 1, 113)]     0           []                               
                                                                                                  
 encoder_lstm (LSTM)            [(None, 1, 256),     378880      ['encoder_inputs[0][0]']         
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                                  
 decoder_lstm (LSTM)            [(192, 1, 256),      265216      ['encoder_lstm[0][1]',           
                                 (None, 256),                     'encoder_lst

## 2.8 Training of the model

In [26]:
import time
# Measure the wall-clock time of one training epoch
start_time = time.time()
model_encoder_decoder.fit(x1, y,
          batch_size=batch_size,
          epochs=1)
print("--- %s seconds ---" % (time.time() - start_time))

--- 82.19271516799927 seconds ---


## 2.9 Metrics for test data

In [27]:
# evaluate() yields five values here; the first one is discarded and the
# remaining four are reported as accuracy, precision, recall and AUC.
_, test_acc, test_precision, test_recall, test_auc = model_encoder_decoder.evaluate(xt1,yt, batch_size=batch_size, verbose=0)
print("Test accuracy ", test_acc, " Test precision ", test_precision, " Test recall ", test_recall, " Test AUC", test_auc)



Test accuracy  0.7928340435028076  Test precision  0.9722961187362671  Test recall  0.9897193908691406  Test AUC 0.8696174621582031


## 2.10 Saving the model

In [28]:
import tensorflow as tf
# Export the trained model in TensorFlow SavedModel format to the 'NIDS' directory
model = model_encoder_decoder
tf.saved_model.save(model,'NIDS')



# FIN DU CODE