In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, auc, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

import utils

# global variable
TRAIN_TXT_PATH = "./data/KDDTrain+.txt"
METADATA_PATH = "./data/KDDTrain+.arff"
TEST_PATH = "./data/KDDTest+.txt"
TEST_EXC_21_PATH = "./data/KDDTest-21.txt"
SEED = 111
LABEL = "class"
EPOCHS = 5 # no. of iteration over data


In [4]:
# load and process data
df = pd.read_csv(TRAIN_TXT_PATH)

def pre_pre_process_data(df):
    df.columns = utils.get_col_names(METADATA_PATH)
    df = utils.convert_label_to_binary(df, LABEL)
    df = utils.get_numeric_cols(df)
    return df

df = pre_pre_process_data(df)
df



Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,level
0,0,146,0,0,0,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,0,15
1,0,0,0,0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,1,19
2,0,232,8153,0,0,0,0,0,1,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,0,21
3,0,199,420,0,0,0,0,0,1,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0,21
4,0,0,0,0,0,0,0,0,0,0,...,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.00,1,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125967,0,0,0,0,0,0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,1,20
125968,8,105,145,0,0,0,0,0,0,0,...,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,0,21
125969,0,2231,384,0,0,0,0,0,1,0,...,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,0,18
125970,0,0,0,0,0,0,0,0,0,0,...,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,1,20


In [5]:
# X-y split (features & label) - avoid data-leakage
X = df.drop(columns=["class", "level"], axis=1)
FEATURES = X.columns

y = df[LABEL]

# train-val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=SEED)
print(f"{X_train.shape=} {y_train.shape=} {X_val.shape=} {y_val.shape=}")
display(X_train)
display(y_train)


X_train.shape=(88180, 38) y_train.shape=(88180,) X_val.shape=(37792, 38) y_val.shape=(37792,)


Unnamed: 0,duration,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,logged_in,num_compromised,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
118060,0,1032,0,0,0,0,0,0,0,0,...,255,255,1.00,0.00,1.00,0.00,0.0,0.0,0.0,0.0
437,0,222,310,0,0,0,0,0,1,0,...,170,255,1.00,0.00,0.01,0.01,0.0,0.0,0.0,0.0
95153,0,293,1680,0,0,0,0,0,1,0,...,255,255,1.00,0.00,0.01,0.00,0.0,0.0,0.0,0.0
83297,0,0,0,0,0,0,0,0,0,0,...,255,4,0.02,0.05,0.00,0.00,1.0,1.0,0.0,0.0
85083,0,313,259,0,0,0,0,0,1,0,...,129,255,1.00,0.00,0.01,0.02,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105747,3122,146,105,0,0,0,0,0,0,0,...,255,1,0.00,0.66,0.96,0.00,0.0,0.0,0.0,0.0
102486,0,225,5304,0,0,0,0,0,1,0,...,38,255,1.00,0.00,0.03,0.01,0.0,0.0,0.0,0.0
4820,0,226,1110,0,0,0,0,0,1,0,...,41,255,1.00,0.00,0.02,0.07,0.0,0.0,0.0,0.0
10196,0,8766,0,0,0,0,0,0,1,0,...,133,48,0.32,0.04,0.32,0.04,0.0,0.0,0.0,0.0


118060    1
437       0
95153     0
83297     1
85083     0
         ..
105747    0
102486    0
4820      0
10196     0
77652     1
Name: class, Length: 88180, dtype: int64

In [6]:
# Reshape input data to add timestep dimension
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_val = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))

In [7]:
model = tf.keras.Sequential([
    LSTM(units=64, activation='relu', input_shape=(1, X_train.shape[2]), 
         kernel_regularizer=tf.keras.regularizers.L1L2(l1=1e-5, l2=1e-4), 
         bias_regularizer=tf.keras.regularizers.L2(1e-4),
         activity_regularizer=tf.keras.regularizers.L2(1e-5),
         return_sequences=True),
    Dropout(0.4),
    LSTM(units=128, activation='relu', 
         kernel_regularizer=tf.keras.regularizers.L1L2(l1=1e-5, l2=1e-4), 
         bias_regularizer=tf.keras.regularizers.L2(1e-4),
         activity_regularizer=tf.keras.regularizers.L2(1e-5),
         return_sequences=True),
    Dropout(0.4),
    LSTM(units=512, activation='relu', 
         kernel_regularizer=tf.keras.regularizers.L1L2(l1=1e-5, l2=1e-4), 
         bias_regularizer=tf.keras.regularizers.L2(1e-4),
         activity_regularizer=tf.keras.regularizers.L2(1e-5),
         return_sequences=False),
    Dropout(0.4),
    Dense(units=128, activation='relu', 
          kernel_regularizer=tf.keras.regularizers.L1L2(l1=1e-5, l2=1e-4), 
          bias_regularizer=tf.keras.regularizers.L2(1e-4),
          activity_regularizer=tf.keras.regularizers.L2(1e-5)),
    Dropout(0.4),
    Dense(units=1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=EPOCHS, verbose=1)


Epoch 1/5


  super().__init__(**kwargs)
  output, from_logits = _get_logits(


[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 12ms/step - accuracy: 0.8907 - loss: 6974552.5000 - val_accuracy: 0.9609 - val_loss: 0.2257
Epoch 2/5
[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 12ms/step - accuracy: 0.9523 - loss: 9.7498 - val_accuracy: 0.9599 - val_loss: 0.2073
Epoch 3/5
[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 11ms/step - accuracy: 0.9539 - loss: 0.2270 - val_accuracy: 0.9607 - val_loss: 0.2093
Epoch 4/5
[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 11ms/step - accuracy: 0.9502 - loss: 53.6181 - val_accuracy: 0.9576 - val_loss: 0.2017
Epoch 5/5
[1m2756/2756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 11ms/step - accuracy: 0.9553 - loss: 0.2214 - val_accuracy: 0.9552 - val_loss: 0.2108


In [8]:
# load test data
df_test = pd.read_csv(TEST_PATH)
display(df_test)
df_test = pre_pre_process_data(df_test)

X_test = df_test[FEATURES]
y_test = df_test[LABEL]

# Reshape input data to add timestep dimension
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

y_test_pred = model.predict(X_test)
y_test_pred = list(map(lambda x:1 if x>=0.5 else 0, y_test_pred[:, 0]))


Unnamed: 0,0,tcp,private,REJ,0.1,0.2,0.3,0.4,0.5,0.6,...,0.04.1,0.06.1,0.00.3,0.00.4,0.00.5,0.00.6,1.00.2,1.00.3,neptune,21
0,0,tcp,private,REJ,0,0,0,0,0,0,...,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,neptune,21
1,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,normal,21
2,0,icmp,eco_i,SF,20,0,0,0,0,0,...,1.00,0.00,1.00,0.28,0.00,0.0,0.00,0.00,saint,15
3,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,0.31,0.17,0.03,0.02,0.00,0.0,0.83,0.71,mscan,11
4,0,tcp,http,SF,267,14515,0,0,0,0,...,1.00,0.00,0.01,0.03,0.01,0.0,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22539,0,tcp,http,SF,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,normal,21
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,back,15
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,0.99,0.01,0.00,0.00,0.00,0.0,0.00,0.00,normal,21


[1m705/705[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [12]:
print("Neural-Network (LSTM) MODEL:")
print("AUC: ", roc_auc_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred, digits=4), end="---\n")


Neural-Network (LSTM) MODEL:
AUC:  0.7615625195005997
[[9063  648]
 [5263 7569]]
              precision    recall  f1-score   support

           0     0.6326    0.9333    0.7541      9711
           1     0.9211    0.5899    0.7192     12832

    accuracy                         0.7378     22543
   macro avg     0.7769    0.7616    0.7366     22543
weighted avg     0.7969    0.7378    0.7342     22543
---
