In [None]:
import os
import random
import datetime

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, BatchNormalization, LeakyReLU
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard
from tensorflow.keras import initializers, regularizers
import joblib

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# Single seed value shared by every random number generator in this notebook;
# it is also reused as `random_state` for the train/validation split.
RANDOM_SEED = 33

# Seed Python's `random`, NumPy and TensorFlow/Keras with the same value,
# in the same order as the individual calls they replace.
for seed_fn in (
    random.seed,
    np.random.seed,
    tf.random.set_seed,
    tf.keras.utils.set_random_seed,
):
    seed_fn(RANDOM_SEED)

In [None]:
# --- Output artefacts written by later cells ---------------------------------
model_path = 'Models/autoencoder.keras'    # best checkpoint of the autoencoder
scaler_path = 'Models/scaler.joblib'       # fitted feature scaler
threshold_file = 'Models/threshold.txt'    # reconstruction-error threshold

# --- Input data: one labelled CSV per weekday --------------------------------
monday_data_path = 'Data/CICIDS2017_labeled/monday_labeled.csv'
tuesday_data_path = 'Data/CICIDS2017_labeled/tuesday_labeled.csv'
wednesday_data_path = 'Data/CICIDS2017_labeled/wednesday_labeled.csv'
thursday_data_path = 'Data/CICIDS2017_labeled/thursday_labeled.csv'
friday_data_path = 'Data/CICIDS2017_labeled/friday_labeled.csv'

# Make sure the output directory exists; `exist_ok=True` keeps re-runs safe.
os.makedirs('Models/', exist_ok=True)

In [None]:
# Read the five daily CSVs in weekday order: Monday becomes the training frame,
# Tuesday through Friday become the four test frames.
train_df, test_df_tue, test_df_wed, test_df_thu, test_df_fri = (
    pd.read_csv(csv_path)
    for csv_path in (
        monday_data_path,
        tuesday_data_path,
        wednesday_data_path,
        thursday_data_path,
        friday_data_path,
    )
)

In [None]:
# Columns removed from the model input. `Label` is the target column; it is
# dropped from the features and kept separately as `y*`.
excluded_cols = ['Flow ID','Src IP','Src Port','Dst IP','Dst Port','Protocol','Timestamp',
                 'Out of order packets','Malformed packets','Direction guessed',
                 'Fwd URG Flags','Bwd URG Flags','URG Flag Count',
                 'TCP Bwd invalid seq',
                 'Label']


def _split_features_and_label(frame):
    """Return `(features, label)` for one day's DataFrame.

    `features` is `frame` without the columns in `excluded_cols`;
    `label` is the `Label` column of `frame`.
    """
    return frame.drop(columns=excluded_cols), frame['Label']


# Training day
X, y = _split_features_and_label(train_df)

# Test days
X_test_tue, y_test_tue = _split_features_and_label(test_df_tue)
X_test_wed, y_test_wed = _split_features_and_label(test_df_wed)
X_test_thu, y_test_thu = _split_features_and_label(test_df_thu)
X_test_fri, y_test_fri = _split_features_and_label(test_df_fri)

In [None]:
# Hold out 20% of the training day as a validation split. The split is seeded
# for reproducibility and stratified on the label column.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
)

# Report the shape of every split in a single line.
print(
    f"Training shape: {X_train.shape}, val. shape: {X_val.shape}, "
    f"tue. test shape: {X_test_tue.shape}, wed. test shape: {X_test_wed.shape}, "
    f"thu. test shape: {X_test_thu.shape}, fri. test shape: {X_test_fri.shape}"
)

In [None]:
# Learn the min/max of every feature from the training split only, so that no
# statistics from the validation or test data influence the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the same fitted scaler to every split.
(
    X_train_scaled,
    X_val_scaled,
    X_test_tue_scaled,
    X_test_wed_scaled,
    X_test_thu_scaled,
    X_test_fri_scaled,
) = (
    scaler.transform(features)
    for features in (X_train, X_val, X_test_tue, X_test_wed, X_test_thu, X_test_fri)
)

# Persist the fitted scaler next to the model.
joblib.dump(scaler, scaler_path)

In [None]:
input_dim = X_train_scaled.shape[1]  # number of input features
leaky_slope = 0.1                    # negative slope used by every LeakyReLU


def _dense_bn_leaky(tensor, units, **dense_kwargs):
    """Apply Dense(units) -> BatchNormalization -> LeakyReLU to `tensor`.

    The Dense layer uses a fresh He-normal kernel initializer; any extra
    keyword arguments are forwarded to `Dense`.
    """
    tensor = Dense(units, kernel_initializer=initializers.he_normal(), **dense_kwargs)(tensor)
    tensor = BatchNormalization()(tensor)
    return LeakyReLU(negative_slope=leaky_slope)(tensor)


input_layer = Input(shape=(input_dim, ))

# Encoder: input_dim -> 56
layer = _dense_bn_leaky(input_layer, 56)

# Bottleneck: 56 -> 8, with an L1 penalty on the layer's activations
layer = _dense_bn_leaky(layer, 8, activity_regularizer=regularizers.l1(1e-5))

# Decoder: 8 -> 56
layer = _dense_bn_leaky(layer, 56)

# Output: back to input_dim; sigmoid keeps the reconstruction in [0, 1]
output_layer = Dense(
    input_dim,
    activation='sigmoid',
    kernel_initializer=initializers.he_normal(),
)(layer)

# Assemble and compile the autoencoder with MSE reconstruction loss.
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

In [None]:
# Every callback watches the validation loss (lower is better) and logs its actions.
_val_loss_watch = dict(monitor='val_loss', mode='min', verbose=1)

# Write the model to `model_path` whenever the validation loss improves.
checkpoint_cb = ModelCheckpoint(model_path, save_best_only=True, **_val_loss_watch)

# Stop after 24 epochs without improvement and roll back to the best weights.
earlystop_cb = EarlyStopping(patience=24, restore_best_weights=True, **_val_loss_watch)

# Cut the learning rate by 10x after 14 stagnant epochs, then wait 10 epochs
# before monitoring again; never go below 1e-6.
reducelr_cb = ReduceLROnPlateau(
    factor=0.1, patience=14, cooldown=10, min_lr=1e-6, **_val_loss_watch
)

In [None]:
# Load the TensorBoard notebook extension so the `%tensorboard` magic used later is available.
%load_ext tensorboard

In [None]:
# One log directory per run, named after the start time, so runs do not overwrite each other.
run_stamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = "logs/fit/" + run_stamp

# Log training metrics, plus histograms every epoch.
tensorboard_cb = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Training configuration; input and target are the same array because the
# autoencoder learns to reconstruct its own input.
fit_options = dict(
    epochs=256,
    batch_size=512,
    validation_data=(X_val_scaled, X_val_scaled),
    shuffle=True,
    verbose=2,
    callbacks=[checkpoint_cb, earlystop_cb, reducelr_cb, tensorboard_cb],
)

history = model.fit(X_train_scaled, X_train_scaled, **fit_options)

In [None]:
# Open the TensorBoard UI on `logs/fit`, the parent directory of the per-run `log_dir`.
%tensorboard --logdir logs/fit

In [None]:
# Reload the model from `model_path`, the file ModelCheckpoint writes when
# `val_loss` improves, so that evaluation uses the persisted model.
model = load_model(model_path)

In [None]:
def compute_mse(X_true, X_pred):
    """Return the mean squared error of each row.

    Args:
        X_true: Original values.
        X_pred: Reconstructed values, broadcast-compatible with `X_true`.

    Returns:
        The mean over axis 1 of the squared element-wise difference,
        i.e. one error value per row.
    """
    residual = X_true - X_pred
    squared_residual = np.square(residual)
    return np.mean(squared_residual, axis=1)

In [None]:
# Reconstruct the validation split and measure the per-sample reconstruction error.
X_val_pred = model.predict(X_val_scaled)
val_mse = compute_mse(X_val_scaled, X_val_pred)

# Decision threshold: the 92nd percentile of the validation errors.
threshold = np.percentile(val_mse, q=92)

# Store the threshold as plain text (no trailing newline) alongside the model.
with open(threshold_file, mode="w") as threshold_fh:
    print(threshold, end="", file=threshold_fh)

In [None]:
def predict(model, X_evaluation, threshold):
    """Classify samples by comparing their reconstruction error to `threshold`.

    Args:
        model: Object whose `predict(X_evaluation)` returns the reconstruction.
        X_evaluation: Samples to evaluate.
        threshold: Largest reconstruction error still considered benign.

    Returns:
        Boolean result of `error <= threshold` per sample:
        True = BENIGN, False = ATTACK.
    """
    reconstruction_error = compute_mse(X_evaluation, model.predict(X_evaluation))
    is_benign = reconstruction_error <= threshold
    return is_benign

In [None]:
# Boolean mask from the autoencoder for Tuesday: True = BENIGN, False = ATTACK.
pred_labels_bool_tue = predict(
    model=model, X_evaluation=X_test_tue_scaled, threshold=threshold
)
# Translate the mask into string class labels.
y_pred_tue = np.where(pred_labels_bool_tue, 'BENIGN', 'ATTACK')
y_pred_tue

In [None]:
# Boolean mask from the autoencoder for Wednesday: True = BENIGN, False = ATTACK.
pred_labels_bool_wed = predict(
    model=model, X_evaluation=X_test_wed_scaled, threshold=threshold
)
# Translate the mask into string class labels.
y_pred_wed = np.where(pred_labels_bool_wed, 'BENIGN', 'ATTACK')
y_pred_wed

In [None]:
# Boolean mask from the autoencoder for Thursday: True = BENIGN, False = ATTACK.
pred_labels_bool_thu = predict(
    model=model, X_evaluation=X_test_thu_scaled, threshold=threshold
)
# Translate the mask into string class labels.
y_pred_thu = np.where(pred_labels_bool_thu, 'BENIGN', 'ATTACK')
y_pred_thu

In [None]:
# Boolean mask from the autoencoder for Friday: True = BENIGN, False = ATTACK.
pred_labels_bool_fri = predict(
    model=model, X_evaluation=X_test_fri_scaled, threshold=threshold
)
# Translate the mask into string class labels.
y_pred_fri = np.where(pred_labels_bool_fri, 'BENIGN', 'ATTACK')
y_pred_fri

In [None]:
# Text summary of the per-class metrics for Tuesday.
report_tue = classification_report(y_test_tue, y_pred_tue)
print(report_tue)

In [None]:
# Text summary of the per-class metrics for Wednesday.
report_wed = classification_report(y_test_wed, y_pred_wed)
print(report_wed)

In [None]:
# Text summary of the per-class metrics for Thursday.
report_thu = classification_report(y_test_thu, y_pred_thu)
print(report_thu)

In [None]:
# Text summary of the per-class metrics for Friday.
report_fri = classification_report(y_test_fri, y_pred_fri)
print(report_fri)

In [None]:
# Display order of the classes. The same list is passed to `confusion_matrix`
# so the matrix rows/columns are guaranteed to match the tick labels; without
# it the matrix uses sorted label order ("ATTACK" before "BENIGN").
labels = ["BENIGN", "ATTACK"]

cm = confusion_matrix(y_true=y_test_tue, y_pred=y_pred_tue, labels=labels)

# Row-normalise: share of each actual class assigned to each predicted class.
# Rows without any samples stay at 0 instead of producing NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=(row_totals != 0))

# Cell text: absolute count on the first line, row percentage on the second.
annotations = np.empty_like(cm, dtype=object)
for i, j in np.ndindex(cm.shape):
    annotations[i,j] = f"{cm[i,j]}\n({cmn[i,j]:.2%})"

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cmn, annot=annotations, fmt='s', xticklabels=labels, yticklabels=labels, cmap='Blues', vmin=0, vmax=1, square=True, annot_kws={"size": 25}, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix - Tuesday')
plt.show()

In [None]:
# `labels` (defined in the Tuesday confusion-matrix cell) fixes the row/column
# order so it matches the tick labels; without it the matrix uses sorted label
# order ("ATTACK" before "BENIGN") and the axes would be mislabelled.
cm = confusion_matrix(y_true=y_test_wed, y_pred=y_pred_wed, labels=labels)

# Row-normalise: share of each actual class assigned to each predicted class.
# Rows without any samples stay at 0 instead of producing NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=(row_totals != 0))

# Cell text: absolute count on the first line, row percentage on the second.
annotations = np.empty_like(cm, dtype=object)
for i, j in np.ndindex(cm.shape):
    annotations[i,j] = f"{cm[i,j]}\n({cmn[i,j]:.2%})"

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cmn, annot=annotations, fmt='s', xticklabels=labels, yticklabels=labels, cmap='Blues', vmin=0, vmax=1, square=True, annot_kws={"size": 25}, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix - Wednesday')
plt.show()

In [None]:
# `labels` (defined in the Tuesday confusion-matrix cell) fixes the row/column
# order so it matches the tick labels; without it the matrix uses sorted label
# order ("ATTACK" before "BENIGN") and the axes would be mislabelled.
cm = confusion_matrix(y_true=y_test_thu, y_pred=y_pred_thu, labels=labels)

# Row-normalise: share of each actual class assigned to each predicted class.
# Rows without any samples stay at 0 instead of producing NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=(row_totals != 0))

# Cell text: absolute count on the first line, row percentage on the second.
annotations = np.empty_like(cm, dtype=object)
for i, j in np.ndindex(cm.shape):
    annotations[i,j] = f"{cm[i,j]}\n({cmn[i,j]:.2%})"

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cmn, annot=annotations, fmt='s', xticklabels=labels, yticklabels=labels, cmap='Blues', vmin=0, vmax=1, square=True, annot_kws={"size": 25}, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix - Thursday')
plt.show()

In [None]:
# `labels` (defined in the Tuesday confusion-matrix cell) fixes the row/column
# order so it matches the tick labels; without it the matrix uses sorted label
# order ("ATTACK" before "BENIGN") and the axes would be mislabelled.
cm = confusion_matrix(y_true=y_test_fri, y_pred=y_pred_fri, labels=labels)

# Row-normalise: share of each actual class assigned to each predicted class.
# Rows without any samples stay at 0 instead of producing NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=(row_totals != 0))

# Cell text: absolute count on the first line, row percentage on the second.
annotations = np.empty_like(cm, dtype=object)
for i, j in np.ndindex(cm.shape):
    annotations[i,j] = f"{cm[i,j]}\n({cmn[i,j]:.2%})"

fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(cmn, annot=annotations, fmt='s', xticklabels=labels, yticklabels=labels, cmap='Blues', vmin=0, vmax=1, square=True, annot_kws={"size": 25}, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
ax.set_title('Confusion matrix - Friday')
plt.show()

In [None]:
# Marker colour per true class and the colour of the threshold line
# (both are reused by the plots for the other days).
color_map = dict(
    BENIGN=px.colors.qualitative.Vivid[1],
    ATTACK=px.colors.qualitative.Vivid[0],
)
threshold_color = 'firebrick'

# Per-sample reconstruction error on the Tuesday test set.
RE_test = compute_mse(X_test_tue_scaled, model.predict(X_test_tue_scaled))
len_RE = len(RE_test)

plot_df = pd.DataFrame({
    'Index': np.arange(len_RE),
    'RE': RE_test,
    'TrueLabel': y_test_tue.values,
})

# Dashed line style shared by the threshold shape and its legend entry.
threshold_line = dict(color=threshold_color, width=5, dash='dash')

# The threshold line extends 5% of the sample count beyond both ends of the index range.
x0, x1 = -0.05 * len_RE, len_RE * 1.05

fig = px.scatter(
    plot_df, x='Index', y='RE',
    color='TrueLabel', symbol='TrueLabel',
    color_discrete_map=color_map, template='presentation',
)

# Style the scatter markers before the legend-only line trace is added,
# so the marker settings apply to the scatter traces only.
fig.update_traces(marker=dict(line=dict(width=0), size=4, opacity=0.7))

# Horizontal threshold line drawn above the points.
fig.add_shape(
    type='line', x0=x0, y0=threshold, x1=x1, y1=threshold,
    line=threshold_line, layer='above',
)

# Trace without data points: it only provides a legend entry for the threshold.
fig.add_trace(go.Scatter(
    x=[None], y=[None], mode='lines',
    line=threshold_line, name=f'Threshold ({threshold:.5f})',
))

fig.update_layout(
    title='Reconstruction error diagram - Tuesday',
    xaxis_title='Index',
    yaxis_title='RE (MSE)',
    yaxis_type='log',
)

fig.show()

In [None]:
# Per-sample reconstruction error on the Wednesday test set.
RE_test = compute_mse(X_test_wed_scaled, model.predict(X_test_wed_scaled))
len_RE = len(RE_test)

plot_df = pd.DataFrame({
    'Index': np.arange(len_RE),
    'RE': RE_test,
    'TrueLabel': y_test_wed.values,
})

# Dashed line style shared by the threshold shape and its legend entry.
threshold_line = dict(color=threshold_color, width=5, dash='dash')

# The threshold line extends 5% of the sample count beyond both ends of the index range.
x0, x1 = -0.05 * len_RE, len_RE * 1.05

fig = px.scatter(
    plot_df, x='Index', y='RE',
    color='TrueLabel', symbol='TrueLabel',
    color_discrete_map=color_map, template='presentation',
)

# Style the scatter markers before the legend-only line trace is added,
# so the marker settings apply to the scatter traces only.
fig.update_traces(marker=dict(line=dict(width=0), size=4, opacity=0.7))

# Horizontal threshold line drawn above the points.
fig.add_shape(
    type='line', x0=x0, y0=threshold, x1=x1, y1=threshold,
    line=threshold_line, layer='above',
)

# Trace without data points: it only provides a legend entry for the threshold.
fig.add_trace(go.Scatter(
    x=[None], y=[None], mode='lines',
    line=threshold_line, name=f'Threshold ({threshold:.5f})',
))

fig.update_layout(
    title='Reconstruction error diagram - Wednesday',
    xaxis_title='Index',
    yaxis_title='RE (MSE)',
    yaxis_type='log',
)

fig.show()

In [None]:
# Per-sample reconstruction error on the Thursday test set.
RE_test = compute_mse(X_test_thu_scaled, model.predict(X_test_thu_scaled))
len_RE = len(RE_test)

plot_df = pd.DataFrame({
    'Index': np.arange(len_RE),
    'RE': RE_test,
    'TrueLabel': y_test_thu.values,
})

# Dashed line style shared by the threshold shape and its legend entry.
threshold_line = dict(color=threshold_color, width=5, dash='dash')

# The threshold line extends 5% of the sample count beyond both ends of the index range.
x0, x1 = -0.05 * len_RE, len_RE * 1.05

fig = px.scatter(
    plot_df, x='Index', y='RE',
    color='TrueLabel', symbol='TrueLabel',
    color_discrete_map=color_map, template='presentation',
)

# Style the scatter markers before the legend-only line trace is added,
# so the marker settings apply to the scatter traces only.
fig.update_traces(marker=dict(line=dict(width=0), size=4, opacity=0.7))

# Horizontal threshold line drawn above the points.
fig.add_shape(
    type='line', x0=x0, y0=threshold, x1=x1, y1=threshold,
    line=threshold_line, layer='above',
)

# Trace without data points: it only provides a legend entry for the threshold.
fig.add_trace(go.Scatter(
    x=[None], y=[None], mode='lines',
    line=threshold_line, name=f'Threshold ({threshold:.5f})',
))

fig.update_layout(
    title='Reconstruction error diagram - Thursday',
    xaxis_title='Index',
    yaxis_title='RE (MSE)',
    yaxis_type='log',
)

fig.show()

In [None]:
# Per-sample reconstruction error on the Friday test set.
RE_test = compute_mse(X_test_fri_scaled, model.predict(X_test_fri_scaled))
len_RE = len(RE_test)

plot_df = pd.DataFrame({
    'Index': np.arange(len_RE),
    'RE': RE_test,
    'TrueLabel': y_test_fri.values,
})

# Dashed line style shared by the threshold shape and its legend entry.
threshold_line = dict(color=threshold_color, width=5, dash='dash')

# The threshold line extends 5% of the sample count beyond both ends of the index range.
x0, x1 = -0.05 * len_RE, len_RE * 1.05

fig = px.scatter(
    plot_df, x='Index', y='RE',
    color='TrueLabel', symbol='TrueLabel',
    color_discrete_map=color_map, template='presentation',
)

# Style the scatter markers before the legend-only line trace is added,
# so the marker settings apply to the scatter traces only.
fig.update_traces(marker=dict(line=dict(width=0), size=4, opacity=0.7))

# Horizontal threshold line drawn above the points.
fig.add_shape(
    type='line', x0=x0, y0=threshold, x1=x1, y1=threshold,
    line=threshold_line, layer='above',
)

# Trace without data points: it only provides a legend entry for the threshold.
fig.add_trace(go.Scatter(
    x=[None], y=[None], mode='lines',
    line=threshold_line, name=f'Threshold ({threshold:.5f})',
))

fig.update_layout(
    title='Reconstruction error diagram - Friday',
    xaxis_title='Index',
    yaxis_title='RE (MSE)',
    yaxis_type='log',
)

fig.show()