In [6]:
# Importing necessary libraries
import os
import sys
import time
import logging
import importlib

# Importing data handling and visualization libraries
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm
import matplotlib.pyplot as plt

# Import machine learning and deep learning libraries
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support, confusion_matrix, accuracy_score

import tensorflow as tf
from tensorflow.keras import initializers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout, LeakyReLU, Input, BatchNormalization, Reshape, Flatten, Activation

import pickle
import random

# here we setting a random seed for reproducability
random.seed(42)

# Data directory containing the KDD dataset
DATA_DIRECTORY = './data/'

# Function to normalize data the into L2-norm for easier processing
def normalize_dataset(data):
    return data / np.linalg.norm(data, axis=0)

# Loading the preprocessed KDD dataset. The data set is created into a  pickle file
data_file_path = os.path.join(DATA_DIRECTORY, 'preprocessed_data_full.pkl')
with open(data_file_path, 'rb') as file:
    network_data = pickle.load(file)

# here we are Extracting the training and testing datasets
label_encoder = network_data['le']
x_train_data = network_data['x_train']
y_train_data = network_data['y_train']
x_test_data = network_data['x_test']
y_test_data = network_data['y_test']

# Convert labels to binary for anomaly detection
y_train_data = np.where(y_train_data != 11, 1, 0)
y_test_data = np.where(y_test_data != 11, 1, 0)

# Subset only normal network packets for training
normal_data = x_train_data[y_train_data == 0]

# Scale data using MinMaxScaler
scaler = MinMaxScaler()
scaled_x_train = scaler.fit_transform(normal_data)
scaled_x_test = scaler.transform(x_test_data)

# Organize dataset into a dictionary
processed_dataset = {
    'x_train': scaled_x_train.astype(np.float32),
    'y_train': y_train_data.astype(np.float32),
    'x_test': scaled_x_test.astype(np.float32),
    'y_test': y_test_data.astype(np.float32)
}

# Defining the generator model 
def create_generator_model(optimizer):
    model = Sequential([
        Dense(64, input_dim=114, kernel_initializer=initializers.GlorotNormal(seed=42)),
        Activation('tanh'),
        Dense(128),
        Activation('tanh'),
        Dense(256),
        Activation('tanh'),
        Dense(114, activation='tanh')
    ])
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    return model

# Define the discriminator model
def create_discriminator_model(optimizer):
    model = Sequential([
        Dense(256, input_dim=114, kernel_initializer=initializers.GlorotNormal(seed=42)),
        LeakyReLU(alpha=0.2),
        Dropout(0.2),
        Dense(128),
        LeakyReLU(alpha=0.2),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    model.compile(loss='binary_crossentropy', optimizer=optimizer)
    return model

# Define GAN model
def create_gan_model(generator, discriminator, optimizer):
    discriminator.trainable = False
    gan_input = Input(shape=(114,))
    x = generator(gan_input)
    gan_output = discriminator(x)
    gan_model = Model(inputs=gan_input, outputs=gan_output)
    gan_model.compile(loss='binary_crossentropy', optimizer=optimizer)
    return gan_model

# Training parameters
LEARNING_RATE = 0.0001
BATCH_SIZE = 512
EPOCHS = 10
adam_optimizer = Adam(learning_rate=LEARNING_RATE, beta_1=0.5)

# Initialize models
generator = create_generator_model(adam_optimizer)
discriminator = create_discriminator_model(adam_optimizer)
gan_model = create_gan_model(generator, discriminator, adam_optimizer)

# Training loop
gan_losses = []
discriminator_losses = []

for epoch in range(EPOCHS):
    for batch in range(scaled_x_train.shape[0] // BATCH_SIZE):
        noise = np.random.normal(0, 1, size=(BATCH_SIZE, 114))
        fake_data = generator.predict_on_batch(noise)
        real_data = scaled_x_train[batch * BATCH_SIZE:(batch + 1) * BATCH_SIZE]
        
        X = np.vstack([fake_data, real_data])
        y_dis = np.hstack([np.zeros(BATCH_SIZE), np.ones(BATCH_SIZE)])

        discriminator.trainable = True
        d_loss = discriminator.train_on_batch(X, y_dis)

        noise = np.random.normal(0, 1, size=(BATCH_SIZE, 114))
        y_gen = np.ones(BATCH_SIZE)
        discriminator.trainable = False
        g_loss = gan_model.train_on_batch(noise, y_gen)

        discriminator_losses.append(d_loss)
        gan_losses.append(g_loss)

    print(f"Epoch {epoch+1}/{EPOCHS} [D loss: {d_loss}] [G loss: {g_loss}]")

# Plot training losses
plt.figure(figsize=(10, 5))
plt.plot(discriminator_losses, label='Discriminator Loss', color='red')
plt.plot(gan_losses, label='Generator Loss', color='blue')
plt.title('Training Losses')
plt.xlabel('Batch Count')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()

# Evaluate model on test set
results = discriminator.predict(scaled_x_test)
threshold = np.percentile(results, 1)

predictions = (results > threshold).astype(int)
accuracy = accuracy_score(processed_dataset['y_test'], predictions)
precision, recall, f1, _ = precision_recall_fscore_support(processed_dataset['y_test'], predictions, average='binary')

print(f"Accuracy is: {accuracy}")
print(f"Precision is: {precision}")
print(f"Recall is: {recall}")
print(f"F1 Score is: {f1}")

# Trying to create the confusion matrix
cm = confusion_matrix(processed_dataset['y_test'], predictions)
plt.figure(figsize=(8, 8))
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.colorbar()
plt.xticks([0, 1], ['Normal', 'Anomalous'])
plt.yticks([0, 1], ['Normal', 'Anomalous'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')

for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        plt.text(j, i, format(cm[i, j], 'd'), horizontalalignment='center')

plt.tight_layout()
plt.show()

# ROC Curve
fpr, tpr, _ = roc_curve(processed_dataset['y_test'], results)
roc_auc = auc(fpr, tpr)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc='lower right')
plt.show()


ModuleNotFoundError: No module named 'sklearn.preprocessing.label'