# SPECTRE-CPU-V2
> Trained with **CICIDS2017**

# SETUP PRE-REQUISITES

In [1]:
import os
import platform
import sys
import glob

import tensorflow as tf
#from tensorflow.keras import layers

#import keras
from tensorflow import keras
from keras import layers
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.layers import Dense, Flatten, Input
from keras.models import Model
from keras import layers, models

import pandas as pd

from sklearn.decomposition import PCA # For PCA dimensionality reduction technique
from sklearn.preprocessing import StandardScaler # For scaling to unit scale, before PCA application
from sklearn.preprocessing import LabelBinarizer # For converting categorical data into numeric, for modeling stage
from sklearn.model_selection import StratifiedKFold # For optimal train_test splitting, for model input data
from sklearn.model_selection import train_test_split # For basic dataset splitting

import dask.dataframe as dd
from dask import delayed

def escape():
    """Stop execution immediately by raising ``SystemExit`` with no exit code."""
    raise SystemExit

2023-05-15 04:37:27.484433: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-15 04:37:27.620824: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-05-15 04:37:27.621516: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Report the platform and library versions this notebook is running on.
environment_report = [
    f"Python Platform: {platform.platform()}",
    f"Tensor Flow Version: {tf.__version__}",
    f"Keras Version: {tf.keras.__version__}",
    "",
    f"Python {sys.version}",
]
print("\n".join(environment_report))

Python Platform: Linux-6.3.1-zen2-1-zen-x86_64-with-glibc2.37
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.9.16 | packaged by conda-forge | (main, Feb  1 2023, 21:39:03) 
[GCC 11.3.0]


## ENVIRONMENT SETUP

**Setup INFO level**

In [3]:
# Set the level of TensorFlow's Python logger to INFO.
tf_logger = tf.get_logger()
tf_logger.setLevel('INFO')

# Disabled alternative kept for reference: environment-variable based log level.
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

**Useful environment variables**

In [4]:
# Upper bound on the number of permutations to run; tune as needed.
number_of_permutations = 100

# Fold count for StratifiedKFold (10 is the usual heuristic for a dataset of about this size).
num_of_splits_for_skf = 10

# Fixed seed handed to models so that repeated runs produce the same output.
seed_val = 1

# How many statistical distance measures are run (columns of the results section).
num_of_statistical_dist_measures = 6

# ANN TRAINING

## Dataset Import

**DDoS Dataset - Kaggle**
- Use DASK module for large dataset

In [6]:
# The dataset is large, so it is read lazily with Dask instead of eagerly with pandas.
npartitions = 10  # Adjust this value based on your available RAM

# `dd.read_csv` forwards unrecognised keyword arguments to `pandas.read_csv`, which
# rejects `npartitions` with a TypeError. Read the file in 64MB blocks first and then
# repartition explicitly to the requested partition count.
DDoS_Kaggle = dd.read_csv(
    '../../dataset/DDoS_Dataset/ddos_balanced/final_dataset.csv',
    assume_missing=True,  # treat integer columns as floats so missing values do not break dtype inference
    blocksize='64MB',
).repartition(npartitions=npartitions)

In [8]:
def _drop_non_finite_rows(partition):
    """Return ``partition`` without the rows that contain NaN or +/-infinity."""
    return partition.replace([np.inf, -np.inf], np.nan).dropna()


# Lazily clean every partition of the Dask frame. The helper lives in this cell because
# `clean_dataset` is only defined further down the notebook, so referencing it here
# raises a NameError when the notebook is run top to bottom on a fresh kernel.
df_cleaned = DDoS_Kaggle.map_partitions(_drop_non_finite_rows)
df = df_cleaned

## Data Preprocessing

In [None]:
# Number of principal components PCA should keep; tune as needed.
# Original note: only 7 components are needed when PCA runs on the non-normalised dataset.
# NOTE(review): this name is assigned again right before the PCA step later in this cell,
# so the value set here may not be the one PCA ends up using - confirm which is intended.
dimensions_num_for_PCA = 7


def clean_dataset(df):
    """Return a copy of ``df`` without rows that hold NaN or infinite values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pd.DataFrame
        New frame containing only the rows with no NaN, +inf or -inf.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    # Turn +/-inf into NaN first so that a single dropna() removes both kinds of bad rows.
    finite_or_nan = df.replace([np.inf, -np.inf], np.nan)
    return finite_or_nan.dropna()

def get_PCA_feature_names(num_of_pca_components):
    """Build the column headings for a PCA result.

    Parameters
    ----------
    num_of_pca_components : int
        Number of principal components to name.

    Returns
    -------
    list of str
        ``"Principal component 1"`` up to ``"Principal component N"``, in order.
    """
    return [
        f"Principal component {component_number}"
        for component_number in range(1, num_of_pca_components + 1)
    ]

# Work on a copy and normalise the column names:
# trimmed, lower-case, spaces -> underscores, parentheses removed.
df = DDoS_Kaggle.copy()
df = df.rename(columns=lambda x: x.strip().lower().replace(' ', '_').replace('(', '').replace(')', ''))

# `DDoS_Kaggle` is a Dask DataFrame, while `clean_dataset` asserts that it receives a
# pandas DataFrame. Calling it directly on the Dask frame fails that assertion, so apply
# it per partition and only then materialise the cleaned result with `.compute()`.
df_cleaned = df.map_partitions(clean_dataset).compute()

# Rebuild a 0..n-1 index so the later column-wise concat with the PCA frame
# (which gets a default index) lines up row by row.
df_cleaned = df_cleaned.reset_index(drop=True)

# Keep the label column aside before dropping it from the feature matrix
df_labels = df_cleaned['label']
df_no_labels = df_cleaned.drop('label', axis=1)
df_features = df_no_labels.columns.tolist()

# Scale the features with StandardScaler before applying PCA
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_no_labels)
df_scaled = pd.DataFrame(data=df_scaled, columns=df_features)

# Performing PCA
# NOTE(review): this reassignment replaces the value set at the top of this cell (7).
# The effective value (2) is kept unchanged here - confirm which one is intended and
# remove the other.
dimensions_num_for_PCA = 2
# random_state reuses the notebook-wide seed so repeated runs give the same components.
pca = PCA(n_components=dimensions_num_for_PCA, random_state=seed_val)
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame with the principal components
principal_component_headings = get_PCA_feature_names(dimensions_num_for_PCA)
df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

# Concatenating principal components and labels side by side
df_final = pd.concat([df_pc, df_labels], axis=1)

# Encode the labels numerically with LabelBinarizer
lb = LabelBinarizer()
df_final['label'] = lb.fit_transform(df_final['label'])

# Displaying the final DataFrame
df_final

## Training

### Dataset Splitting

**K-Fold Cross Validation and Stratified splitting**

Code reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [None]:
# Separating the label so that the answers aren't provided to the model, in training.
X = df_final.drop(['label'], axis = 1)
y = df_final['label']
y

In [None]:
# Display the feature matrix (every column of df_final except the label).
X

In [None]:
# Stratified K-fold splitter using the configured number of folds; shuffling is disabled.
skf = StratifiedKFold(
    n_splits=num_of_splits_for_skf,
    shuffle=False,
)
skf

Next, split the data into training and test sets using stratified K-fold cross-validation.

In [None]:
# Walk through every stratified fold. Each pass overwrites the previous split, so only
# the final fold's train/test partitions remain once the loop has finished.
for train_index, test_index in skf.split(X, y):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Targets reshaped into a single column
    reshaped_y_train = np.asarray(y_train).reshape(-1, 1)
    reshaped_y_test = np.asarray(y_test).reshape(-1, 1)

# To check if splits worked: report the sizes of the split left over from the loop.
for split_name, split_data in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print(f'{split_name} length: ', len(split_data))

### Modeling

In [None]:
# Hold out 20% of the rows as the test set.
# - `stratify=y` keeps the class proportions the same in the train and test partitions,
#   in line with the stratified splitting this section describes.
# - `random_state=seed_val` reuses the notebook-wide seed instead of a separate magic number.
# Note: this replaces the X_train/X_test/y_train/y_test left over from the K-fold loop.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=seed_val,
    stratify=y,
)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from keras.regularizers import l1, l2, l1_l2

# Define the ANN model
#
# Earlier experiments, replaced by the configuration below:
#   1. Dense 128-64-32, ReLU, L2(0.001), BatchNormalization, Dropout(0.5)
#   2. Dense 256-128-64-32, ReLU, L2(0.001), BatchNormalization, Dropout(0.5)
#   3. Dense 512-256-128-64-32, he_normal init, LeakyReLU(0.1), L2(0.001),
#      BatchNormalization, Dropout(0.4)
# All of them ended in a single sigmoid output unit, like the current model.

# Seed the Python, NumPy and TensorFlow random generators with the notebook-wide seed so
# that weight initialisation and dropout are repeatable from run to run.
tf.keras.utils.set_random_seed(seed_val)

# Width of each hidden block, from the input side to the output side.
hidden_layer_units = (256, 128, 64, 32)

model = Sequential()
for layer_position, units in enumerate(hidden_layer_units):
    # Only the first layer declares the input width (one value per column of X_train).
    first_layer_kwargs = {'input_shape': (X_train.shape[1],)} if layer_position == 0 else {}
    model.add(Dense(
        units,
        kernel_initializer='he_normal',
        kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001),
        **first_layer_kwargs,
    ))
    model.add(LeakyReLU(alpha=0.1))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))

# Output layer: one sigmoid unit.
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Print Keras' summary of the model.
model.summary()

In [None]:
# Compile the model: binary cross-entropy loss, RMSprop with its default arguments,
# and accuracy as the reported metric.
# (An earlier variant used optimizer='adam' with the same loss and metric.)
model.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=keras.losses.binary_crossentropy,
    metrics=['accuracy'],
)

In [None]:
# Train the model for 20 epochs with batches of 16 samples, holding out 20% of the
# training data for validation. (An earlier variant trained for 10 epochs.)
model.fit(
    X_train,
    y_train,
    epochs=20,
    batch_size=16,
    validation_split=0.2,
)

In [None]:
# Score the trained model on the test split and report its accuracy to two decimals.
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print(f"Test set accuracy: {accuracy:.2f}")

In [None]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Predict on the test split and threshold the model outputs at 0.5 to get 0/1 predictions.
y_pred = (model.predict(X_test) > 0.5).astype(int)

# Compare the predictions with the true test labels.
conf_matrix = confusion_matrix(y_test, y_pred)
precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:\n", conf_matrix)
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-score: {f1:.2f}")

# EXPORT MODEL

In [None]:
# Root folder for the exported artefacts. Set the SPECTRE_MODEL_DIR environment variable
# to export somewhere else; the default keeps the original hard-coded location.
model_export_root = os.environ.get(
    'SPECTRE_MODEL_DIR',
    '/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2',
)

# NOTE(review): the SavedModel goes under B/ while both Keras exports go under A/ with
# "_B_" in their names - confirm this layout is intended.
saved_model_dir = os.path.join(model_export_root, 'B', 'SavedModel', '')  # trailing '' keeps the final slash
keras_model_dir = os.path.join(model_export_root, 'A', 'spectre_ddos_2_B_hd5')
keras_h5_file = os.path.join(model_export_root, 'A', 'spectre_ddos_2_B_h5.h5')

# Create the target folders up front so the exports do not fail on a missing directory.
os.makedirs(saved_model_dir, exist_ok=True)
os.makedirs(os.path.dirname(keras_h5_file), exist_ok=True)

# Export as SavedModel
tf.saved_model.save(model, saved_model_dir)

# Export as Keras Model
model.save(keras_model_dir)

# Export as Keras H5 Model
model.save(keras_h5_file)