# SPECTRE-CPU-V2
> Trained with **CICIDS2017**

# SETUP PRE-REQUISITES

In [9]:
import os
import platform
import sys
import glob

import tensorflow as tf
#from tensorflow.keras import layers

#import keras
from tensorflow import keras
from keras import layers
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
from keras.applications.vgg16 import VGG16
from keras.applications.vgg19 import VGG19
from keras.layers import Dense, Flatten, Input
from keras.models import Model
from keras import layers, models

import pandas as pd

#from tabulate import tabulate

import seaborn as sns # Graphing, built ontop of MatPlot for ease-of-use and nicer diagrams.
import sklearn # For Machine Learning algorithms
import scikitplot # Confusion matrix plotting
from sklearn.decomposition import PCA # For PCA dimensionality reduction technique
from sklearn.preprocessing import StandardScaler # For scaling to unit scale, before PCA application
from sklearn.preprocessing import LabelBinarizer # For converting categorical data into numeric, for modeling stage
from sklearn.model_selection import StratifiedKFold # For optimal train_test splitting, for model input data
from sklearn.model_selection import train_test_split # For basic dataset splitting
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbors ML classifier (default n. of neighbors = 5)
from scikitplot.metrics import plot_confusion_matrix # For plotting confusion matrices
from sklearn.metrics import accuracy_score # For getting the accuracy of a model's predictions
from sklearn.metrics import classification_report # Various metrics for model performance
from sklearn.neural_network import MLPClassifier # For Neural Network classifier
from sklearn.linear_model import LogisticRegression

def escape():
    sys.exit()

In [10]:
print(f"Python Platform: {platform.platform()}")
print(f"Tensor Flow Version: {tf.__version__}")
print(f"Keras Version: {tf.keras.__version__}")
print()
print(f"Python {sys.version}")

Python Platform: Linux-5.15.90.1-microsoft-standard-WSL2-x86_64-with-glibc2.29
Tensor Flow Version: 2.12.0
Keras Version: 2.12.0

Python 3.8.10 (default, Mar 13 2023, 10:26:41) 
[GCC 9.4.0]


## ENVIRONMENT SETUP

**Setup INFO level**

In [11]:
tf.get_logger().setLevel('INFO')

#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # or any {'0', '1', '2'}

**Useful environment variables**

In [12]:
# Max number of permutations to run. Can be altered for needs.
number_of_permutations = 100

# 10 folds is usually the heuristic to follow for larger datasets of around this size.
num_of_splits_for_skf = 10

# Seed value to pass into models so that repeated runs result in the same output
seed_val = 1

# Number of statistical distance measures to run (for the results, columns section)
num_of_statistical_dist_measures = 6

## Data Prepocessing

In [None]:
# 'Reduced dimensions' variable for altering the number of PCA principal components. Can be altered for needs.
# Only 7 principal components needed when using non-normalised PCA dataset.
dimensions_num_for_PCA = 7

def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df = df.replace([np.inf, -np.inf], np.nan)  # Replace np.inf and -np.inf with np.nan
    df.dropna(inplace=True)  # Drop rows containing np.nan
    return df

def get_PCA_feature_names(num_of_pca_components):
    feature_names = []
    for i in range(num_of_pca_components):    
        feature_names.append(f"Principal component {i+1}")
    return feature_names

# Renaming columns and creating a copy
df = DDoS_Kaggle.copy()
df = df.rename(columns=lambda x: x.strip().lower().replace(' ', '_').replace('(', '').replace(')', ''))
df_cleaned = clean_dataset(df).compute()

# Resetting index and removing unneeded index column
df_cleaned.reset_index(drop=True, inplace=True)

# Saving the label attribute before dropping it
df_labels = df_cleaned['label']
df_no_labels = df_cleaned.drop('label', axis=1)
df_features = df_no_labels.columns.tolist()

# Scaling the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_no_labels)
df_scaled = pd.DataFrame(data=df_scaled, columns=df_features)

# Performing PCA
dimensions_num_for_PCA = 2  # You need to define dimensions_num_for_PCA
pca = PCA(n_components=dimensions_num_for_PCA)
principal_components = pca.fit_transform(df_scaled)

# Creating a DataFrame with principal components
principal_component_headings = get_PCA_feature_names(dimensions_num_for_PCA)
df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

# Concatenating principal components and labels
df_final = pd.concat([df_pc, df_labels], axis=1)

# Applying LabelBinarizer to the labels
lb = LabelBinarizer()
df_final['label'] = lb.fit_transform(df_final['label'])

# Displaying the final DataFrame
df_final

## Training

### Dataset Splitting

**K-Fold Cross Validation and Stratified splitting**

K-Fold is a technique which splits data into K folds (splits). Train of a model K times, and for each training iteration, K-Fold selects a different fold to use for testing; the remaining K - 1 folds become the training data. Typically, the optimal K value can be derived using the size of your dataset (num of rows). Ideally, each fold should be statistically representative of the population. Too small and it won't be useful. Too large, and you lose the positives from doing K-Fold.

You can use Stratified splitting with K-Fold, which ensures balance between some criteria (balances out the classes) e.g. equal portion of label classes in each fold.

Class Imbalance is a significant issue in the ML/ Data Mining domain. It leads to incorrect results e.g. if one fold had all of 1 label (accidentally), then it would produce terrible predictive results as it wouldn't know what the other label class data point would look like. You can only work with the data you have, so this has to be dealt with.

Benefits of K-Fold:
- Use more of the data towards making a succesful model.
- Obtain K models to evaluate, can improve the confidence that you have selected an appropriate model algorithm and cleaned/ prepared the data correctly, e.g. normal split with 1 model, one doesn't know if it's good or not- it could be heavily biased. Multiple models ensures less bias and increased variance.
- Looking at the accuracy results from each of the k-Folds, you can identify data issues e.g. a certain fold performs really badly. Could this suggest that more cleaning is required? Maybe the data preparation was performed incorrectly?
- If all folds return similar accuracies, one can be more confident that a deployed model will perform similarly to how one expects.

Issues with K-Fold:
- Creating K separate models requires more computation.
- If you haven't got much data, you might not get many folds. Less folds means K-Fold loses its benefits.
- If K is very large, each fold is small, and harder to ensure statistical distribution of.
- Choosing the best of K models introduces bias. Real world data could perform better under a more general, lower performing model.

Code reference: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html

In [31]:
# Separating the label so that the answers aren't provided to the model, in training.
X = df_final.drop(['label'], axis = 1)
y = df_final['label']
y


0         0
1         0
2         0
3         0
4         0
         ..
225706    0
225707    0
225708    0
225709    0
225710    0
Name: label, Length: 225711, dtype: int64

In [32]:
X

Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7
0,-3.632317e+07,-177588.423798,-158328.262332,3.274166e+06,-644672.422205,-65463.389477,-2.488349e+06
1,-3.630598e+07,-175956.557639,-146308.360104,-6.182039e+05,-660559.724997,-66403.655703,-2.500578e+06
2,-3.630655e+07,-176002.193903,-146668.436861,-4.974704e+05,-660076.496646,-66383.896019,-2.500178e+06
3,-3.630711e+07,-176054.438478,-147043.488746,-3.752421e+05,-659575.485264,-66353.363288,-2.499773e+06
4,-3.632317e+07,-177588.424395,-158328.262578,3.274166e+06,-644672.421279,-65463.388671,-2.488349e+06
...,...,...,...,...,...,...,...
225706,-3.630640e+07,-175986.939817,-146564.360752,-5.315339e+05,-660217.323093,-66393.258881,-2.500293e+06
225707,-3.630626e+07,-175975.278900,-146473.928435,-5.616025e+05,-660338.262449,-66398.743473,-2.500392e+06
225708,-3.630622e+07,-175972.744882,-146454.001685,-5.682723e+05,-660364.983726,-66399.860056,-2.500414e+06
225709,-3.630660e+07,-176009.636226,-146737.014042,-4.781892e+05,-659998.750229,-66339.920855,-2.500141e+06


In [33]:
skf = StratifiedKFold(n_splits=num_of_splits_for_skf, shuffle=False)
skf

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

Now, splitting the data into train and test data, using the optimal splitting techniques of K-Fold and Stratified Splitting.

In [34]:
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    reshaped_y_train = np.asarray(y_train).reshape(-1, 1)
    reshaped_y_test = np.asarray(y_test).reshape(-1, 1)
    
print( 'X_train length: ', len(X_train) ) # To check if splits worked
print( 'y_train length: ', len(y_train) )
print( 'X_test length: ', len(X_test) )
print( 'y_test length: ', len(y_test) )

X_train length:  203140
y_train length:  203140
X_test length:  22571
y_test length:  22571


### Modeling

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, LeakyReLU
from keras.regularizers import l1, l2, l1_l2

# Define the ANN model

#model = Sequential([
#    Dense(128, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(1, activation='sigmoid')
#])


#model = Sequential([
#    Dense(256, activation='relu', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
#    BatchNormalization(),
#    Dropout(0.5),
#    Dense(1, activation='sigmoid')
#])

#model = Sequential([
#    Dense(512, kernel_initializer='he_normal', input_shape=(X_train.shape[1],), kernel_regularizer=l2(0.001)),
#    LeakyReLU(alpha=0.1),
#    BatchNormalization(),
#    Dropout(0.4),
#    Dense(256, kernel_initializer='he_normal', kernel_regularizer=l2(0.001)),
#    LeakyReLU(alpha=0.1),
#    BatchNormalization(),
#    Dropout(0.4),
#    Dense(128, kernel_initializer='he_normal', kernel_regularizer=l2(0.001)),
#    LeakyReLU(alpha=0.1),
#    BatchNormalization(),
#    Dropout(0.4),
#    Dense(64, kernel_initializer='he_normal', kernel_regularizer=l2(0.001)),
#    LeakyReLU(alpha=0.1),
#    BatchNormalization(),
#    Dropout(0.4),
#    Dense(32, kernel_initializer='he_normal', kernel_regularizer=l2(0.001)),
#    LeakyReLU(alpha=0.1),
#    BatchNormalization(),
#    Dropout(0.4),
#    Dense(1, activation='sigmoid')
#])

model = Sequential([
    Dense(256, kernel_initializer='he_normal', input_shape=(X_train.shape[1],), kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.2),
    Dense(128, kernel_initializer='he_normal', kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.2),
    Dense(64, kernel_initializer='he_normal', kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, kernel_initializer='he_normal', kernel_regularizer=l1_l2(l1=0.0001, l2=0.0001)),
    LeakyReLU(alpha=0.1),
    BatchNormalization(),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

2023-05-24 09:37:01.985835: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-24 09:37:02.008707: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-24 09:37:02.008913: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-24 09:37:02.010070: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-24 09:37:02.010256: I tensorflow/compile

In [37]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               2048      
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 256)               0         
                                                                 
 batch_normalization (BatchN  (None, 256)              1024      
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 128)               0         
                                                        

In [38]:
# Compile the model
#model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.RMSprop(), metrics=['accuracy'])

In [39]:
# Train the model
#model.fit(X_train, y_train, epochs=10, batch_size=16, validation_split=0.2)
model.fit(X_train, y_train, epochs=50, batch_size=16, validation_split=0.2)

Epoch 1/50


2023-05-24 09:37:07.690526: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:637] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2023-05-24 09:37:08.274363: I tensorflow/compiler/xla/service/service.cc:169] XLA service 0x7f05371d3260 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-05-24 09:37:08.274408: I tensorflow/compiler/xla/service/service.cc:177]   StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6
2023-05-24 09:37:08.304692: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-05-24 09:37:08.571354: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:424] Loaded cuDNN version 8600
2023-05-24 09:37:08.778398: I ./tensorflow/compiler/jit/device_compiler.h:180] Compiled cluster using XLA!  This line is logged at most once for the lifeti

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f0669065940>

In [40]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Test set accuracy: {:.2f}".format(accuracy))

Test set accuracy: 0.96


In [41]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Make predictionsHDF5
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

# Calculate performance metrics
conf_matrix = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1-score: {:.2f}".format(f1))

Confusion Matrix:
 [[18409  1010]
 [  789 24935]]
Precision: 0.96
Recall: 0.97
F1-score: 0.97


# EXPORT MODEL

In [42]:
# Export as SavedModel
tf.saved_model.save(model, '/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/A/SavedModel/')

# Export as Keras Model
model.save("/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/A/spectre_ddos_2_A_hd5")

# Export as Keras H5 Model
model.save("/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/A/spectre_ddos_2_A_h5.h5")

2023-05-24 11:46:15.332324: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,256]
	 [[{{node inputs}}]]
2023-05-24 11:46:15.348098: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,128]
	 [[{{node inputs}}]]
2023-05-24 11:46:15.363765: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-05-24 11:46:

INFO:tensorflow:Assets written to: /home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/A/SavedModel/assets


2023-05-24 11:46:17.193123: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,256]
	 [[{{node inputs}}]]
2023-05-24 11:46:17.209650: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,128]
	 [[{{node inputs}}]]
2023-05-24 11:46:17.224186: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'inputs' with dtype float and shape [?,64]
	 [[{{node inputs}}]]
2023-05-24 11:46:

INFO:tensorflow:Assets written to: /home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/A/spectre_ddos_2_A_hd5/assets
