# SPECTRE TEST
---

## Import Dependencies

In [138]:
import sys
from scapy.all import *
import h5py
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd
from tabulate import tabulate

import seaborn as sns # Graphing, built ontop of MatPlot for ease-of-use and nicer diagrams.
import sklearn # For Machine Learning algorithms
import scikitplot # Confusion matrix plotting
from sklearn.decomposition import PCA # For PCA dimensionality reduction technique
from sklearn.preprocessing import StandardScaler # For scaling to unit scale, before PCA application
from sklearn.preprocessing import LabelBinarizer # For converting categorical data into numeric, for modeling stage
from scikitplot.metrics import plot_confusion_matrix # For plotting confusion matrices
from sklearn.metrics import accuracy_score # For getting the accuracy of a model's predictions
from sklearn.metrics import classification_report # Various metrics for model performance
from sklearn.model_selection import StratifiedKFold # For optimal train_test splitting, for model input data


In [139]:
# KAFKA IMPORT

## Methods

In [140]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep]

In [141]:
def get_PCA_feature_names(num_of_pca_components):
    feature_names = []
    for i in range(num_of_pca_components):    
        feature_names.append(f"Principal component {i+1}")
    return feature_names

**Useful environment variables**

In [142]:
# 'Reduced dimensions' variable for altering the number of PCA principal components. Can be altered for needs.
# Only 7 principal components needed when using non-normalised PCA dataset.
dimensions_num_for_PCA = 7

# Max number of permutations to run. Can be altered for needs.
number_of_permutations = 100

# 10 folds is usually the heuristic to follow for larger datasets of around this size.
num_of_splits_for_skf = 10

# Seed value to pass into models so that repeated runs result in the same output
seed_val = 1

# Number of statistical distance measures to run (for the results, columns section)
num_of_statistical_dist_measures = 6

## Load Model

In [143]:
# Load TF Saved_model
#spectre_model = tf.saved_model.load("/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS/SavedModel")

In [144]:
spectre_model_2 = tf.saved_model.load("/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS_2/SavedModel")

#### Load tflite Model

In [145]:
# tflite Model
# Load the TFLite model in TFLite Interpreter
#interpreter = tf.lite.Interpreter('/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS/spectre_ddos_lite.tflite')
#interpreter.allocate_tensors()

# Get input and output tensors.
#input_details = interpreter.get_input_details()
#output_details = interpreter.get_output_details()

In [146]:
#try:
    # Load the H5 model
#    with h5py.File('/home/aryn/spectre-dev/spectre-code/spectre-ann/Model/DDOS/FYP_Finalh5.h5', 'r') as spectre_model:
        # Print metadata
#        print("H5 file metadata:")
#        print("==================")
#        for key, value in spectre_model.attrs.items():
#            print(f"{key}: {value}")
#        print("==================")
#        print("Done!")
#except:
#    print("Error loading H5 file.")

# Bening Data Test
---

## Import Data

### Load the csv file

#### Import Data

In [147]:
Normal_Data = pd.read_csv('../../dataset/CICIDS2017/MachineLearningCSV/MachineLearningCVE/Monday-WorkingHours.pcap_ISCX.csv')
df = Normal_Data.copy()

#### Prepare the data for the model

In [148]:
# Renaming columns and creating a copy
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
df_cleaned = df.copy()
df_cleaned = clean_dataset(df_cleaned)

# Resetting index and removing unneeded index column
df_cleaned = df_cleaned.reset_index()
df_cleaned.drop('index', axis=1, inplace=True)

# Saving the label attribute before dropping it
df_labels = df_cleaned['label']
df_no_labels = df_cleaned.drop('label', axis=1, inplace=False)
df_features = df_no_labels.columns.tolist()

# Scaling the data
df_scaled = StandardScaler().fit_transform(df_no_labels)
df_scaled = pd.DataFrame(data=df_scaled, columns=df_features)

# Performing PCA
pca = PCA(n_components=dimensions_num_for_PCA)
principal_components = pca.fit(df_no_labels).transform(df_no_labels)

# Creating a DataFrame with principal components
principal_component_headings = get_PCA_feature_names(dimensions_num_for_PCA)
df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

# Concatenating principal components and labels
df_final = pd.concat([df_pc, df_labels], axis=1)

# Applying LabelBinarizer to the labels
lb = LabelBinarizer()
df_final['label'] = lb.fit_transform(df_final['label'])

# Displaying the final DataFrame
df_final

  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7,label
0,-1.983320e+07,9.251362e+05,1.046049e+06,-128074.061738,-869081.641179,-9328.258030,10210.389224,0
1,-1.994976e+07,1.872796e+06,1.000291e+07,-124409.821468,-846672.396250,-10321.565566,10014.928220,0
2,-1.994976e+07,1.872796e+06,1.000291e+07,-124409.821468,-846672.396250,-10321.565566,10014.928220,0
3,-1.994976e+07,1.872796e+06,1.000291e+07,-124409.821468,-846672.396250,-10321.565566,10014.928220,0
4,-1.984615e+07,1.030431e+06,2.041257e+06,-127666.256450,-866593.042249,-9438.611625,10188.446052,0
...,...,...,...,...,...,...,...,...
529476,-1.978012e+07,6.112687e+05,-1.939024e+06,-133315.868223,-855223.925380,-9350.481582,16850.957232,0
529477,-1.974771e+07,6.161432e+05,-1.935949e+06,-128750.407703,-841842.148056,-9094.307239,10005.865241,0
529478,-1.980753e+07,7.185561e+05,-9.063745e+05,-128879.080536,-873765.673193,-9182.189422,10266.382651,0
529479,-1.981276e+07,7.612775e+05,-5.024661e+05,-128678.971810,-872603.283762,-9206.582296,10228.529420,0


In [149]:
X = df_final.drop(['label'], axis = 1)
y = df_final['label']

In [150]:
y # Label

0         0
1         0
2         0
3         0
4         0
         ..
529476    0
529477    0
529478    0
529479    0
529480    0
Name: label, Length: 529481, dtype: int64

In [151]:
X #Features

Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7
0,-1.983320e+07,9.251362e+05,1.046049e+06,-128074.061738,-869081.641179,-9328.258030,10210.389224
1,-1.994976e+07,1.872796e+06,1.000291e+07,-124409.821468,-846672.396250,-10321.565566,10014.928220
2,-1.994976e+07,1.872796e+06,1.000291e+07,-124409.821468,-846672.396250,-10321.565566,10014.928220
3,-1.994976e+07,1.872796e+06,1.000291e+07,-124409.821468,-846672.396250,-10321.565566,10014.928220
4,-1.984615e+07,1.030431e+06,2.041257e+06,-127666.256450,-866593.042249,-9438.611625,10188.446052
...,...,...,...,...,...,...,...
529476,-1.978012e+07,6.112687e+05,-1.939024e+06,-133315.868223,-855223.925380,-9350.481582,16850.957232
529477,-1.974771e+07,6.161432e+05,-1.935949e+06,-128750.407703,-841842.148056,-9094.307239,10005.865241
529478,-1.980753e+07,7.185561e+05,-9.063745e+05,-128879.080536,-873765.673193,-9182.189422,10266.382651
529479,-1.981276e+07,7.612775e+05,-5.024661e+05,-128678.971810,-872603.283762,-9206.582296,10228.529420


## Predection

### Model Prediction

### Use the model to predict anomalies

In [152]:
#model = spectre_model

In [153]:
#predictions = spectre_model_2_h5.predict(training_sample)

In [154]:
infer = spectre_model_2.signatures['serving_default']

In [155]:
# Prepare input data
input_name = list(infer.structured_input_signature[1].keys())[0]
input_data = {input_name: tf.convert_to_tensor(X, dtype=tf.float32)}

In [156]:
# Get the output name
output_name = list(infer.structured_outputs.keys())[0]

In [157]:
# Perform the inference
predictions = infer(**input_data)

In [158]:
# Calculate the detection rate
predicted_labels = tf.argmax(predictions[output_name], axis=1).numpy()
detection_rate_bening = accuracy_score(y, predicted_labels)

### Results

In [159]:
print("Detection rate:", detection_rate_bening)

Detection rate: 1.0



# Mixed Data Test
---

## Import Data

### Load the csv file

#### Import Data

In [160]:
DDoS_Data = pd.read_csv('../../dataset/CICIDS2017/MachineLearningCSV/MachineLearningCVE/Wednesday-workingHours.pcap_ISCX.csv')
df = DDoS_Data.copy()

#### Prepare the data for the model

In [161]:
# Renaming columns and creating a copy
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
df_cleaned = df.copy()
df_cleaned = clean_dataset(df_cleaned)

# Resetting index and removing unneeded index column
df_cleaned = df_cleaned.reset_index()
df_cleaned.drop('index', axis=1, inplace=True)

# Saving the label attribute before dropping it
df_labels = df_cleaned['label']
df_no_labels = df_cleaned.drop('label', axis=1, inplace=False)
df_features = df_no_labels.columns.tolist()

# Scaling the data
df_scaled = StandardScaler().fit_transform(df_no_labels)
df_scaled = pd.DataFrame(data=df_scaled, columns=df_features)

# Performing PCA
pca = PCA(n_components=dimensions_num_for_PCA)
principal_components = pca.fit(df_no_labels).transform(df_no_labels)

# Creating a DataFrame with principal components
principal_component_headings = get_PCA_feature_names(dimensions_num_for_PCA)
df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

# Concatenating principal components and labels
df_final = pd.concat([df_pc, df_labels], axis=1)

# Applying LabelBinarizer to the labels
lb = LabelBinarizer()
df_final['label'] = lb.fit_transform(df_final['label'])

# Displaying the final DataFrame
df_final

  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7,label
0,-6.719272e+07,-2.929021e+06,-2.424908e+06,-3.683649e+06,-561111.450909,-208436.017784,-103218.535400,1
1,-6.723308e+07,-2.938733e+06,-1.385765e+06,-3.675577e+06,-571210.298004,-221106.058894,-138282.114033,1
2,-6.727888e+07,-2.991829e+06,3.326729e+06,-3.580775e+06,-572868.113070,-218961.579517,-138358.400024,1
3,-6.719725e+07,-2.922039e+06,-1.759606e+06,-3.683482e+06,-583282.686918,-213177.367255,-138286.979942,1
4,-6.727859e+07,-2.992425e+06,3.344391e+06,-3.580586e+06,-573737.024023,-219025.394020,-138508.264032,1
...,...,...,...,...,...,...,...,...
691401,-6.717165e+07,-2.930343e+06,-2.416930e+06,-3.676398e+06,-566880.587954,-198734.217630,-133002.753024,1
691402,-6.723704e+07,-2.942852e+06,-1.049073e+06,-3.668821e+06,-570693.215727,-220903.867635,-138167.009269,1
691403,-6.722712e+07,-2.933071e+06,-1.974014e+06,-3.687865e+06,-572080.499889,-221436.159363,-138444.078091,1
691404,-6.560133e+07,-2.975964e+06,-2.423091e+06,-2.989848e+06,-423257.878489,502947.163425,-6804.331807,1


In [162]:
X = df_final.drop(['label'], axis = 1)
y = df_final['label']

In [163]:
y # Label

0         1
1         1
2         1
3         1
4         1
         ..
691401    1
691402    1
691403    1
691404    1
691405    1
Name: label, Length: 691406, dtype: int64

In [164]:
X #Features

Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7
0,-6.719272e+07,-2.929021e+06,-2.424908e+06,-3.683649e+06,-561111.450909,-208436.017784,-103218.535400
1,-6.723308e+07,-2.938733e+06,-1.385765e+06,-3.675577e+06,-571210.298004,-221106.058894,-138282.114033
2,-6.727888e+07,-2.991829e+06,3.326729e+06,-3.580775e+06,-572868.113070,-218961.579517,-138358.400024
3,-6.719725e+07,-2.922039e+06,-1.759606e+06,-3.683482e+06,-583282.686918,-213177.367255,-138286.979942
4,-6.727859e+07,-2.992425e+06,3.344391e+06,-3.580586e+06,-573737.024023,-219025.394020,-138508.264032
...,...,...,...,...,...,...,...
691401,-6.717165e+07,-2.930343e+06,-2.416930e+06,-3.676398e+06,-566880.587954,-198734.217630,-133002.753024
691402,-6.723704e+07,-2.942852e+06,-1.049073e+06,-3.668821e+06,-570693.215727,-220903.867635,-138167.009269
691403,-6.722712e+07,-2.933071e+06,-1.974014e+06,-3.687865e+06,-572080.499889,-221436.159363,-138444.078091
691404,-6.560133e+07,-2.975964e+06,-2.423091e+06,-2.989848e+06,-423257.878489,502947.163425,-6804.331807


## Predection

### Model Prediction

### Use the model to predict anomalies

In [165]:
#model = spectre_model

In [166]:
#predictions = spectre_model_2_h5.predict(training_sample)

In [167]:
infer = spectre_model_2.signatures['serving_default']

In [168]:
# Prepare input data
input_name = list(infer.structured_input_signature[1].keys())[0]
input_data = {input_name: tf.convert_to_tensor(X, dtype=tf.float32)}

In [169]:
# Get the output name
output_name = list(infer.structured_outputs.keys())[0]

In [170]:
# Perform the inference
predictions = infer(**input_data)

In [171]:
# Calculate the detection rate
predicted_labels = tf.argmax(predictions[output_name], axis=1).numpy()
detection_rate_mixed = accuracy_score(y, predicted_labels)

### Results

In [172]:
print("Detection rate:", detection_rate_mixed)

Detection rate: 0.3640740751454283


---


# Training Data Test
---

## Import Data

### Load the csv file

#### Import Data

In [173]:
DDoS_Data = pd.read_csv('../../dataset/CICIDS2017/MachineLearningCSV/MachineLearningCVE/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
df = DDoS_Data.copy()

#### Prepare the data for the model

In [174]:
# Renaming columns and creating a copy
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
df_cleaned = df.copy()
df_cleaned = clean_dataset(df_cleaned)

# Resetting index and removing unneeded index column
df_cleaned = df_cleaned.reset_index()
df_cleaned.drop('index', axis=1, inplace=True)

# Saving the label attribute before dropping it
df_labels = df_cleaned['label']
df_no_labels = df_cleaned.drop('label', axis=1, inplace=False)
df_features = df_no_labels.columns.tolist()

# Scaling the data
df_scaled = StandardScaler().fit_transform(df_no_labels)
df_scaled = pd.DataFrame(data=df_scaled, columns=df_features)

# Performing PCA
pca = PCA(n_components=dimensions_num_for_PCA)
principal_components = pca.fit(df_no_labels).transform(df_no_labels)

# Creating a DataFrame with principal components
principal_component_headings = get_PCA_feature_names(dimensions_num_for_PCA)
df_pc = pd.DataFrame(data=principal_components, columns=principal_component_headings)

# Concatenating principal components and labels
df_final = pd.concat([df_pc, df_labels], axis=1)

# Applying LabelBinarizer to the labels
lb = LabelBinarizer()
df_final['label'] = lb.fit_transform(df_final['label'])

# Displaying the final DataFrame
df_final

  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')
  indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)


Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7,label
0,-3.632317e+07,-177588.423798,-158328.262332,3.274166e+06,-644672.422205,-65463.389477,-2.488349e+06,0
1,-3.630598e+07,-175956.557639,-146308.360104,-6.182039e+05,-660559.724997,-66403.655703,-2.500578e+06,0
2,-3.630655e+07,-176002.193903,-146668.436861,-4.974704e+05,-660076.496646,-66383.896019,-2.500178e+06,0
3,-3.630711e+07,-176054.438478,-147043.488746,-3.752421e+05,-659575.485264,-66353.363288,-2.499773e+06,0
4,-3.632317e+07,-177588.424395,-158328.262578,3.274166e+06,-644672.421279,-65463.388671,-2.488349e+06,0
...,...,...,...,...,...,...,...,...
225706,-3.630640e+07,-175986.939817,-146564.360752,-5.315339e+05,-660217.323093,-66393.258881,-2.500293e+06,0
225707,-3.630626e+07,-175975.278900,-146473.928435,-5.616025e+05,-660338.262449,-66398.743473,-2.500392e+06,0
225708,-3.630622e+07,-175972.744882,-146454.001685,-5.682723e+05,-660364.983726,-66399.860056,-2.500414e+06,0
225709,-3.630660e+07,-176009.636226,-146737.014042,-4.781892e+05,-659998.750229,-66339.920855,-2.500141e+06,0


In [175]:
X = df_final.drop(['label'], axis = 1)
y = df_final['label']

In [176]:
y # Label

0         0
1         0
2         0
3         0
4         0
         ..
225706    0
225707    0
225708    0
225709    0
225710    0
Name: label, Length: 225711, dtype: int64

In [177]:
X #Features

Unnamed: 0,Principal component 1,Principal component 2,Principal component 3,Principal component 4,Principal component 5,Principal component 6,Principal component 7
0,-3.632317e+07,-177588.423798,-158328.262332,3.274166e+06,-644672.422205,-65463.389477,-2.488349e+06
1,-3.630598e+07,-175956.557639,-146308.360104,-6.182039e+05,-660559.724997,-66403.655703,-2.500578e+06
2,-3.630655e+07,-176002.193903,-146668.436861,-4.974704e+05,-660076.496646,-66383.896019,-2.500178e+06
3,-3.630711e+07,-176054.438478,-147043.488746,-3.752421e+05,-659575.485264,-66353.363288,-2.499773e+06
4,-3.632317e+07,-177588.424395,-158328.262578,3.274166e+06,-644672.421279,-65463.388671,-2.488349e+06
...,...,...,...,...,...,...,...
225706,-3.630640e+07,-175986.939817,-146564.360752,-5.315339e+05,-660217.323093,-66393.258881,-2.500293e+06
225707,-3.630626e+07,-175975.278900,-146473.928435,-5.616025e+05,-660338.262449,-66398.743473,-2.500392e+06
225708,-3.630622e+07,-175972.744882,-146454.001685,-5.682723e+05,-660364.983726,-66399.860056,-2.500414e+06
225709,-3.630660e+07,-176009.636226,-146737.014042,-4.781892e+05,-659998.750229,-66339.920855,-2.500141e+06


## Predection

### Model Prediction

### Use the model to predict anomalies

In [178]:
#model = spectre_model

In [179]:
#predictions = spectre_model_2_h5.predict(training_sample)

In [180]:
infer = spectre_model_2.signatures['serving_default']

In [181]:
# Prepare input data
input_name = list(infer.structured_input_signature[1].keys())[0]
input_data = {input_name: tf.convert_to_tensor(X, dtype=tf.float32)}

In [182]:
# Get the output name
output_name = list(infer.structured_outputs.keys())[0]

In [183]:
# Perform the inference
predictions = infer(**input_data)

In [184]:
# Calculate the detection rate
predicted_labels = tf.argmax(predictions[output_name], axis=1).numpy()
detection_rate_train = accuracy_score(y, predicted_labels)

### Results

In [185]:
print("Detection rate:", detection_rate_train)

Detection rate: 0.4327923760915507


# Results

Beninge Detection Rate

In [186]:
print("Detection rate:", detection_rate_bening)

Detection rate: 1.0


Mixed Detection Rate

In [187]:
print("Detection rate:", detection_rate_mixed)

Detection rate: 0.3640740751454283


Training Dataset Test

In [188]:
print("Detection rate:", detection_rate_train)

Detection rate: 0.4327923760915507
