In [None]:
import pandas as pd
import numpy as np
import warnings
import tracemalloc
import time

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, f1_score, classification_report, roc_auc_score
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import BernoulliRBM

import tensorflow as tf
from tensorflow import keras
from keras.models import Model, Sequential
from keras import layers
from keras.layers import Input, Dense, ReLU, Normalization, LSTM, GRU, Reshape, TimeDistributed, Conv1D, Dropout, Flatten, Bidirectional, LeakyReLU, SimpleRNN,BatchNormalization
from keras.activations import tanh
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


warnings.filterwarnings("ignore")

Num GPUs Available:  1


In [None]:
early_stopping = EarlyStopping(
    monitor='val_loss',    # Monitors the training loss
    patience=5,        # Number of epochs with no improvement after which training will be stopped
    min_delta=0.0001,       # Minimum change in the monitored quantity to qualify as an improvement
    mode='min',        # Mode 'min' because we want to monitor the decrease in loss
    verbose=1          # Verbosity mode
)

# **NSL-KDD DATASET**

In [None]:
#LOAD NSL-KDD DATASET
data1 = pd.read_csv('/content/drive/MyDrive/NSL-KDD Dataset/KDDTrain+.txt')
data2 = pd.read_csv('/content/drive/MyDrive/NSL-KDD Dataset/KDDTest+.txt')
columns = (['duration','protocol_type','service','flag','src_bytes','dst_bytes','land','wrong_fragment','urgent','hot','num_failed_logins','logged_in','num_compromised','root_shell','su_attempted'
,'num_root','num_file_creations','num_shells','num_access_files','num_outbound_cmds','is_host_login','is_guest_login','count','srv_count','serror_rate','srv_serror_rate','rerror_rate','srv_rerror_rate','same_srv_rate','diff_srv_rate'
,'srv_diff_host_rate','dst_host_count','dst_host_srv_count','dst_host_same_srv_rate','dst_host_diff_srv_rate','dst_host_same_src_port_rate','dst_host_srv_diff_host_rate','dst_host_serror_rate',
'dst_host_srv_serror_rate','dst_host_rerror_rate','dst_host_srv_rerror_rate','attack','level'])

data1.columns = columns
data2.columns = columns
data = data1.append(data2)
data = data.drop('level', axis=1)
display(data)

y = data[['attack']]
print(y.attack.value_counts())
x = data.drop(['attack'], axis=1)
x.su_attempted.replace(2, 0, inplace=True)


def change_label(df):
    df.attack.replace(['apache2', 'back', 'land', 'neptune', 'mailbomb', 'pod', 'processtable', 'smurf', 'teardrop',
                       'udpstorm', 'worm'], 1, inplace=True)
    df.attack.replace(['ftp_write', 'guess_passwd', 'httptunnel', 'imap', 'multihop', 'named', 'phf', 'sendmail',
                       'snmpgetattack', 'snmpguess', 'spy', 'warezclient', 'warezmaster', 'xlock', 'xsnoop'], 1, inplace=True)
    df.attack.replace(['ipsweep', 'mscan', 'nmap', 'portsweep', 'saint', 'satan'], 1, inplace=True)
    df.attack.replace(['buffer_overflow', 'loadmodule', 'perl', 'ps', 'rootkit', 'sqlattack', 'xterm'], 1, inplace=True)
    df.attack.replace(['normal'], 0, inplace=True)
    return df
change_label(y)
y.attack.value_counts()

categorical_features = ['protocol_type', 'service', 'flag']
binary_features = ['logged_in', 'is_guest_login', 'is_host_login', 'land','root_shell','su_attempted']
column_to_drop = ['num_outbound_cmds']
x_categorical = x[categorical_features]
x_binary = x[binary_features]
x_continuous = x.drop(categorical_features + binary_features + column_to_drop, axis=1)

#SCALE CONTINUOUS
continuous_features = x_continuous.columns
scaler = MinMaxScaler(feature_range=(0, 1))
x_continuous_scaled = scaler.fit_transform(x_continuous)
x_continuous_scaled = pd.DataFrame(x_continuous_scaled, columns=x_continuous.columns)

#ENCODE CATEGORICAL
encoder = OneHotEncoder(sparse=False)
x_categorical_encoded = encoder.fit_transform(x_categorical)
encoded_columns = encoder.get_feature_names_out(x_categorical.columns)
x_categorical_encoded = pd.DataFrame(x_categorical_encoded, columns=encoded_columns)

# Resetting indexes of each DataFrame
x_categorical_encoded = x_categorical_encoded.reset_index(drop=True)
x_continuous_scaled = x_continuous_scaled.reset_index(drop=True)
x_binary = x_binary.reset_index(drop=True)
y = y.reset_index(drop=True)

#Combine Back Together
processed_dataset = pd.concat([x_categorical_encoded, x_continuous_scaled, x_binary, y], axis=1)

#Train_Test_Split for supervised data
y_supervised = processed_dataset['attack']
x_supervised = processed_dataset.drop(['attack'], axis=1)
x_supervised_train, x_supervised_test, y_supervised_train, y_supervised_test = train_test_split(x_supervised,y_supervised, test_size=0.2, random_state=42)
x_supervised_val, x_supervised_val, y_supervised_val, y_supervised_val = train_test_split(x_supervised_train, y_supervised_train, test_size=0.2, random_state=42)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack
0,0,udp,other,SF,146,0,0,0,0,0,...,1,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal
1,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune
2,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal
3,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal
4,0,tcp,private,REJ,0,0,0,0,0,0,...,19,0.07,0.07,0.00,0.00,0.00,0.00,1.00,1.00,neptune
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,tcp,smtp,SF,794,333,0,0,0,0,...,141,0.72,0.06,0.01,0.01,0.01,0.00,0.00,0.00,normal
22539,0,tcp,http,SF,317,938,0,0,0,0,...,255,1.00,0.00,0.01,0.01,0.01,0.00,0.00,0.00,normal
22540,0,tcp,http,SF,54540,8314,0,0,0,2,...,255,1.00,0.00,0.00,0.00,0.00,0.00,0.07,0.07,back
22541,0,udp,domain_u,SF,42,42,0,0,0,0,...,252,0.99,0.01,0.00,0.00,0.00,0.00,0.00,0.00,normal


normal             77053
neptune            45870
satan               4368
ipsweep             3740
smurf               3311
portsweep           3088
nmap                1566
back                1315
guess_passwd        1284
mscan                996
warezmaster          964
teardrop             904
warezclient          890
apache2              737
processtable         685
snmpguess            331
saint                319
mailbomb             293
pod                  242
snmpgetattack        178
httptunnel           133
buffer_overflow       50
land                  25
multihop              25
rootkit               23
named                 17
ps                    15
sendmail              14
xterm                 13
imap                  12
loadmodule            11
ftp_write             11
xlock                  9
phf                    6
perl                   5
xsnoop                 4
spy                    2
worm                   2
sqlattack              2
udpstorm               2


## Additional Processing for AutoEncoder

In [None]:
processed_dataset_scaled = scaler.fit_transform(processed_dataset)
processed_dataset_scaled = pd.DataFrame(processed_dataset_scaled, columns = processed_dataset.columns)

#Split into training and testing for unsupervised
x_train, x_test= train_test_split(processed_dataset, test_size=0.2, random_state=42)

print('X_TRAIN:\n', x_train.attack.value_counts())
print('X_TEST:\n', x_test.attack.value_counts())

normal_data_train=x_train[x_train['attack'] == 0]
normal_data_train=normal_data_train.drop(['attack'], axis = 1)

abnormal_data_train = x_train[x_train['attack'] == 1]
abnormal_data_train=abnormal_data_train.drop(['attack'], axis = 1)

normal_data_train, normal_data_val = train_test_split(normal_data_train, test_size=0.2, random_state=42)
abnormal_data_train, abnormal_data_val = train_test_split(abnormal_data_train, test_size=0.2, random_state=42)

data_test=x_test.drop(['attack'], axis=1)


X_TRAIN:
 0    61696
1    57116
Name: attack, dtype: int64
X_TEST:
 0    15357
1    14346
Name: attack, dtype: int64


# **5G NIDD DATASET**

In [None]:
#5G NIDD DATASET
dataset = pd.read_csv('/content/drive/MyDrive/AnomalyDetectionDataset/5G-NIDD.csv')
columns_to_drop = [col for col in dataset if dataset[col].nunique() == 1]
redundant_columns = ['Unnamed: 0', 'RunTime', 'Mean', 'Sum', 'Min', 'Max']
too_much_nan_columns = ['dTos', 'dDSb', 'dTtl', 'dHops', 'SrcGap', 'DstGap', 'SrcWin', 'DstWin', 'sVid', 'dVid', 'SrcTCPBase', 'DstTCPBase']
columns_to_drop.extend(redundant_columns)
columns_to_drop.extend(too_much_nan_columns)
dataset.drop(columns=columns_to_drop, axis=1, inplace=True)
remaining_nan_columns = ['sTos', 'sTtl', 'sHops']

for col in remaining_nan_columns:
  col_mean = dataset[col].mean()
  dataset[col].fillna(col_mean, inplace=True)

y = dataset[['Label']]
y.Label.replace(['Benign'], 0, inplace=True)
y.Label.replace(['Malicious'], 1, inplace=True)
print(y.Label.value_counts())
x = dataset.drop(['Label', 'Attack Type', 'Attack Tool'], axis=1)

categorical_features = ['Cause', 'Proto', 'State', 'sDSb']
x_categorical = x[categorical_features]
x_continuous = x.drop(categorical_features, axis=1)

#SCALE CONTINUOUS
continuous_features = x_continuous.columns
scaler = MinMaxScaler(feature_range=(0, 1))
x_continuous_scaled = scaler.fit_transform(x_continuous)
x_continuous_scaled = pd.DataFrame(x_continuous_scaled, columns=x_continuous.columns)
#ENCODE CATEGORICAL
encoder = OneHotEncoder(sparse=False)
x_categorical_encoded = encoder.fit_transform(x_categorical)
encoded_columns = encoder.get_feature_names_out(x_categorical.columns)
x_categorical_encoded = pd.DataFrame(x_categorical_encoded, columns=encoded_columns)

#Combine Back Together
processed_dataset = pd.concat([x_categorical_encoded, x_continuous_scaled,y], axis=1)

#Train_Test_Split for supervised data
y_supervised = processed_dataset['Label']
x_supervised = processed_dataset.drop(['Label'], axis=1)
x_supervised_train, x_supervised_test, y_supervised_train, y_supervised_test = train_test_split(x_supervised,y_supervised, test_size=0.2, random_state=42)
x_supervised_val, x_supervised_val, y_supervised_val, y_supervised_val = train_test_split(x_supervised_train, y_supervised_train, test_size=0.2, random_state=42)


1    738153
0    477737
Name: Label, dtype: int64


In [None]:
processed_dataset_scaled = scaler.fit_transform(processed_dataset)
processed_dataset_scaled = pd.DataFrame(processed_dataset_scaled, columns = processed_dataset.columns)

#Split into training and testing for unsupervised
x_train, x_test= train_test_split(processed_dataset, test_size=0.2, random_state=42)

print('X_TRAIN:\n', x_train.Label.value_counts())
print('X_TEST:\n', x_test.Label.value_counts())

normal_data_train=x_train[x_train['Label'] == 0]
normal_data_train=normal_data_train.drop(['Label'], axis = 1)

abnormal_data_train = x_train[x_train['Label'] == 1]
abnormal_data_train=abnormal_data_train.drop(['Label'], axis = 1)

normal_data_train, normal_data_val = train_test_split(normal_data_train, test_size=0.2, random_state=42)
abnormal_data_train, abnormal_data_val = train_test_split(abnormal_data_train, test_size=0.2, random_state=42)

data_test=x_test.drop(['Label'], axis=1)

X_TRAIN:
 1    590769
0    381943
Name: Label, dtype: int64
X_TEST:
 1    147384
0     95794
Name: Label, dtype: int64


# CICIDS2017 Dataset

In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/AnomalyDetectionDataset/CIC-IDS2017.csv')


In [None]:
dataset

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,4,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,192.168.10.5-8.254.250.126-49188-80-6,8.254.250.126,80,192.168.10.5,49188,6,03/07/2017 08:55:58,1,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,192.168.10.14-8.253.185.121-49486-80-6,8.253.185.121,80,192.168.10.14,49486,6,03/07/2017 08:56:22,3,2,0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604993,192.168.10.16-199.244.48.55-41926-443-6,192.168.10.16,41926,199.244.48.55,443,6,7/7/2017 3:29,196135,49,57,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2604994,192.168.10.16-199.244.48.55-41934-443-6,192.168.10.16,41934,199.244.48.55,443,6,7/7/2017 3:29,378424,49,59,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2604995,192.168.10.16-199.244.48.55-41932-443-6,192.168.10.16,41932,199.244.48.55,443,6,7/7/2017 3:29,161800,70,103,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2604996,192.168.10.16-199.244.48.55-41930-443-6,192.168.10.16,41930,199.244.48.55,443,6,7/7/2017 3:29,142864,50,62,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [None]:
def change_label(df):
    df.Label.replace(['DoS Hulk', 'DoS GoldenEye', 'DoS slowloris', 'DoS Slowhttptest'], 1, inplace=True)
    df.Label.replace(['FTP-Patator', 'SSH-Patator'], 1, inplace=True)
    df.Label.replace(['Web Attack  XSS', 'Web Attack  XSS', 'Web Attack  Sql Injection', 'Web Attack  Brute Force', 'Heartbleed', 'Infiltration', 'Bot', 'PortScan'], 1, inplace=True)
    df.Label.replace(['BENIGN'], 0, inplace=True)
    return df
data = change_label(dataset)
data.Label.unique()

data.drop(columns=['Flow ID', 'Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Timestamp', 'Protocol'], inplace=True)
data_colunns=data.columns

data = data.replace([np.inf, -np.inf], np.nan)
data = data.dropna()

scaler = MinMaxScaler()
data = scaler.fit_transform(data)
data = pd.DataFrame(data, columns=data_colunns)

y = data[['Label']]
print(y.Label.value_counts())
x = data.drop(['Label'], axis=1)

#Train_Test_Split for supervised data
y_supervised = y
x_supervised = x
x_supervised_train, x_supervised_test, y_supervised_train, y_supervised_test = train_test_split(x_supervised,y_supervised, test_size=0.2, random_state=42)
x_supervised_val, x_supervised_val, y_supervised_val, y_supervised_val = train_test_split(x_supervised_train, y_supervised_train, test_size=0.2, random_state=42)

0.0    2173634
1.0     428531
Name: Label, dtype: int64


## Additional Processing for AutoEncoder

In [None]:
#Split into training and testing for unsupervised
x_train, x_test= train_test_split(data, test_size=0.2, random_state=42)

print('X_TRAIN:\n', x_train.Label.value_counts())
print('X_TEST:\n', x_test.Label.value_counts())

normal_data_train=x_train[x_train['Label'] == 0]
normal_data_train=normal_data_train.drop(['Label'], axis = 1)

abnormal_data_train = x_train[x_train['Label'] == 1]
abnormal_data_train=abnormal_data_train.drop(['Label'], axis = 1)

normal_data_train, normal_data_val = train_test_split(normal_data_train, test_size=0.2, random_state=42)
abnormal_data_train, abnormal_data_val = train_test_split(abnormal_data_train, test_size=0.2, random_state=42)

data_test=x_test.drop(['Label'], axis=1)

KeyboardInterrupt: ignored

# Common Functions


In [None]:
def get_prediction_labels(prediction_probabilities):
  predicted_labels = np.argmax(prediction_probabilities, axis=1)
  return predicted_labels

# **Unsupervised Models**

## AutoEncoder

In [None]:
input_dim = normal_data_train.shape[1]

# input layer
inp = Input(shape=(input_dim,)) # layer 1
# encoding layers
enc = Dense(64, activation='tanh')(inp) # layer 2
#enc = Dropout(0.5)(enc)
enc = Dense(32, activation='tanh')(enc) # layer 3
#enc = Dropout(0.5)(enc)
enc = Dense(16, activation='tanh')(enc) # layer 4
#enc = Dropout(0.5)(enc)

# bottleneck layer
mid = Dense(8, activation='tanh')(enc) # layer 5
#mid = Dropout(0.5)(mid)

# decoding layers
dec = Dense(16, activation='tanh')(mid) # layer 6
#dec = Dropout(0.5)(dec)
dec = Dense(32, activation='tanh')(dec) # layer 7
#dec = Dropout(0.5)(dec)
dec = Dense(64, activation='tanh')(dec) # layer 8
#dec = Dropout(0.5)(dec)

# output layer
out = Dense(input_dim, activation='sigmoid')(dec)

 # create new model
autoencoder = Model(inp, out)
autoencoder.compile(loss='binary_crossentropy', optimizer='adam')

autoencoder.summary()

Model: "model_59"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_62 (InputLayer)       [(None, 62)]              0         
                                                                 
 dense_849 (Dense)           (None, 64)                4032      
                                                                 
 dense_850 (Dense)           (None, 32)                2080      
                                                                 
 dense_851 (Dense)           (None, 16)                528       
                                                                 
 dense_852 (Dense)           (None, 8)                 136       
                                                                 
 dense_853 (Dense)           (None, 16)                144       
                                                                 
 dense_854 (Dense)           (None, 32)                544

In [None]:
tracemalloc.start()
start_time = time.time()
autoencoder.fit(normal_data_train, normal_data_train, epochs=100, batch_size=512, validation_data=(normal_data_val, normal_data_val), callbacks=[early_stopping])
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: early stopping
Training Time taken: 72.11599588394165 seconds
Training Memory Usage: 286.28945541381836 MB


In [None]:
#AUTOENCODER PREDICTION
tracemalloc.start()
start_time = time.time()
reconstructed = autoencoder.predict(data_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
mse = keras.metrics.mean_squared_error(data_test, reconstructed)

Prediction Time taken: 22.890613079071045 seconds
Prediction Memory Usage: 57.30162715911865 MB


In [None]:
print("Getting abnormal_data_train prediction...")
re = autoencoder.predict(abnormal_data_train)
train_abnormal_re = keras.metrics.mean_squared_error(abnormal_data_train, re)

print("Getting normal_data_train prediciton...")
re = autoencoder.predict(normal_data_train)
train_normal_re = keras.metrics.mean_squared_error(normal_data_train, re)

alpha = 0.5
threshold = np.concatenate([train_normal_re, train_abnormal_re]).mean() * alpha
print("Threshold:",threshold)
anomalies = np.where(mse <= threshold, 0, 1)

Getting abnormal_data_train prediction...
Getting normal_data_train prediciton...
Threshold: 0.00032919025397859514


In [None]:
y_test = x_test['Label']
accuracy = accuracy_score(y_test, anomalies)
report = classification_report(y_test, anomalies, digits=10)
auc = roc_auc_score(y_test, anomalies)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Accuracy: 0.6846713107271217
AUC 0.7364962503882749
              precision    recall  f1-score   support

           0  0.5566166843 0.9807816773 0.7101860636     95794
           1  0.9752503865 0.4922108234 0.6542303027    147384

    accuracy                      0.6846713107    243178
   macro avg  0.7659335354 0.7364962504 0.6822081831    243178
weighted avg  0.8103399223 0.6846713107 0.6762727003    243178



## Isolation Forest

In [None]:

iso_forest = IsolationForest(n_estimators=100, contamination=.4811769854896812, random_state=42)
iso_forest_x_train = x_train.drop(['attack'], axis=1)
#use normal_data_train to train only on normal data, use iso_forest_x_train to train on both malicious and normal
tracemalloc.start()
start_time = time.time()
iso_forest.fit(normal_data_train)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Training Time taken: 2.3989884853363037 seconds
Training Memory Usage: 68.16378974914551 MB


In [None]:
y_test = x_test['attack']
iso_x_test = x_test.drop(['attack'], axis=1)
tracemalloc.start()
start_time = time.time()
iso_forest_predictions = iso_forest.predict(iso_x_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

scores = iso_forest.decision_function(data_test)
# Replace -1 with 0

#iso_forest_predictions[iso_forest_predictions == -1] = 0

#use iso_forest_mapped_predictions when trained on only normal data
iso_forest_mapped_predictions = np.where(iso_forest_predictions == 1, 0, 1)

accuracy = accuracy_score(y_test, iso_forest_mapped_predictions)
report = classification_report(y_test, iso_forest_mapped_predictions, digits=10)
auc = roc_auc_score(y_test, iso_forest_mapped_predictions)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 7.299888610839844 seconds
Prediction Memory Usage: 50.30977725982666 MB
Accuracy: 0.749082584250749
AUC 0.757146898109124
              precision    recall  f1-score   support

           0  0.9894723805 0.5202187927 0.6819171183     15357
           1  0.6593462481 0.9940750035 0.7928283530     14346

    accuracy                      0.7490825843     29703
   macro avg  0.8244093143 0.7571468981 0.7373727357     29703
weighted avg  0.8300275603 0.7490825843 0.7354851947     29703



# **SUPERVISED MODELS**



## Decision Tree Classifier

In [None]:
dtc = DecisionTreeClassifier()
tracemalloc.start()
start_time = time.time()
dtc.fit(x_supervised_train, y_supervised_train)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Training Time taken: 15.217021465301514 seconds
Training Memory Usage: 282.9171848297119 MB


In [None]:
tracemalloc.start()
start_time = time.time()
dtc_predictions = dtc.predict(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
#Assuming 'y_supervised_test' contains the actual labels (0 for normal, 1 for anomalous)
accuracy = accuracy_score(y_supervised_test, dtc_predictions)
report = classification_report(y_supervised_test, dtc_predictions, digits=10)
auc = roc_auc_score(y_supervised_test, dtc_predictions)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 0.050905704498291016 seconds
Prediction Memory Usage: 63.08040428161621 MB
Accuracy: 0.9996134518747584
AUC 0.9996025426320811
              precision    recall  f1-score   support

           0  0.9994676520 0.9995511201 0.9995093843     95794
           1  0.9997082293 0.9996539652 0.9996810965    147384

    accuracy                      0.9996134519    243178
   macro avg  0.9995879407 0.9996025426 0.9995952404    243178
weighted avg  0.9996134598 0.9996134519 0.9996134547    243178



## RANDOM FOREST

In [None]:
rf = RandomForestClassifier()
tracemalloc.start()
start_time = time.time()
rf.fit(x_supervised_train, y_supervised_train)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Training Time taken: 12.381677865982056 seconds
Training Memory Usage: 164.40681552886963 MB


In [None]:
tracemalloc.start()
start_time = time.time()
rf_predictions = rf.predict(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
#Assuming 'y_supervised_test' contains the actual labels (0 for normal, 1 for anomalous)
accuracy = accuracy_score(y_supervised_test, rf_predictions)
report = classification_report(y_supervised_test, rf_predictions, digits=10)
auc = roc_auc_score(y_supervised_test, rf_predictions)
precision = precision_score(y_supervised_test, rf_predictions)
recall = recall_score(y_supervised_test, rf_predictions)
f1 = f1_score(y_supervised_test, rf_predictions)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("AUC", auc)
print(report)

Prediction Time taken: 2.1657841205596924 seconds
Prediction Memory Usage: 65.23192405700684 MB
Accuracy: 0.9996216763029551
Precision: 0.999721796246285
Recall: 0.9996539651522554
F1: 0.9996878795486467
AUC 0.9996129816992461
              precision    recall  f1-score   support

           0  0.9994676631 0.9995719982 0.9995198280     95794
           1  0.9997217962 0.9996539652 0.9996878795    147384

    accuracy                      0.9996216763    243178
   macro avg  0.9995947297 0.9996129817 0.9996038538    243178
weighted avg  0.9996216868 0.9996216763 0.9996216798    243178



## Logistic Regression Classifier

In [None]:
lr_model = LogisticRegression()
tracemalloc.start()
start_time = time.time()
lr_model.fit(x_supervised_train, y_supervised_train)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Training Time taken: 13.923728942871094 seconds
Training Memory Usage: 23.301979064941406 MB


In [None]:
tracemalloc.start()
start_time = time.time()
#lr_model_pred = lr_model.predict(x_supervised_test)
lr_pred_probs = lr_model.predict_proba(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

lr_prediction_labels = get_prediction_labels(lr_pred_probs)


accuracy = accuracy_score(y_supervised_test, lr_prediction_labels)
report = classification_report(y_supervised_test, lr_prediction_labels, digits=10)
auc = roc_auc_score(y_supervised_test, lr_prediction_labels)
print("Accuracy:", accuracy)
print("AUC", auc)

print(report)

Prediction Time taken: 0.022423505783081055 seconds
Prediction Memory Usage: 3.711183547973633 MB
Accuracy: 0.9887901043679939
AUC 0.9878032144577603
              precision    recall  f1-score   support

           0  0.9883306049 0.9831513456 0.9857341720     95794
           1  0.9890861880 0.9924550833 0.9907677719    147384

    accuracy                      0.9887901044    243178
   macro avg  0.9887083964 0.9878032145 0.9882509720    243178
weighted avg  0.9887885446 0.9887901044 0.9887849089    243178



In [None]:
import warnings
def get_lr_low_confidence(prediction_probabilities, data, labels, threshold):
  # Get the max probability (confidence score)
  confidence_scores = np.max(prediction_probabilities, axis=1)
  data_columns = data.columns
  low_confidence = pd.DataFrame(columns=data_columns)
  low_confidence_labels = np.array([])
  warnings.filterwarnings("ignore")
  for i, score in enumerate(confidence_scores):
    if score < threshold:
      # Handle predictions with low confidence here
      # For example, you might want to flag them, exclude them, or set them to a default value
      #print(f"Prediction {i} has low confidence: {score:.5f}")
      low_confidence = low_confidence.append(data.iloc[i])
      low_confidence_labels = np.append(low_confidence_labels, labels.iloc[i])
  #print('Number of Low Confidence Predictions:', len(low_confidence))
  return low_confidence, low_confidence_labels

In [None]:
def get_lr_low_confidence(prediction_probabilities, data, labels, threshold):
  # Get the max probability (confidence score)
  confidence_scores = np.max(prediction_probabilities, axis=1)

  # Create a mask for low confidence scores
  low_confidence_mask = confidence_scores < threshold

  # Filter data and labels using the mask
  low_confidence_data = data[low_confidence_mask]
  low_confidence_labels = data[low_confidence_mask]
  low_confidence_indexes = data.index[low_confidence_mask]

  high_confidence_probs = prediction_probabilities[~low_confidence_mask]
  high_confidence_indexes = data.index[~low_confidence_mask]

  return high_confidence_probs,high_confidence_indexes, low_confidence_data, low_confidence_labels, low_confidence_indexes

In [None]:
high_probs, high_prob_indexes, lr_low_confidence, low_confidence_lables, low_confidence_indexes = get_lr_low_confidence(lr_pred_probs, x_supervised_test, y_supervised_test, 0.90)

In [None]:
high_prediction_labels = pd.DataFrame(get_prediction_labels(high_probs), index=high_prob_indexes, columns=['attack'])

## Feedforward Neural Network

In [None]:
input_dim = x_supervised_train.shape[1]
mlp_model = Sequential([
        Dense(256, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
mlp_model.summary()
mlp_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               31232     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2

In [None]:
tracemalloc.start()
start_time = time.time()
mlp_model.fit(x_supervised_train, y_supervised_train, epochs=100, callbacks=[early_stopping], batch_size=512, validation_data=(x_supervised_val, y_supervised_val))
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 53: early stopping
Training Time taken: 78.56627869606018 seconds
Training Memory Usage: 215.42763900756836 MB


In [None]:
mlp_pred_probs = mlp_model.predict(lr_low_confidence)



In [None]:
mlp_pred = pd.DataFrame(np.where(mlp_pred_probs > 0.5, 1, 0).flatten(), index=low_confidence_indexes, columns=['attack'])

In [None]:
improved_prediction = pd.concat([mlp_pred, high_prediction_labels], axis=0, ignore_index=False)

In [None]:
y_supervised_df = pd.DataFrame(y_supervised_test, columns=['attack'])

In [None]:
y_supervised_df

Unnamed: 0,attack
27060,0
139613,1
124005,0
60488,0
77672,0
...,...
116218,0
21934,0
144637,1
10407,1


In [None]:
improved_pred_reordered = improved_prediction.reindex(y_supervised_df.index)

In [None]:
improved_pred_reordered

Unnamed: 0,attack
27060,0
139613,1
124005,0
60488,0
77672,0
...,...
116218,0
21934,0
144637,1
10407,1


In [None]:
mlp_pred = np.where(mlp_pred_probs > 0.5, 1, 0).flatten()  # Flatten to convert from 2D to 1D
accuracy = accuracy_score(y_supervised_test, improved_pred_reordered)
report = classification_report(y_supervised_test, improved_pred_reordered, digits=10)
auc = roc_auc_score(y_supervised_test, improved_pred_reordered)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Accuracy: 0.9802713530619803
AUC 0.980209464487291
              precision    recall  f1-score   support

           0  0.9798583588 0.9820277398 0.9809418499     15357
           1  0.9807154835 0.9783911892 0.9795519576     14346

    accuracy                      0.9802713531     29703
   macro avg  0.9802869211 0.9802094645 0.9802469037     29703
weighted avg  0.9802723342 0.9802713531 0.9802705576     29703



In [None]:
tracemalloc.start()
start_time = time.time()
mlp_pred_probs = mlp_model.predict(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
mlp_pred = np.where(mlp_pred_probs > 0.5, 1, 0).flatten()  # Flatten to convert from 2D to 1D
accuracy = accuracy_score(y_supervised_test, mlp_pred)
report = classification_report(y_supervised_test, mlp_pred, digits=10)
auc = roc_auc_score(y_supervised_test, mlp_pred_probs)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 22.577648162841797 seconds
Prediction Memory Usage: 113.9242296218872 MB
Accuracy: 0.9955999309148031
AUC 0.9999692643049198
              precision    recall  f1-score   support

           0  0.9992621015 0.9895609328 0.9943878568     95794
           1  0.9932575482 0.9995250502 0.9963814432    147384

    accuracy                      0.9955999309    243178
   macro avg  0.9962598249 0.9945429915 0.9953846500    243178
weighted avg  0.9956228945 0.9955999309 0.9955961188    243178



## RNN

In [None]:
y_supervised_rnn = processed_dataset[['Label']].values
x_supervised_rnn = processed_dataset.drop(['Label'], axis=1)
x_supervised_rnn_train, x_supervised_rnn_test, y_supervised_rnn_train, y_supervised_rnn_test = train_test_split(x_supervised_rnn,y_supervised_rnn, test_size=0.2, random_state=42)
x_supervised_rnn_val, x_supervised_rnn_val, y_supervised_rnn_val, y_supervised_rnn_val = train_test_split(x_supervised_rnn_train,y_supervised_rnn_train, test_size=0.2, random_state=42)

x_supervised_train_numpy = x_supervised_rnn_train.to_numpy()
x_supervised_test_numpy = x_supervised_rnn_test.to_numpy()
x_supervised_val_numpy = x_supervised_rnn_val.to_numpy()


x_supervised_train_rnn = np.reshape(x_supervised_train_numpy, (x_supervised_train_numpy.shape[0],x_supervised_train_numpy.shape[1],1))
x_supervised_test_rnn = np.reshape(x_supervised_test_numpy, (x_supervised_test_numpy.shape[0],x_supervised_test_numpy.shape[1],1))
x_supervised_val_rnn = np.reshape(x_supervised_val_numpy, (x_supervised_val_numpy.shape[0],x_supervised_val_numpy.shape[1],1))

In [None]:
rnn = Sequential()

rnn.add(SimpleRNN(units=128, return_sequences=True, input_shape=(x_supervised_train_rnn.shape[1],1)))
rnn.add(Dropout(0.2))
rnn.add(SimpleRNN(units=64, return_sequences=True))
rnn.add(Dropout(0.2))
# output layer with sigmoid activation
rnn.add(tf.keras.layers.GlobalAveragePooling1D())
rnn.add(Dense(1, activation='sigmoid'))

# defining loss function, optimizer, metrics and then compiling model
rnn.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),metrics=['accuracy'])
# summary of model layers
rnn.summary()

Model: "sequential_76"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_6 (SimpleRNN)    (None, 62, 128)           16640     
                                                                 
 dropout_282 (Dropout)       (None, 62, 128)           0         
                                                                 
 simple_rnn_7 (SimpleRNN)    (None, 62, 64)            12352     
                                                                 
 dropout_283 (Dropout)       (None, 62, 64)            0         
                                                                 
 global_average_pooling1d_1  (None, 64)                0         
 9 (GlobalAveragePooling1D)                                      
                                                                 
 dense_863 (Dense)           (None, 1)                 65        
                                                     

In [None]:
tracemalloc.start()
start_time = time.time()
rnn.fit(x_supervised_train_rnn, y_supervised_rnn_train, epochs=100, batch_size=512,validation_data=(x_supervised_rnn_val, y_supervised_rnn_val), callbacks=[early_stopping])
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 31: early stopping
Training Time taken: 4226.838588953018 seconds
Training Memory Usage: 684.9544305801392 MB


In [None]:
tracemalloc.start()
start_time = time.time()
rnn_prediction_probs = rnn.predict(x_supervised_test_rnn)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
rnn_pred = np.where(rnn_prediction_probs > 0.5, 1, 0).flatten()
accuracy = accuracy_score(y_supervised_rnn_test, rnn_pred)
report = classification_report(y_supervised_rnn_test, rnn_pred, digits=10)
auc = roc_auc_score(y_supervised_rnn_test, rnn_prediction_probs)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 87.01072359085083 seconds
Prediction Memory Usage: 170.9435977935791 MB
Accuracy: 0.9994612999531207
AUC 0.9999801389352636
              precision    recall  f1-score   support

           0  0.9990922086 0.9995406810 0.9993163945     95794
           1  0.9997013730 0.9994097053 0.9995555179    147384

    accuracy                      0.9994613000    243178
   macro avg  0.9993967908 0.9994751932 0.9994359562    243178
weighted avg  0.9994614077 0.9994613000 0.9994613211    243178



## Long Short-Term Memory

In [None]:
y_supervised_lstm = data[['Label']].values
x_supervised_lstm = data.drop(['Label'], axis=1)
x_supervised_lstm_train, x_supervised_lstm_test, y_supervised_lstm_train, y_supervised_lstm_test = train_test_split(x_supervised_lstm,y_supervised_lstm, test_size=0.2, random_state=42)
x_supervised_lstm_val, x_supervised_lstm_val, y_supervised_lstm_val, y_supervised_lstm_val = train_test_split(x_supervised_lstm_train,y_supervised_lstm_train, test_size=0.2, random_state=42)

x_supervised_train_numpy = x_supervised_lstm_train.to_numpy()
x_supervised_test_numpy = x_supervised_lstm_test.to_numpy()
x_supervised_val_numpy = x_supervised_lstm_val.to_numpy()

x_supervised_train_lstm = np.reshape(x_supervised_train_numpy, (x_supervised_train_numpy.shape[0],x_supervised_train_numpy.shape[1],1))
x_supervised_test_lstm = np.reshape(x_supervised_test_numpy, (x_supervised_test_numpy.shape[0],x_supervised_test_numpy.shape[1],1))
x_supervised_val_lstm = np.reshape(x_supervised_val_numpy, (x_supervised_val_numpy.shape[0],x_supervised_val_numpy.shape[1],1))

In [None]:
lstm = Sequential() # initializing model

# input layer and LSTM layer
lstm.add(LSTM(units=128, return_sequences=True, input_shape=(x_supervised_train_lstm.shape[1],1)))
lstm.add(Dropout(0.2))
lstm.add(LSTM(units=64, return_sequences=True))
lstm.add(Dropout(0.2))
#lstm.add(LSTM(units=64, return_sequences=True))
#lstm.add(Dropout(0.2))
# output layer with sigmoid activation
lstm.add(tf.keras.layers.GlobalAveragePooling1D())
lstm.add(Dense(1, activation='sigmoid'))

# defining loss function, optimizer, metrics and then compiling model
lstm.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),metrics=['accuracy'])
# summary of model layers
lstm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_2 (LSTM)               (None, 77, 128)           66560     
                                                                 
 dropout_7 (Dropout)         (None, 77, 128)           0         
                                                                 
 lstm_3 (LSTM)               (None, 77, 64)            49408     
                                                                 
 dropout_8 (Dropout)         (None, 77, 64)            0         
                                                                 
 global_average_pooling1d_1  (None, 64)                0         
  (GlobalAveragePooling1D)                                       
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                      

In [None]:
tracemalloc.start()
start_time = time.time()
lstm.fit(x_supervised_train_lstm, y_supervised_lstm_train, epochs=100, batch_size=512,validation_data=(x_supervised_val_lstm, y_supervised_lstm_val), callbacks=[early_stopping])
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 36: early stopping
Training Time taken: 2435.0695197582245 seconds
Training Memory Usage: 1843.286033630371 MB


In [None]:
tracemalloc.start()
start_time = time.time()
lstm_prediction_probs = lstm.predict(x_supervised_test_lstm)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
lstm_pred = np.where(lstm_prediction_probs > 0.5, 1, 0).flatten()
accuracy = accuracy_score(y_supervised_lstm_test, lstm_pred)
report = classification_report(y_supervised_lstm_test, lstm_pred, digits=10)
auc = roc_auc_score(y_supervised_lstm_test, lstm_prediction_probs)
precision = precision_score(y_supervised_test, lstm_pred)
recall = recall_score(y_supervised_test, lstm_pred)
f1 = f1_score(y_supervised_test, lstm_pred)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("AUC", auc)
print(report)

Prediction Time taken: 97.99469375610352 seconds
Prediction Memory Usage: 452.495059967041 MB
Accuracy: 0.9621430616429012
Precision: 0.8690518984497866
Recall: 0.9058050768077932
F1: 0.8870479510170386
AUC 0.9914324882713359
              precision    recall  f1-score   support

         0.0  0.9813519759 0.9732038389 0.9772609235    435025
         1.0  0.8690518984 0.9058050768 0.8870479510     85408

    accuracy                      0.9621430616    520433
   macro avg  0.9252019372 0.9395044578 0.9321544372    520433
weighted avg  0.9629224663 0.9621430616 0.9624561176    520433



In [None]:
auc = roc_auc_score(y_supervised_lstm_test, lstm_pred)
auc

0.9395044578338144

## Gated Reccurent Unit Model

In [None]:
# Define the model
gru = Sequential()
gru.add(GRU(units=128, return_sequences=True, input_shape=(x_supervised_train_lstm.shape[1],1)))
gru.add(Dropout(0.2))
gru.add(GRU(64, return_sequences=True))  # Add another LSTM layer
gru.add(Dropout(0.2))
gru.add(tf.keras.layers.GlobalAveragePooling1D())
gru.add(Dense(1, activation='sigmoid'))

# defining loss function, optimizer, metrics and then compiling model
gru.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),metrics=['accuracy'])
# summary of model layers
gru.summary()

Model: "sequential_79"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru_8 (GRU)                 (None, 62, 128)           50304     
                                                                 
 dropout_288 (Dropout)       (None, 62, 128)           0         
                                                                 
 gru_9 (GRU)                 (None, 62, 64)            37248     
                                                                 
 dropout_289 (Dropout)       (None, 62, 64)            0         
                                                                 
 global_average_pooling1d_2  (None, 64)                0         
 2 (GlobalAveragePooling1D)                                      
                                                                 
 dense_866 (Dense)           (None, 1)                 65        
                                                     

In [None]:
tracemalloc.start()
start_time = time.time()
gru.fit(x_supervised_train_lstm, y_supervised_lstm_train, epochs=100, batch_size=512,validation_data=(x_supervised_val_lstm, y_supervised_lstm_val), callbacks=[early_stopping])
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 70: early stopping
Training Time taken: 1841.6199524402618 seconds
Training Memory Usage: 681

In [None]:
tracemalloc.start()
start_time = time.time()
gru_prediction_probs = gru.predict(x_supervised_test_lstm)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
gru_pred = np.where(gru_prediction_probs > 0.5, 1, 0).flatten()
accuracy = accuracy_score(y_supervised_lstm_test, gru_pred)
report = classification_report(y_supervised_lstm_test, gru_pred, digits=10)
auc = roc_auc_score(y_supervised_lstm_test, gru_prediction_probs)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 33.73401856422424 seconds
Prediction Memory Usage: 170.29786205291748 MB
Accuracy: 0.9964799447318425
AUC 0.9991065329239991
              precision    recall  f1-score   support

           0  0.9967766917 0.9942793912 0.9955264753     95794
           1  0.9962878665 0.9979102209 0.9970983838    147384

    accuracy                      0.9964799447    243178
   macro avg  0.9965322791 0.9960948061 0.9963124295    243178
weighted avg  0.9964804272 0.9964799447 0.9964791690    243178



## RNN+CNN



In [None]:

input_dim = x_supervised_train.shape[1]

In [None]:
# Define the RecurrentConvolutionLayer
class RecurrentConvolutionLayer(layers.Layer):
    def __init__(self, filters, kernel_size, activation='tanh', **kwargs):
        super(RecurrentConvolutionLayer, self).__init__(**kwargs)
        self.filters = filters
        self.kernel_size = kernel_size
        self.activation = activation

    def build(self, input_shape):
        self.conv = layers.Conv1D(self.filters, self.kernel_size, activation=self.activation, padding='same')
        self.rnn = layers.SimpleRNN(self.filters, activation=self.activation, return_sequences=True)

    def call(self, inputs):
        x = self.conv(inputs)
        x = self.rnn(x)
        return x

    def get_config(self):
        config = super(RecurrentConvolutionLayer, self).get_config()
        config.update({'filters': self.filters, 'kernel_size': self.kernel_size, 'activation': self.activation})
        return config

leaky_relu = LeakyReLU(alpha=0.2)

inputs = Input(shape=(input_dim, 1))
x = Conv1D(filters=32, kernel_size=11, activation=leaky_relu, padding='same')(inputs)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = RecurrentConvolutionLayer(filters=16, kernel_size=5, activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = GRU(32, activation=leaky_relu, return_sequences=True)(x)

x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Flatten()(x)
x = Dense(units=32, activation=leaky_relu)(x)
x = BatchNormalization()(x)
x = Dropout(0.2)(x)
x = Dense(units=1)(x)
outputs = (x)

RCNN = Model(inputs=inputs, outputs=outputs)
RCNN.summary()
RCNN.compile(loss='binary_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),metrics=['accuracy'])




Model: "model_60"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_63 (InputLayer)       [(None, 62, 1)]           0         
                                                                 
 conv1d_4 (Conv1D)           (None, 62, 32)            384       
                                                                 
 batch_normalization_16 (Ba  (None, 62, 32)            128       
 tchNormalization)                                               
                                                                 
 dropout_290 (Dropout)       (None, 62, 32)            0         
                                                                 
 recurrent_convolution_laye  (None, 62, 16)            3104      
 r_4 (RecurrentConvolutionL                                      
 ayer)                                                           
                                                          

In [None]:
tracemalloc.start()
start_time = time.time()
RCNN.fit(x_supervised_train, y_supervised_train, epochs=100, batch_size=512,validation_data=(x_supervised_val, y_supervised_val), callbacks=[early_stopping])
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 31: early stopping
Training Time taken: 8572.776195049286 seconds
Training Memory Usage: 457.6453857421875 MB


In [None]:
tracemalloc.start()
start_time = time.time()
rcnn_prediction_probs = RCNN.predict(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
rcnn_pred = np.where(rcnn_prediction_probs > 0.5, 1, 0).flatten()
accuracy = accuracy_score(y_supervised_test, rcnn_pred)
report = classification_report(y_supervised_test, rcnn_pred, digits=10)
auc = roc_auc_score(y_supervised_test, rcnn_prediction_probs)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 130.6854100227356 seconds
Prediction Memory Usage: 113.46459865570068 MB
Accuracy: 0.9914424824614069
AUC 0.9970324397134294
              precision    recall  f1-score   support

           0  0.9955475649 0.9826711485 0.9890674498     95794
           1  0.9888308001 0.9971435163 0.9929697609    147384

    accuracy                      0.9914424825    243178
   macro avg  0.9921891825 0.9899073324 0.9910186053    243178
weighted avg  0.9914767046 0.9914424825 0.9914325413    243178



## Combined LR and FNN

### TRAIN FNN

In [None]:
#Train FNN
input_dim = x_supervised_train.shape[1]
fnn_model = Sequential([
        Dense(256, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
fnn_model.summary()
fnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               31232     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2

In [None]:
tracemalloc.start()
start_time = time.time()
fnn_model.fit(x_supervised_train, y_supervised_train, epochs=100, callbacks=[early_stopping], batch_size=512, validation_data=(x_supervised_val, y_supervised_val))
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 26: early stopping
Training Time taken: 45.89302086830139 seconds
Training Memory Usage: 215.62742137908936 MB


In [None]:
tracemalloc.start()
start_time = time.time()
fnn_pred_probs = fnn_model.predict(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
fnn_pred = np.where(fnn_pred_probs > 0.5, 1, 0).flatten()  # Flatten to convert from 2D to 1D
accuracy = accuracy_score(y_supervised_test, fnn_pred)
report = classification_report(y_supervised_test, fnn_pred, digits=10)
auc = roc_auc_score(y_supervised_test, fnn_pred)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

Prediction Time taken: 13.926042556762695 seconds
Prediction Memory Usage: 54.03087615966797 MB
Accuracy: 0.9905733427599905
AUC 0.9906679551694079
              precision    recall  f1-score   support

           0  0.9938421225 0.9878882594 0.9908562471     15357
           1  0.9871173293 0.9934476509 0.9902723735     14346

    accuracy                      0.9905733428     29703
   macro avg  0.9904797259 0.9906679552 0.9905643103     29703
weighted avg  0.9905941717 0.9905733428 0.9905742470     29703



In [None]:
precision = precision_score(y_supervised_test, fnn_pred)
recall = recall_score(y_supervised_test, fnn_pred)
f1 = f1_score(y_supervised_test, fnn_pred)

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)


Precision: 0.987117329269982
Recall: 0.9934476509131466
F1: 0.9902723735408561


In [None]:
auc = roc_auc_score(y_supervised_test, fnn_pred)
auc

0.9906679551694079

### TRAIN LOGISTIC REGRESSION

In [None]:
x_supervised_test

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,logged_in,is_guest_login,is_host_login,land,root_shell,su_attempted
27060,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0,0,0,0,0,0
139613,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,1.00,1.00,0,0,0,0,0,0
124005,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,1,0,0,0,0,0
60488,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,1,1,0,0,0,0
77672,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.12,1.00,0.88,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116218,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0,0,0,0,0,0
21934,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,1,0,0,0,0,0
144637,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.91,0.97,0,0,0,0,0,0
10407,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.00,0.00,0.00,0.00,0,0,0,0,0,0


In [None]:
lr_model = LogisticRegression()
tracemalloc.start()
start_time = time.time()
lr_model.fit(x_supervised_train, y_supervised_train)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"LR Training Time taken: {elapsed_time} seconds")
print("LR Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

tracemalloc.start()
start_time = time.time()
#lr_model_pred = lr_model.predict(x_supervised_test)
lr_pred_probs = lr_model.predict_proba(x_supervised_test)
end_time = time.time()
lr_pred_memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
lr_pred_elapsed_time = end_time - start_time
print(f"LR Prediction Time taken: {lr_pred_elapsed_time} seconds")
print("LR Prediction Memory Usage:",(lr_pred_memory_usage[1]-lr_pred_memory_usage[0])/(1024*1024), "MB")

lr_prediction_labels = get_prediction_labels(lr_pred_probs)


accuracy = accuracy_score(y_supervised_test, lr_prediction_labels)
report = classification_report(y_supervised_test, lr_prediction_labels, digits=10)
auc = roc_auc_score(y_supervised_test, lr_prediction_labels)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)

LR Training Time taken: 2.2539823055267334 seconds
LR Training Memory Usage: 107.77646255493164 MB
LR Prediction Time taken: 0.018840789794921875 seconds
LR Prediction Memory Usage: 27.42205238342285 MB
Accuracy: 0.9578830421169579
AUC 0.957484284457896
              precision    recall  f1-score   support

           0  0.9503256289 0.9691997135 0.9596698798     15357
           1  0.9663129407 0.9457688554 0.9559305316     14346

    accuracy                      0.9578830421     29703
   macro avg  0.9583192848 0.9574842845 0.9578002057     29703
weighted avg  0.9580472050 0.9578830421 0.9578638437     29703



In [None]:
precision = precision_score(y_supervised_test, lr_prediction_labels)
recall = recall_score(y_supervised_test, lr_prediction_labels)
f1 = f1_score(y_supervised_test, lr_prediction_labels)

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)


Precision: 0.9663129406737412
Recall: 0.945768855430085
F1: 0.9559305315813577


### Get logistic regression low confidence predictions

In [None]:
def get_lr_low_confidence(prediction_probabilities, data, labels, threshold):
  # Get the max probability (confidence score)
  confidence_scores = np.max(prediction_probabilities, axis=1)
  confidence_scores_rounded = np.round(confidence_scores, 2)
  print(confidence_scores_rounded)
  # Create a mask for low confidence scores
  low_confidence_mask = confidence_scores_rounded <= threshold

  # Filter data and labels using the mask
  low_confidence_data = data[low_confidence_mask]
  low_confidence_labels = data[low_confidence_mask]
  low_confidence_indexes = data.index[low_confidence_mask]

  high_confidence_probs = prediction_probabilities[~low_confidence_mask]
  high_confidence_indexes = data.index[~low_confidence_mask]

  return high_confidence_probs,high_confidence_indexes, low_confidence_data, low_confidence_labels, low_confidence_indexes

### Repredict low confidence on FNN

In [None]:
high_probs, high_prob_indexes, lr_low_confidence, low_confidence_lables, low_confidence_indexes = get_lr_low_confidence(lr_pred_probs, x_supervised_test, y_supervised_test, .99)
high_prediction_labels = pd.DataFrame(get_prediction_labels(high_probs), index=high_prob_indexes, columns=['attack'])
print('Num Low Confidence:',len(lr_low_confidence))
print('% of Total Test Set:', np.round(((len(lr_low_confidence) / len(x_supervised_test))*100), 3))
tracemalloc.start()
start_time = time.time()
fnn_low_pred_probs = fnn_model.predict(lr_low_confidence)
end_time = time.time()
fnn_low_memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
fnn_low_elapsed_time = end_time - start_time
print(f"Prediction Time taken: {fnn_low_elapsed_time} seconds")
print("Prediction Memory Usage:",(fnn_low_memory_usage[1]-fnn_low_memory_usage[0])/(1024*1024), "MB")

fnn_low_pred = pd.DataFrame(np.where(fnn_low_pred_probs > 0.5, 1, 0).flatten(), index=low_confidence_indexes, columns=['attack'])
y_supervised_df = pd.DataFrame(y_supervised_test, columns=['attack'])
#rejoin the repredicted with original prediction
improved_prediction = pd.concat([fnn_low_pred, high_prediction_labels], axis=0, ignore_index=False)
#Match indexes
improved_pred_reordered = improved_prediction.reindex(y_supervised_df.index)
accuracy = accuracy_score(y_supervised_test, improved_pred_reordered)
report = classification_report(y_supervised_test, improved_pred_reordered, digits=10)
auc = roc_auc_score(y_supervised_test, improved_pred_reordered)
precision = precision_score(y_supervised_test, improved_pred_reordered)
recall = recall_score(y_supervised_test, improved_pred_reordered)
f1 = f1_score(y_supervised_test, improved_pred_reordered)
auc = roc_auc_score(y_supervised_test, improved_pred_reordered)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("AUC:", auc)
#print(report)
total_elapsed_time =fnn_low_elapsed_time + lr_pred_elapsed_time
print('Total Elapsed Time:', total_elapsed_time)
total_mem_usage = fnn_low_memory_usage + lr_pred_memory_usage
print('Total Memory Usage:', ((fnn_low_memory_usage[1]-fnn_low_memory_usage[0])/(1024*1024)) + ((lr_pred_memory_usage[1]-lr_pred_memory_usage[0])/(1024*1024)))

[1.   1.   0.96 ... 0.94 0.67 0.99]
Num Low Confidence: 15317
% of Total Test Set: 51.567
Prediction Time taken: 1.5650382041931152 seconds
Prediction Memory Usage: 28.124802589416504 MB
Accuracy: 0.9903376763289904
Precision: 0.9867063629439867
Recall: 0.993377945071797
F1: 0.9900309145854319
AUC: 0.9904377515943084
Total Elapsed Time: 1.583878993988037
Total Memory Usage: 55.546854972839355


# DTC and FNN





## Train FNN

In [None]:
#Train FNN
input_dim = x_supervised_train.shape[1]
fnn_model = Sequential([
        Dense(256, activation='relu', input_shape=(input_dim,)),
        Dropout(0.2),
        Dense(128, activation='relu'),
        Dropout(0.2),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(16, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
fnn_model.summary()
fnn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               16128     
                                                                 
 dropout (Dropout)           (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 32)                2

In [None]:
tracemalloc.start()
start_time = time.time()
fnn_model.fit(x_supervised_train, y_supervised_train, epochs=100, callbacks=[early_stopping], batch_size=512, validation_data=(x_supervised_val, y_supervised_val))
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Training Time taken: {elapsed_time} seconds")
print("Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: early stopping
Training Time taken: 111.85323190689087 seconds
Training Memory Usage: 462.99971199035645 MB


In [None]:
tracemalloc.start()
start_time = time.time()
fnn_pred_probs = fnn_model.predict(x_supervised_test)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"Prediction Time taken: {elapsed_time} seconds")
print("Prediction Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")
fnn_pred = np.where(fnn_pred_probs > 0.5, 1, 0).flatten()  # Flatten to convert from 2D to 1D
accuracy = accuracy_score(y_supervised_test, fnn_pred)
report = classification_report(y_supervised_test, fnn_pred, digits=10)
auc = roc_auc_score(y_supervised_test, fnn_pred_probs)
print("Accuracy:", accuracy)
print("AUC", auc)
precision = precision_score(y_supervised_test, fnn_pred)
recall = recall_score(y_supervised_test, fnn_pred)
f1 = f1_score(y_supervised_test, fnn_pred)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print(report)

Prediction Time taken: 25.30662178993225 seconds
Prediction Memory Usage: 114.02619647979736 MB
Accuracy: 0.9992926991750899
AUC 0.9999952314349123
Precision: 0.99991170757549
Recall: 0.9989211854746783
F1: 0.9994162010983566
              precision    recall  f1-score   support

           0  0.9983427142 0.9998642921 0.9991029238     95794
           1  0.9999117076 0.9989211855 0.9994162011    147384

    accuracy                      0.9992926992    243178
   macro avg  0.9991272109 0.9993927388 0.9992595625    243178
weighted avg  0.9992936412 0.9992926992 0.9992927932    243178



## Train DTC

In [None]:
dtc_model = DecisionTreeClassifier()
tracemalloc.start()
start_time = time.time()
dtc_model.fit(x_supervised_train, y_supervised_train)
end_time = time.time()
memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
elapsed_time = end_time - start_time
print(f"LR Training Time taken: {elapsed_time} seconds")
print("LR Training Memory Usage:",(memory_usage[1]-memory_usage[0])/(1024*1024), "MB")

tracemalloc.start()
start_time = time.time()
#lr_model_pred = lr_model.predict(x_supervised_test)
dtc_pred_probs = dtc_model.predict_proba(x_supervised_test)
end_time = time.time()
dtc_pred_memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
dtc_pred_elapsed_time = end_time - start_time
print(f"LR Prediction Time taken: {dtc_pred_elapsed_time} seconds")
print("LR Prediction Memory Usage:",(dtc_pred_memory_usage[1]-dtc_pred_memory_usage[0])/(1024*1024), "MB")

dtc_prediction_labels = get_prediction_labels(dtc_pred_probs)


accuracy = accuracy_score(y_supervised_test, dtc_prediction_labels)
report = classification_report(y_supervised_test, dtc_prediction_labels, digits=10)
auc = roc_auc_score(y_supervised_test, dtc_prediction_labels)
print("Accuracy:", accuracy)
print("AUC", auc)
print(report)


LR Training Time taken: 168.9399516582489 seconds
LR Training Memory Usage: 1032.061824798584 MB
LR Prediction Time taken: 0.1530470848083496 seconds
LR Prediction Memory Usage: 157.33512783050537 MB
Accuracy: 0.9988355849840421
AUC 0.9984283781520633
              precision    recall  f1-score   support

         0.0  0.9995722083 0.9990345382 0.9993033009    435025
         1.0  0.9950958642 0.9978222181 0.9964571763     85408

    accuracy                      0.9988355850    520433
   macro avg  0.9973340362 0.9984283782 0.9978802386    520433
weighted avg  0.9988375977 0.9988355850 0.9988362248    520433



In [None]:
precision = precision_score(y_supervised_test, dtc_prediction_labels)
recall = recall_score(y_supervised_test, dtc_prediction_labels)
f1 = f1_score(y_supervised_test, dtc_prediction_labels)

print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Precision: 0.9950958641787908
Recall: 0.9978222180591982
F1: 0.9964571762642502


## Get DTC low Confidence

In [None]:
def get_dtc_low_confidence(prediction_probabilities, data, labels, threshold):
  # Get the max probability (confidence score)
  confidence_scores = np.max(prediction_probabilities, axis=1)
  #confidence_scores_rounded = np.round(confidence_scores, 5)
  print(confidence_scores)
  # Create a mask for low confidence scores
  low_confidence_mask = confidence_scores <= threshold

  # Filter data and labels using the mask
  low_confidence_data = data[low_confidence_mask]
  low_confidence_labels = data[low_confidence_mask]
  low_confidence_indexes = data.index[low_confidence_mask]

  high_confidence_probs = prediction_probabilities[~low_confidence_mask]
  high_confidence_indexes = data.index[~low_confidence_mask]

  return high_confidence_probs,high_confidence_indexes, low_confidence_data, low_confidence_labels, low_confidence_indexes

## Repredict DTC low confidence

In [None]:
high_probs, high_prob_indexes, dtc_low_confidence, low_confidence_lables, low_confidence_indexes = get_dtc_low_confidence(dtc_pred_probs, x_supervised_test, y_supervised_test, .99)
high_prediction_labels = pd.DataFrame(get_prediction_labels(high_probs), index=high_prob_indexes, columns=['attack'])
print('Num Low Confidence:',len(dtc_low_confidence))
print('% of Total Test Set:', np.round(((len(dtc_low_confidence) / len(x_supervised_test))*100), 3))
tracemalloc.start()
start_time = time.time()
fnn_low_pred_probs = fnn_model.predict(dtc_low_confidence)
end_time = time.time()
fnn_low_memory_usage = tracemalloc.get_traced_memory()
tracemalloc.stop()
fnn_low_elapsed_time = end_time - start_time
print(f"Prediction Time taken: {fnn_low_elapsed_time} seconds")
print("Prediction Memory Usage:",(fnn_low_memory_usage[1]-fnn_low_memory_usage[0])/(1024*1024), "MB")

fnn_low_pred = pd.DataFrame(np.where(fnn_low_pred_probs > 0.5, 1, 0).flatten(), index=low_confidence_indexes, columns=['attack'])
y_supervised_df = pd.DataFrame(y_supervised_test, columns=['attack'])
#rejoin the repredicted with original prediction
improved_prediction = pd.concat([fnn_low_pred, high_prediction_labels], axis=0, ignore_index=False)
#Match indexes
improved_pred_reordered = improved_prediction.reindex(y_supervised_df.index)
accuracy = accuracy_score(y_supervised_test, improved_pred_reordered)
report = classification_report(y_supervised_test, improved_pred_reordered, digits=10)
auc = roc_auc_score(y_supervised_test, improved_pred_reordered)
precision = precision_score(y_supervised_test, improved_pred_reordered)
recall = recall_score(y_supervised_test, improved_pred_reordered)
f1 = f1_score(y_supervised_test, improved_pred_reordered)
auc = roc_auc_score(y_supervised_test, improved_pred_reordered)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)
print("AUC:", auc)
#print(report)
total_elapsed_time =fnn_low_elapsed_time + lr_pred_elapsed_time
print('Total Elapsed Time:', total_elapsed_time)
total_mem_usage = fnn_low_memory_usage + lr_pred_memory_usage
print('Total Memory Usage:', ((fnn_low_memory_usage[1]-fnn_low_memory_usage[0])/(1024*1024)) + ((lr_pred_memory_usage[1]-lr_pred_memory_usage[0])/(1024*1024)))

[1. 1. 1. ... 1. 1. 1.]
Num Low Confidence: 6546
% of Total Test Set: 1.258


InvalidArgumentError: ignored