In [None]:
import pandas as pd

# load csv file with semicolon separator
df = pd.read_csv("Ecodis_Traffic_normal-attack.csv", sep=";")

# replace all instances of semicolon with comma
df.replace(";", ",", inplace=True)

# save the new csv file with comma separator
df.to_csv("goose_data_new.csv", index=False)


In [None]:
import pandas as pd

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Count the number of instances in each class
class_counts = goose_df['label'].value_counts()

# Print the class counts
print("Class counts:")
print(class_counts)


In [52]:
#RFC without any sampling

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict the test set
y_pred = model.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score
cr = classification_report(y_test, y_pred)
print("\nClassification report:\n", cr)


Computational time: 1.044710636138916 seconds

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.54      0.70       994
           1       0.94      1.00      0.97      7243

    accuracy                           0.94      8237
   macro avg       0.97      0.77      0.83      8237
weighted avg       0.95      0.94      0.94      8237



In [48]:
#RFC using feature engineering and balancing using undersampling/oversampling

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
#ros = RandomOverSampler()
#X, y = ros.fit_resample(X, y)

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the Random Forest Classifier model
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

# Predict the test set using Random Forest Classifier
y_pred_rfc = rfc.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Random Forest Classifier
cr_rfc = classification_report(y_test, y_pred_rfc)
print("\nClassification report for Random Forest Classifier:\n", cr_rfc)



Computational time: 0.35631394386291504 seconds

Classification report for Random Forest Classifier:
               precision    recall  f1-score   support

           0       0.93      0.58      0.71       999
           1       0.69      0.96      0.80      1002

    accuracy                           0.77      2001
   macro avg       0.81      0.77      0.76      2001
weighted avg       0.81      0.77      0.76      2001



In [55]:
#Gaussian Mixture Model using feature engineering and balancing using undersampling

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
#ros = RandomOverSampler()
#X, y = ros.fit_resample(X, y)

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the Gaussian Mixture Model
gmm = GaussianMixture(n_components=2)
gmm.fit(X_train)

# Predict the test set using Gaussian Mixture Model
y_pred_gmm = gmm.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Gaussian Mixture Model
cr_gmm = classification_report(y_test, y_pred_gmm)
print("\nClassification report for Gaussian Mixture Model:\n", cr_gmm)

Computational time: 0.10721492767333984 seconds

Classification report for Gaussian Mixture Model:
               precision    recall  f1-score   support

           0       0.32      0.49      0.39       988
           1       0.00      0.00      0.00      1013

    accuracy                           0.24      2001
   macro avg       0.16      0.25      0.20      2001
weighted avg       0.16      0.24      0.19      2001



In [56]:
# SVM using scikit-learn's SVC without sampling

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#Train the Support Vector Machine (SVM) model
svm = SVC()
svm.fit(X_train, y_train)

#Predict the test set using SVM
y_pred_svm = svm.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for SVM
cr_svm = classification_report(y_test, y_pred_svm, zero_division=1)
print("\nClassification report for Support Vector Machine (SVM):\n", cr_svm)



Computational time: 14.331180572509766 seconds

Classification report for Support Vector Machine (SVM):
               precision    recall  f1-score   support

           0       1.00      0.53      0.69       986
           1       0.94      1.00      0.97      7251

    accuracy                           0.94      8237
   macro avg       0.97      0.77      0.83      8237
weighted avg       0.95      0.94      0.94      8237



In [59]:
# SVM using scikit-learn's SVC with over sampling

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler


start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Balance the dataset using undersampling with the RandomUnderSampler
#rus = RandomUnderSampler()
#X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#Train the Support Vector Machine (SVM) model
svm = SVC()
svm.fit(X_train, y_train)

#Predict the test set using SVM
y_pred_svm = svm.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for SVM
cr_svm = classification_report(y_test, y_pred_svm, zero_division=1)
print("\nClassification report for Support Vector Machine (SVM):\n", cr_svm)

Computational time: 140.42834854125977 seconds

Classification report for Support Vector Machine (SVM):
               precision    recall  f1-score   support

           0       1.00      0.52      0.69      7262
           1       0.68      1.00      0.81      7212

    accuracy                           0.76     14474
   macro avg       0.84      0.76      0.75     14474
weighted avg       0.84      0.76      0.75     14474



In [58]:
# SVM using scikit-learn's SVC with under sampling

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


#Train the Support Vector Machine (SVM) model
svm = SVC()
svm.fit(X_train, y_train)

#Predict the test set using SVM
y_pred_svm = svm.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for SVM
cr_svm = classification_report(y_test, y_pred_svm, zero_division=1)
print("\nClassification report for Support Vector Machine (SVM):\n", cr_svm)

Computational time: 2.671963930130005 seconds

Classification report for Support Vector Machine (SVM):
               precision    recall  f1-score   support

           0       1.00      0.51      0.67       996
           1       0.67      1.00      0.80      1005

    accuracy                           0.75      2001
   macro avg       0.84      0.75      0.74      2001
weighted avg       0.84      0.75      0.74      2001



In [60]:
# Neural Networks using scikit-learn's MLPClassifier without sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Train the Neural Network model
nn = MLPClassifier()
nn.fit(X_train, y_train)

#Predict the test set using Neural Network
y_pred_nn = nn.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for Neural Network
cr_nn = classification_report(y_test, y_pred_nn, zero_division=1)
print("\nClassification report for Neural Network:\n", cr_nn)

Computational time: 1.3677902221679688 seconds

Classification report for Neural Network:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00       921
           1       0.89      1.00      0.94      7316

    accuracy                           0.89      8237
   macro avg       0.94      0.50      0.47      8237
weighted avg       0.90      0.89      0.84      8237



In [61]:
# Neural Networks using scikit-learn's MLPClassifier with over sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import RandomOverSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Train the Neural Network model
nn = MLPClassifier()
nn.fit(X_train, y_train)

#Predict the test set using Neural Network
y_pred_nn = nn.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for Neural Network
cr_nn = classification_report(y_test, y_pred_nn, zero_division=1)
print("\nClassification report for Neural Network:\n", cr_nn)

Computational time: 4.726892471313477 seconds

Classification report for Neural Network:
               precision    recall  f1-score   support

           0       1.00      0.52      0.69      7175
           1       0.68      1.00      0.81      7299

    accuracy                           0.76     14474
   macro avg       0.84      0.76      0.75     14474
weighted avg       0.84      0.76      0.75     14474



In [62]:
# Neural Networks using scikit-learn's MLPClassifier with under sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Train the Neural Network model
nn = MLPClassifier()
nn.fit(X_train, y_train)

#Predict the test set using Neural Network
y_pred_nn = nn.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for Neural Network
cr_nn = classification_report(y_test, y_pred_nn, zero_division=1)
print("\nClassification report for Neural Network:\n", cr_nn)

Computational time: 0.5232827663421631 seconds

Classification report for Neural Network:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67       997
           1       1.00      0.00      0.00      1004

    accuracy                           0.50      2001
   macro avg       0.75      0.50      0.33      2001
weighted avg       0.75      0.50      0.33      2001



In [63]:
#Knn no sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the K-Nearest Neighbors model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict the test set using K-Nearest Neighbors
y_pred_knn = knn.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for K-Nearest Neighbors
cr_knn = classification_report(y_test, y_pred_knn)
print("\nClassification report for K-Nearest Neighbors:\n", cr_knn)

Computational time: 0.440356969833374 seconds

Classification report for K-Nearest Neighbors:
               precision    recall  f1-score   support

           0       0.98      0.52      0.68      1051
           1       0.93      1.00      0.97      7186

    accuracy                           0.94      8237
   macro avg       0.96      0.76      0.82      8237
weighted avg       0.94      0.94      0.93      8237



In [64]:
#Knn over sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the K-Nearest Neighbors model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict the test set using K-Nearest Neighbors
y_pred_knn = knn.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for K-Nearest Neighbors
cr_knn = classification_report(y_test, y_pred_knn)
print("\nClassification report for K-Nearest Neighbors:\n", cr_knn)

Computational time: 0.7375295162200928 seconds

Classification report for K-Nearest Neighbors:
               precision    recall  f1-score   support

           0       0.79      0.64      0.71      7189
           1       0.70      0.83      0.76      7285

    accuracy                           0.74     14474
   macro avg       0.75      0.74      0.73     14474
weighted avg       0.75      0.74      0.73     14474



In [65]:
#Knn under sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the K-Nearest Neighbors model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Predict the test set using K-Nearest Neighbors
y_pred_knn = knn.predict(X_test)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for K-Nearest Neighbors
cr_knn = classification_report(y_test, y_pred_knn)
print("\nClassification report for K-Nearest Neighbors:\n", cr_knn)

Computational time: 0.1585094928741455 seconds

Classification report for K-Nearest Neighbors:
               precision    recall  f1-score   support

           0       0.71      0.69      0.70       982
           1       0.71      0.72      0.72      1019

    accuracy                           0.71      2001
   macro avg       0.71      0.71      0.71      2001
weighted avg       0.71      0.71      0.71      2001



In [71]:
#Note: Autoencoders are not classifiers, so it's not possible to calculate a classification report for them. 
#To use autoencoders in this context, you would need to first train the autoencoder to learn a compact representation
#of the input data, and then use this representation as input to another machine learning model, 
#such as a classifier, for prediction.

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the autoencoder model
autoencoder = Sequential()
autoencoder.add(Dense(units=64, activation='relu', input_shape=(X_train.shape[1],)))
autoencoder.add(Dense(units=32, activation='relu'))
autoencoder.add(Dense(units=64, activation='relu'))
autoencoder.add(Dense(units=X_train.shape[1]))

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder model
autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_data=(X_test, X_test))

# Use the autoencoder to encode the training and testing data
X_train_encoded = autoencoder.predict(X_train)
X_test_encoded = autoencoder.predict(X_test)

# Train the K-Nearest Neighbors model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_encoded, y_train)

# Predict the test set using K-Nearest Neighbors
y_pred_knn = knn.predict(X_test_encoded)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for K-Nearest Neighbors
cr_knn = classification_report(y_test, y_pred_knn)
print("\nClassification report for Autoencoder with K-Nearest Neighbors:\n", cr_knn)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [72]:
#Autoencoder with over sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.over_sampling import RandomOverSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Create the autoencoder model
autoencoder = Sequential()
autoencoder.add(Dense(units=64, activation='relu', input_shape=(X_train.shape[1],)))
autoencoder.add(Dense(units=32, activation='relu'))
autoencoder.add(Dense(units=64, activation='relu'))
autoencoder.add(Dense(units=X_train.shape[1]))

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder model
autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_data=(X_test, X_test))

# Use the autoencoder to encode the training and testing data
X_train_encoded = autoencoder.predict(X_train)
X_test_encoded = autoencoder.predict(X_test)

# Train the K-Nearest Neighbors model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_encoded, y_train)

# Predict the test set using K-Nearest Neighbors
y_pred_knn = knn.predict(X_test_encoded)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for K-Nearest Neighbors
cr_knn = classification_report(y_test, y_pred_knn)
print("\nClassification report for Autoencoder with K-Nearest Neighbors:\n", cr_knn)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100


Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Computational time: 94.15946936607361 seconds

Classification report for Autoencoder with K-Nearest Neighbors:
               precision    recall  f1-score   support

           0       1.00      0.54      0.70      1004
           1       0.94      1.00      0.97      7233

    accuracy                           0.94      8237
   macro avg       0.97      0.77      0.84      8237
weighted avg       0.95      0.94      0.94      8237



In [73]:
#Autoencoder with under sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Create the autoencoder model
autoencoder = Sequential()
autoencoder.add(Dense(units=64, activation='relu', input_shape=(X_train.shape[1],)))
autoencoder.add(Dense(units=32, activation='relu'))
autoencoder.add(Dense(units=64, activation='relu'))
autoencoder.add(Dense(units=X_train.shape[1]))

# Compile the autoencoder model
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder model
autoencoder.fit(X_train, X_train, epochs=100, batch_size=32, validation_data=(X_test, X_test))

# Use the autoencoder to encode the training and testing data
X_train_encoded = autoencoder.predict(X_train)
X_test_encoded = autoencoder.predict(X_test)

# Train the K-Nearest Neighbors model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train_encoded, y_train)

# Predict the test set using K-Nearest Neighbors
y_pred_knn = knn.predict(X_test_encoded)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for K-Nearest Neighbors
cr_knn = classification_report(y_test, y_pred_knn)
print("\nClassification report for Autoencoder with K-Nearest Neighbors:\n", cr_knn)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [79]:
#Isolation Forest 

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Train the Isolation Forest Model
isolation_forest = IsolationForest(contamination=0.1)
isolation_forest.fit(X_train)

# Predict the test set using Isolation Forest Model
y_pred_isolation_forest = isolation_forest.predict(X_test)
y_pred_isolation_forest[y_pred_isolation_forest == 1] = 0
y_pred_isolation_forest[y_pred_isolation_forest == -1] = 1

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Isolation Forest Model
cr_isolation_forest = classification_report(y_test, y_pred_isolation_forest)
print("\nClassification report for Isolation Forest Model:\n", cr_isolation_forest)



Computational time: 1.0893056392669678 seconds

Classification report for Isolation Forest Model:
               precision    recall  f1-score   support

           0       0.06      0.46      0.11       990
           1       0.34      0.04      0.07      7247

    accuracy                           0.09      8237
   macro avg       0.20      0.25      0.09      8237
weighted avg       0.31      0.09      0.07      8237



Feature names must be in the same order as they were in fit.



In [80]:
#Isolation Forest with oversampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import RandomOverSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Train the Isolation Forest Model
isolation_forest = IsolationForest(contamination=0.1)
isolation_forest.fit(X_train)

# Predict the test set using Isolation Forest Model
y_pred_isolation_forest = isolation_forest.predict(X_test)
y_pred_isolation_forest[y_pred_isolation_forest == 1] = 0
y_pred_isolation_forest[y_pred_isolation_forest == -1] = 1

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Isolation Forest Model
cr_isolation_forest = classification_report(y_test, y_pred_isolation_forest)
print("\nClassification report for Isolation Forest Model:\n", cr_isolation_forest)



Computational time: 1.9424729347229004 seconds

Classification report for Isolation Forest Model:
               precision    recall  f1-score   support

           0       0.47      0.85      0.60      7252
           1       0.19      0.04      0.06      7222

    accuracy                           0.44     14474
   macro avg       0.33      0.44      0.33     14474
weighted avg       0.33      0.44      0.33     14474



In [81]:
# Isolation Forest with oversampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Train the Isolation Forest Model
isolation_forest = IsolationForest(contamination=0.1)
isolation_forest.fit(X_train)

# Predict the test set using Isolation Forest Model
y_pred_isolation_forest = isolation_forest.predict(X_test)
y_pred_isolation_forest[y_pred_isolation_forest == 1] = 0
y_pred_isolation_forest[y_pred_isolation_forest == -1] = 1

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Isolation Forest Model
cr_isolation_forest = classification_report(y_test, y_pred_isolation_forest)
print("\nClassification report for Isolation Forest Model:\n", cr_isolation_forest)



Computational time: 0.4452075958251953 seconds

Classification report for Isolation Forest Model:
               precision    recall  f1-score   support

           0       0.46      0.84      0.60       992
           1       0.18      0.03      0.06      1009

    accuracy                           0.43      2001
   macro avg       0.32      0.44      0.33      2001
weighted avg       0.32      0.43      0.32      2001



In [83]:
#One-Class SVM

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.svm import OneClassSVM


start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Train the One-Class SVM Model
one_class_svm = OneClassSVM(nu=0.1)
one_class_svm.fit(X_train)

# Predict the test set using One-Class SVM Model
y_pred_one_class_svm = one_class_svm.predict(X_test)
y_pred_one_class_svm[y_pred_one_class_svm == 1] = 0
y_pred_one_class_svm[y_pred_one_class_svm == -1] = 1

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for One-Class SVM Model
cr_one_class_svm = classification_report(y_test, y_pred_one_class_svm)
print("\nClassification report for One-Class SVM Model:\n", cr_one_class_svm)

Computational time: 14.063748836517334 seconds

Classification report for One-Class SVM Model:
               precision    recall  f1-score   support

           0       0.07      0.55      0.13       950
           1       0.65      0.11      0.19      7287

    accuracy                           0.16      8237
   macro avg       0.36      0.33      0.16      8237
weighted avg       0.58      0.16      0.18      8237



In [84]:
#One-Class SVM with over sampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.svm import OneClassSVM
from imblearn.over_sampling import RandomOverSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Train the One-Class SVM Model
one_class_svm = OneClassSVM(nu=0.1)
one_class_svm.fit(X_train)

# Predict the test set using One-Class SVM Model
y_pred_one_class_svm = one_class_svm.predict(X_test)
y_pred_one_class_svm[y_pred_one_class_svm == 1] = 0
y_pred_one_class_svm[y_pred_one_class_svm == -1] = 1

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for One-Class SVM Model
cr_one_class_svm = classification_report(y_test, y_pred_one_class_svm)
print("\nClassification report for One-Class SVM Model:\n", cr_one_class_svm)

Computational time: 46.23907470703125 seconds

Classification report for One-Class SVM Model:
               precision    recall  f1-score   support

           0       0.48      0.82      0.60      7195
           1       0.38      0.11      0.17      7279

    accuracy                           0.46     14474
   macro avg       0.43      0.47      0.39     14474
weighted avg       0.43      0.46      0.38     14474



In [85]:
#One-Class SVM with undersampling

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.svm import OneClassSVM
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Train the One-Class SVM Model
one_class_svm = OneClassSVM(nu=0.1)
one_class_svm.fit(X_train)

# Predict the test set using One-Class SVM Model
y_pred_one_class_svm = one_class_svm.predict(X_test)
y_pred_one_class_svm[y_pred_one_class_svm == 1] = 0
y_pred_one_class_svm[y_pred_one_class_svm == -1] = 1

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for One-Class SVM Model
cr_one_class_svm = classification_report(y_test, y_pred_one_class_svm)
print("\nClassification report for One-Class SVM Model:\n", cr_one_class_svm)

Computational time: 0.6981582641601562 seconds

Classification report for One-Class SVM Model:
               precision    recall  f1-score   support

           0       0.47      0.86      0.61      1005
           1       0.23      0.04      0.07       996

    accuracy                           0.45      2001
   macro avg       0.35      0.45      0.34      2001
weighted avg       0.36      0.45      0.34      2001



In [89]:
#Logistic Regression

import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the Logistic Regression model
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predict the test set using Logistic Regression
y_pred_log_reg = log_reg.predict(X_test)


#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Logistic Regression
cr_log_reg = classification_report(y_test, y_pred_log_reg, zero_division='warn')

print("\nClassification report for Logistic Regression:\n", cr_log_reg)

Computational time: 0.17967963218688965 seconds

Classification report for Logistic Regression:
               precision    recall  f1-score   support

           0       1.00      0.53      0.69      1001
           1       0.94      1.00      0.97      7236

    accuracy                           0.94      8237
   macro avg       0.97      0.76      0.83      8237
weighted avg       0.95      0.94      0.93      8237



In [95]:
#CNN with compuation time

import time
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape the data for the CNN
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1, 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1, 1))


# Define the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 1), activation='relu', input_shape=(X_train.shape[1], 1, 1)))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy:", test_acc)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Convert the predictions back to binary labels
y_pred = (y_pred > 0.5).astype(int)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")


#Print the classification report
cr = classification_report(y_test, y_pred)
print('Classification report:')
print(cr)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.9413621425628662
Computational time: 9.7186918258667 seconds
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.52      0.69      1013
           1       0.94      1.00      0.97      7224

    accuracy                           0.94      8237
   macro avg       0.97      0.76      0.83      8237
weighted avg       0.95      0.94      0.93      8237



In [96]:
#CNN with oversampling

import time
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from imblearn.over_sampling import RandomOverSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using RandomOverSampler
ros = RandomOverSampler()
X, y = ros.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape the data for the CNN
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1, 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1, 1))


# Define the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 1), activation='relu', input_shape=(X_train.shape[1], 1, 1)))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy:", test_acc)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Convert the predictions back to binary labels
y_pred = (y_pred > 0.5).astype(int)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")


#Print the classification report
cr = classification_report(y_test, y_pred)
print('Classification report:')
print(cr)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.7558380365371704
Computational time: 16.559269666671753 seconds
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.51      0.68      7245
           1       0.67      1.00      0.80      7229

    accuracy                           0.76     14474
   macro avg       0.84      0.76      0.74     14474
weighted avg       0.84      0.76      0.74     14474



In [97]:
#CNN with undersampling

import time
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X, y = rus.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Reshape the data for the CNN
X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1, 1))
X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1, 1))


# Define the CNN model
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 1), activation='relu', input_shape=(X_train.shape[1], 1, 1)))
model.add(MaxPooling2D(pool_size=(2, 1)))
model.add(Flatten())
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
print("Test Accuracy:", test_acc)

#Make predictions on the test set
y_pred = model.predict(X_test)

#Convert the predictions back to binary labels
y_pred = (y_pred > 0.5).astype(int)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")


#Print the classification report
cr = classification_report(y_test, y_pred)
print('Classification report:')
print(cr)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.757621169090271
Computational time: 3.2527387142181396 seconds
Classification report:
              precision    recall  f1-score   support

           0       1.00      0.53      0.69      1037
           1       0.67      1.00      0.80       964

    accuracy                           0.76      2001
   macro avg       0.83      0.77      0.75      2001
weighted avg       0.84      0.76      0.74      2001



In [113]:
#Clustering with KMeans
from sklearn.cluster import KMeans
import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer

start_time = time.time()

#Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

#Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

#Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

#Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])

#Train the KMeans model
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)

#Predict the target variable using KMeans
y_pred_kmeans = kmeans.predict(X)

#Calculate computational time
end_time = time.time()
computational_time = end_time - start_time
print("Computational time:", computational_time, "seconds")


#Calculate the precision, recall, and f1-score for KMeans
cr_kmeans = classification_report(y, y_pred_kmeans)
print("\nClassification report for KMeans:\n", cr_kmeans)







Computational time: 0.14107084274291992 seconds

Classification report for KMeans:
               precision    recall  f1-score   support

           0       0.06      0.48      0.11      5001
           1       0.00      0.00      0.00     36183

    accuracy                           0.06     41184
   macro avg       0.03      0.24      0.06     41184
weighted avg       0.01      0.06      0.01     41184



In [33]:
#RNN

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Reshape the input data for the RNN
X_train = np.array(X_train).reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = np.array(X_test).reshape((X_test.shape[0], X_test.shape[1], 1))

#Initialize the RNN model
model = Sequential()

#Add a simple RNN layer with 128 hidden units
model.add(SimpleRNN(128, input_shape=(X_train.shape[1], X_train.shape[2])))

#Add a dropout layer to prevent overfitting
model.add(Dropout(0.2))

#Add a dense layer for prediction
model.add(Dense(1, activation='sigmoid'))

#Compile the RNN model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

#Train the RNN model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

#Predict the test set using the RNN model
y_pred_rnn = model.predict(X_test)
y_pred_rnn = np.round(y_pred_rnn).astype(int).flatten()

#Calculate computation time
end_time = time.time()

computational_time = end_time - start_time

print("Computational time:", computational_time, "seconds")

#Calculate the precision, recall, and f1-score for the RNN model
cr_rnn = classification_report(y_test, y_pred_rnn, zero_division=1)
print("\nClassification report for Recurrent Neural Network:\n", cr_rnn)







Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Computational time: 17.80119276046753 seconds

Classification report for Recurrent Neural Network:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00      1012
           1       0.88      1.00      0.93      7225

    accuracy                           0.88      8237
   macro avg       0.94      0.50      0.47      8237
weighted avg       0.89      0.88      0.82      8237



In [27]:
#Gradient Boosting

import time
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from sklearn.ensemble import GradientBoostingClassifier

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train the Gradient Boosting Classifier model
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)

# Predict the test set using Gradient Boosting Classifier
y_pred_gbc = gbc.predict(X_test)

#Calculate computation time
end_time = time.time()

computational_time = end_time - start_time

print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for Gradient Boosting Classifier
cr_gbc = classification_report(y_test, y_pred_gbc)
print("\nClassification report for Gradient Boosting Classifier:\n", cr_gbc)


Computational time: 0.8569958209991455 seconds

Classification report for Gradient Boosting Classifier:
               precision    recall  f1-score   support

           0       1.00      0.52      0.69       942
           1       0.94      1.00      0.97      7295

    accuracy                           0.95      8237
   macro avg       0.97      0.76      0.83      8237
weighted avg       0.95      0.95      0.94      8237



In [34]:
#Deep Belief Networks (DBN) 

import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the DBN model
dbn = Pipeline(steps=[('rbm', BernoulliRBM(n_components=50, learning_rate=0.01, n_iter=20, random_state=0)),
                      ('logistic', LogisticRegression(C=1000.0, max_iter=10000))])

# Train the DBN model
dbn.fit(X_train, y_train)

# Predict the test set using DBN
y_pred_dbn = dbn.predict(X_test)

#Calculate computation time
end_time = time.time()

computational_time = end_time - start_time

print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for DBN
cr_dbn = classification_report(y_test, y_pred_dbn, zero_division=1)
print("\nClassification report for Deep Belief Networks:\n", cr_dbn)


Computational time: 7.402517557144165 seconds

Classification report for Deep Belief Networks:
               precision    recall  f1-score   support

           0       1.00      0.00      0.00      1001
           1       0.88      1.00      0.94      7236

    accuracy                           0.88      8237
   macro avg       0.94      0.50      0.47      8237
weighted avg       0.89      0.88      0.82      8237



In [36]:
#Deep Belief Networks (DBN) 

import time
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.impute import SimpleImputer
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

start_time = time.time()

# Load the data from the csv file into a pandas DataFrame
goose_df = pd.read_csv('goose_data_new.csv')

# Encode the categorical variables
label_encoder = LabelEncoder()
goose_df['source'] = label_encoder.fit_transform(goose_df['source'])
goose_df['destination'] = label_encoder.fit_transform(goose_df['destination'])
goose_df['gooseid'] = label_encoder.fit_transform(goose_df['gooseid'])

# Impute missing values
imputer = SimpleImputer(strategy='median')
goose_df['stNum'] = imputer.fit_transform(goose_df[['stNum']])

# Create new features
goose_df['goose_interaction'] = goose_df['stNum'] * goose_df['gooseid']

# Split the data into features (X) and target (y)
X = goose_df.drop(columns=["time", "sqNum", "gooseboolean", "goosebitstring", "label"])
y = goose_df["label"]

# Balance the dataset using undersampling with the RandomUnderSampler
rus = RandomUnderSampler()
X_resampled, y_resampled = rus.fit_resample(X, y)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

# Create the DBN model
dbn = Pipeline(steps=[('rbm', BernoulliRBM(n_components=50, learning_rate=0.01, n_iter=20, random_state=0)),
                      ('logistic', LogisticRegression(C=1000.0, max_iter=10000))])

# Train the DBN model
dbn.fit(X_train, y_train)

# Predict the test set using DBN
y_pred_dbn = dbn.predict(X_test)

#Calculate computation time
end_time = time.time()

computational_time = end_time - start_time

print("Computational time:", computational_time, "seconds")

# Calculate the precision, recall, and f1-score for DBN
cr_dbn = classification_report(y_test, y_pred_dbn, zero_division=1)
print("\nClassification report for Deep Belief Networks:\n", cr_dbn)

Computational time: 1.8017489910125732 seconds

Classification report for Deep Belief Networks:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67       998
           1       1.00      0.00      0.00      1003

    accuracy                           0.50      2001
   macro avg       0.75      0.50      0.33      2001
weighted avg       0.75      0.50      0.33      2001

