# Extra Model Training
## **Disclaimer**: This notebook is to be run in Google Colab with GPU.

In order to run this notebook you will need to have the file processed_consolidated_data.csv and processed_filtered_data.csv in your current directory.

The goal of this notebook is to run the models trainings that were note possible to do in the Modelling notebook, because there wasn't enough computer power.

This notebook should output a csv file with the evaluation metrics that will be then loaded by the Modelling notebook to compare results.

We do RandomSearchCV on both datasets and on the models: RandomForestClassifier and Gradient Boosting.
In addition to this, we will also run the AutoEncoder model for both datasets.

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


from sklearn.metrics import (accuracy_score, recall_score,
                      confusion_matrix, classification_report)

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

In [2]:
results_list = []

### 1st dataset consolidated_data

In [3]:
# Step: Load your dataset
# Assume X and y are loaded properly
filtered_df = pd.read_csv("processed_consolidated_data.csv")
X = filtered_df.drop("anomaly", axis=1)
y = filtered_df["anomaly"]

In [4]:
# Step: Prepare the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# Applying SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_con_resampled, y_train_con_resampled = smote.fit_resample(X_train, y_train)

#### RandomForest Classifier Model

In [6]:
# Parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 10, 20],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Method for sampling data points (with or without replacement)
}

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Randomized Search CV
random_search_rf_1 = RandomizedSearchCV(estimator=rf_classifier,
                                   param_distributions=param_grid,
                                   scoring="recall",
                                   cv=5,  # Adjust cross-validation as needed
                                   n_iter=10,  # Number of random combinations to try
                                   random_state=42,
                                   n_jobs=-1)  # Use all available CPU cores

# Fit Randomized Search
random_search_rf_1.fit(X_train_con_resampled, y_train_con_resampled)

# Predict anomalies on the test set
y_pred_rf = random_search_rf_1.predict(X_test)

results_dict = {"Model":"RandomForest with RandomizedSearchCV",
                "Dataset":"Consolidated",
                "Accuracy":accuracy_score(y_test, y_pred_rf),
                "Recall":recall_score(y_test, y_pred_rf)}

results_list.append(results_dict)

# Evaluate the model
print("\nRandom Forest Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}\n")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_rf)}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred_rf, zero_division=0)}")

  pid = os.fork()
  pid = os.fork()



Random Forest Classifier:
Accuracy: 0.9136690647482014

Confusion Matrix:
[[978  39]
 [ 57  38]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.96      0.95      1017
           1       0.49      0.40      0.44        95

    accuracy                           0.91      1112
   macro avg       0.72      0.68      0.70      1112
weighted avg       0.91      0.91      0.91      1112



#### Gradient Boosting Model

In [9]:
# Parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 10, 20],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}


# Initialize Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Randomized Search CV
random_search_gb_1 = RandomizedSearchCV(estimator=gb_classifier,
                                   param_distributions=param_grid,
                                   scoring="recall",
                                   cv=5,  # Adjust cross-validation as needed
                                   n_iter=10,  # Number of random combinations to try
                                   random_state=42,
                                   n_jobs=-1)  # Use all available CPU cores

# Fit Randomized Search
random_search_gb_1.fit(X_train_con_resampled, y_train_con_resampled)

# Predict anomalies on the test set
y_pred_gb = random_search_gb_1.predict(X_test)

results_dict = {"Model":"Gradient Boosting with RandomizedSearchCV",
                "Dataset":"Consolidated",
                "Accuracy":accuracy_score(y_test, y_pred_gb),
                "Recall":recall_score(y_test, y_pred_gb)}

results_list.append(results_dict)

# Evaluate the model
print("\Gradient Boosting Model:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}\n")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_gb)}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred_gb, zero_division=0)}")

  pid = os.fork()


\Gradient Boosting Model:
Accuracy: 0.9064748201438849

Confusion Matrix:
[[971  46]
 [ 58  37]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.95      0.95      1017
           1       0.45      0.39      0.42        95

    accuracy                           0.91      1112
   macro avg       0.69      0.67      0.68      1112
weighted avg       0.90      0.91      0.90      1112



#### Autoencoder Model

In [10]:
# Step 4: Build the Autoencoder
input_dim = X_train.shape[1]
encoding_dim = 14

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
encoder = Dense(int(encoding_dim / 4), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 4), activation='relu')(encoder)
decoder = Dense(int(encoding_dim / 2), activation='relu')(decoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [11]:
# Step 5: Train the Autoencoder
X_train_normal = X_train[y_train == 0]

history = autoencoder.fit(X_train_normal, X_train_normal,
                          epochs=50,
                          batch_size=32,
                          validation_data=(X_test[y_test == 0], X_test[y_test == 0]),
                          verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [12]:
# Step 6: Set a Threshold for Reconstruction Error
reconstructions = autoencoder.predict(X_train_normal)
reconstruction_errors = np.mean(np.square(reconstructions - X_train_normal), axis=1)
threshold = np.percentile(reconstruction_errors, 63)




In [14]:
# Step 7: Detect Anomalies on Test Data
test_reconstructions = autoencoder.predict(X_test)
test_reconstruction_errors = np.mean(np.square(test_reconstructions - X_test), axis=1)
y_pred = (test_reconstruction_errors > threshold).astype(int)



In [15]:
# Step 8: Evaluate the Model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5881294964028777
Confusion Matrix:
 [[610 407]
 [ 51  44]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.60      0.73      1017
           1       0.10      0.46      0.16        95

    accuracy                           0.59      1112
   macro avg       0.51      0.53      0.44      1112
weighted avg       0.85      0.59      0.68      1112



In [16]:
results_dict = {"Model":"AutoEncoder",
                "Dataset":"Consolidated",
                "Accuracy":accuracy_score(y_test, y_pred),
                "Recall":recall_score(y_test, y_pred)}

results_list.append(results_dict)

### 2nd dataset filtered_data

In [17]:
# Step: Load your dataset
# Assume X and y are loaded properly
filtered_df = pd.read_csv("processed_filtered_data.csv")
X = filtered_df.drop("anomaly", axis=1)
y = filtered_df["anomaly"]

In [18]:
# Step: Prepare the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# Applying SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_filter_resampled, y_train_filter_resampled = smote.fit_resample(X_train, y_train)

#### Random Forest Classifier Model

In [20]:
# Parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 10, 20],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Method for sampling data points (with or without replacement)
}

# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Randomized Search CV
random_search_rf_2 = RandomizedSearchCV(estimator=rf_classifier,
                                   param_distributions=param_grid,
                                   scoring="recall",
                                   cv=5,  # Adjust cross-validation as needed
                                   n_iter=50,  # Number of random combinations to try
                                   random_state=42,
                                   n_jobs=-1)  # Use all available CPU cores

# Fit Randomized Search
random_search_rf_2.fit(X_train_filter_resampled, y_train_filter_resampled)

# Predict anomalies on the test set
y_pred_rf = random_search_rf_2.predict(X_test)

results_dict = {"Model":"RandomForest with RandomizedSearchCV",
                "Dataset":"Filtered",
                "Accuracy":accuracy_score(y_test, y_pred_rf),
                "Recall":recall_score(y_test, y_pred_rf)}

results_list.append(results_dict)

# Evaluate the model
print("\nRandom Forest Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf)}\n")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_rf)}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred_rf, zero_division=0)}")

  warn(



Random Forest Classifier:
Accuracy: 0.8803956834532374

Confusion Matrix:
[[940  77]
 [ 56  39]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1017
           1       0.34      0.41      0.37        95

    accuracy                           0.88      1112
   macro avg       0.64      0.67      0.65      1112
weighted avg       0.89      0.88      0.89      1112



#### Gradient Boosting Model

In [21]:
# Parameter grid for Gradient Boosting
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider at every split
    'max_depth': [None, 10, 20],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
}

# Initialize Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(random_state=42)

# Randomized Search CV
random_search_gb_2 = RandomizedSearchCV(estimator=gb_classifier,
                                   param_distributions=param_grid,
                                   scoring="recall",
                                   cv=5,  # Adjust cross-validation as needed
                                   n_iter=50,  # Number of random combinations to try
                                   random_state=42,
                                   n_jobs=-1)  # Use all available CPU cores

# Fit Randomized Search
random_search_gb_2.fit(X_train_filter_resampled, y_train_filter_resampled)

# Predict anomalies on the test set
y_pred_gb = random_search_gb_2.predict(X_test)

results_dict = {"Model":"Gradient Boosting with RandomizedSearchCV",
                "Dataset":"Filtered",
                "Accuracy":accuracy_score(y_test, y_pred_gb),
                "Recall":recall_score(y_test, y_pred_gb)}

results_list.append(results_dict)

# Evaluate the model
print("\nGradient Boosting Classifier:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb)}\n")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred_gb)}\n")
print(f"Classification Report:\n{classification_report(y_test, y_pred_gb, zero_division=0)}")


Random Forest Classifier:
Accuracy: 0.8794964028776978

Confusion Matrix:
[[942  75]
 [ 59  36]]

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      1017
           1       0.32      0.38      0.35        95

    accuracy                           0.88      1112
   macro avg       0.63      0.65      0.64      1112
weighted avg       0.89      0.88      0.88      1112



#### AutoEncoder Model

In [22]:
# Step 4: Build the Autoencoder
input_dim = X_train.shape[1]
encoding_dim = 14

input_layer = Input(shape=(input_dim,))
encoder = Dense(encoding_dim, activation="relu")(input_layer)
encoder = Dense(int(encoding_dim / 2), activation="relu")(encoder)
encoder = Dense(int(encoding_dim / 4), activation="relu")(encoder)
decoder = Dense(int(encoding_dim / 4), activation='relu')(encoder)
decoder = Dense(int(encoding_dim / 2), activation='relu')(decoder)
decoder = Dense(input_dim, activation='sigmoid')(decoder)

autoencoder = Model(inputs=input_layer, outputs=decoder)
autoencoder.compile(optimizer='adam', loss='mean_squared_error')

In [23]:
# Step 5: Train the Autoencoder
X_train_normal = X_train[y_train == 0]

history = autoencoder.fit(X_train_normal, X_train_normal,
                          epochs=50,
                          batch_size=32,
                          validation_data=(X_test[y_test == 0], X_test[y_test == 0]),
                          verbose=1)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [36]:
# Step 6: Set a Threshold for Reconstruction Error
reconstructions = autoencoder.predict(X_train_normal)
reconstruction_errors = np.mean(np.square(reconstructions - X_train_normal), axis=1)
threshold = np.percentile(reconstruction_errors, 63)




In [37]:
# Step 7: Detect Anomalies on Test Data
test_reconstructions = autoencoder.predict(X_test)
test_reconstruction_errors = np.mean(np.square(test_reconstructions - X_test), axis=1)
y_pred = (test_reconstruction_errors > threshold).astype(int)



In [38]:
# Step 8: Evaluate the Model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5953237410071942
Confusion Matrix:
 [[627 390]
 [ 60  35]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.62      0.74      1017
           1       0.08      0.37      0.13        95

    accuracy                           0.60      1112
   macro avg       0.50      0.49      0.44      1112
weighted avg       0.84      0.60      0.68      1112



In [39]:
results_dict = {"Model":"AutoEncoder",
                "Dataset":"Filtered",
                "Accuracy":accuracy_score(y_test, y_pred),
                "Recall":recall_score(y_test, y_pred)}

results_list.append(results_dict)

### Saving the results to csv

In [42]:
results_df = pd.DataFrame(results_list)
results_df.to_csv("results_to_add.csv", index=False)