In [21]:
import os
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    roc_auc_score, roc_curve, auc, _scorer
)
from sklearn.tree import export_graphviz
import xgboost as xgb
from xgboost import XGBClassifier as xgbclass
from scipy import stats
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    make_scorer, precision_score, recall_score, f1_score,
    classification_report
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.feature_selection import RFECV
import pickle

# Base directory for every file this notebook reads or writes (the parquet
# datasets and the pickled models). Paths are built with plain string
# concatenation (`root_path + "name"`), so the trailing slash is required.
root_path = "../../Data/GoogleDrive/"

In [22]:
# Read the train/test feature and label tables from their parquet files
# (same read order as the names on the left-hand side).
X_train, X_test, y_train, y_test = (
    pd.read_parquet(root_path + parquet_name)
    for parquet_name in (
        "X_train.parquet",
        "X_test.parquet",
        "y_train.parquet",
        "y_test.parquet",
    )
)

In [23]:
# Tune a Random Forest with an exhaustive grid search (5-fold CV, scored by
# ROC AUC), then predict on the test set with the best estimator found.
# The grid below has 3*3*3*3*2 = 162 combinations, each fitted on 5 folds, so
# this cell is slow; put the `%%time` cell magic on its first line to time it.

# Base estimator; the fixed random_state keeps the forests reproducible.
rf_classifier = RandomForestClassifier(random_state=69)

# Hyperparameter grid to search over
param_grid = {
    'n_estimators': [300, 600, 1000],
    'max_depth': [10, 20, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2']
}

# Grid search configuration
grid_search = GridSearchCV(estimator=rf_classifier,
                           param_grid=param_grid,
                           cv=5,
                           scoring='roc_auc',
                           n_jobs=8)  # run the fits on 8 parallel workers

# Flatten the labels to a 1-D array before fitting. np.asarray is used instead
# of DataFrame.to_numpy() so the cell stays re-runnable: after the first run
# y_train is already an ndarray, which has no .to_numpy() method and would
# raise AttributeError on a second execution.
y_train = np.asarray(y_train).ravel()

# Run the search to find the best parameters
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated score
best_rf_params = grid_search.best_params_
best_rf_score = grid_search.best_score_

print("Best Parameters:", best_rf_params)
print("Best AUC ROC Score:", best_rf_score)

# Use the best model to make predictions on the testing data.
# `rf_predcitions_prob` keeps its original (misspelt) name because the
# evaluation cell reads it; it holds column 1 of predict_proba.
best_rf_classifier = grid_search.best_estimator_
rf_predictions = best_rf_classifier.predict(X_test)
rf_predcitions_prob = best_rf_classifier.predict_proba(X_test)[:, 1]



Best Parameters: {'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 1000}
Best AUC ROC Score: 0.8395325813034636


In [24]:
# Persist the whole GridSearchCV object (not only its best estimator) as a pickle.
rf_model_path = root_path + "rf_model_2.pkl"
with open(rf_model_path, 'wb') as pickle_out:
    pickle.dump(grid_search, pickle_out)

In [25]:
# Score the tuned Random Forest on the test set.
# Accuracy / precision / recall / F1 and the classification report use the hard
# class predictions; ROC AUC uses the predicted probabilities instead.
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_f1 = f1_score(y_test, rf_predictions)
rf_auc_roc = roc_auc_score(y_test, rf_predcitions_prob)
rf_classification_report = classification_report(y_test, rf_predictions)

# Print the scalar metrics, one per line, in a fixed order.
for metric_label, metric_value in (
    ("Random Forest Accuracy:", rf_accuracy),
    ("Random Forest Precision:", rf_precision),
    ("Random Forest Recall:", rf_recall),
    ("Random Forest F1 Score:", rf_f1),
    ("Random Forest AUC ROC:", rf_auc_roc),
):
    print(metric_label, metric_value)

# Per-class breakdown
print("Random Forest Classification Report:")
print(rf_classification_report)

Random Forest Accuracy: 0.801519936833794
Random Forest Precision: 0.6545293072824157
Random Forest Recall: 0.43643900513225425
Random Forest F1 Score: 0.5236854571293226
Random Forest AUC ROC: 0.840629048578763
Random Forest Classification Report:
              precision    recall  f1-score   support

         0.0       0.83      0.92      0.87     30396
         1.0       0.65      0.44      0.52     10132

    accuracy                           0.80     40528
   macro avg       0.74      0.68      0.70     40528
weighted avg       0.79      0.80      0.79     40528



# New Models

In [26]:
# Unpickle the "fixed" Random Forest and XGBoost model objects from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load pickle files that come from a trusted source.
rf_fixed_path = root_path + "rf_model_fixed.pkl"
xgboost_fixed_path = root_path + "xgboost_model_fixed.pkl"

with open(rf_fixed_path, 'rb') as rf_pickle:
    rf_model_fixed = pickle.load(rf_pickle)

with open(xgboost_fixed_path, 'rb') as xgboost_pickle:
    xgboost_model_fixed = pickle.load(xgboost_pickle)

In [31]:
# Show the best hyperparameters stored on each loaded model object
# (Random Forest first, then XGBoost).
for heading, fitted_search in (
    ("Best Parameters for Random Forest:", rf_model_fixed),
    ("Best Parameters for XGBoost:", xgboost_model_fixed),
):
    print(heading)
    print(fitted_search.best_params_)

Best Parameters for Random Forest:
{'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 2, 'n_estimators': 1000}
Best Parameters for XGBoost:
{'learning_rate': 0.1, 'max_depth': 5, 'subsample': 0.7}


In [28]:
# Metrics for the fixed models
def _evaluate_classifier(model, X, y_true):
    """Score a fitted binary classifier on one dataset.

    Parameters
    ----------
    model : object exposing ``predict`` and ``predict_proba``.
    X : features passed unchanged to ``model``.
    y_true : ground-truth labels for ``X``.

    Returns
    -------
    tuple
        ``(predictions, probabilities, accuracy, precision, recall, f1,
        auc_roc, classification_report)`` where ``probabilities`` is
        column 1 of ``predict_proba`` and is what ROC AUC is computed from;
        every other metric uses the hard ``predictions``.
    """
    predictions = model.predict(X)
    probabilities = model.predict_proba(X)[:, 1]
    return (
        predictions,
        probabilities,
        accuracy_score(y_true, predictions),
        precision_score(y_true, predictions),
        recall_score(y_true, predictions),
        f1_score(y_true, predictions),
        roc_auc_score(y_true, probabilities),
        classification_report(y_true, predictions),
    )


# Both models go through the same helper so they are scored identically.
# The variable names (including the misspelt `*_predcitions_prob_fixed`) are
# unchanged because later cells read them.
(
    rf_predictions_fixed,
    rf_predcitions_prob_fixed,
    rf_accuracy_fixed,
    rf_precision_fixed,
    rf_recall_fixed,
    rf_f1_fixed,
    rf_auc_roc_fixed,
    rf_classification_report_fixed,
) = _evaluate_classifier(rf_model_fixed, X_test, y_test)

(
    xgboost_predictions_fixed,
    xgboost_predcitions_prob_fixed,
    xgboost_accuracy_fixed,
    xgboost_precision_fixed,
    xgboost_recall_fixed,
    xgboost_f1_fixed,
    xgboost_auc_roc_fixed,
    xgboost_classification_report_fixed,
) = _evaluate_classifier(xgboost_model_fixed, X_test, y_test)

{'Accuracy': 0.8395430319778918, 'Precision': 0.7378424433084284, 'Recall': 0.5555665219107777, 'F1 Score': 0.6338607060413265, 'AUC ROC': 0.8963825743377312}
{'Accuracy': 0.8046535728385313, 'Precision': 0.6440744113438273, 'Recall': 0.4886498223450454, 'F1 Score': 0.5556989730063415, 'AUC ROC': 0.8478938048682513}


In [32]:
# Print the per-class reports: Random Forest first, then XGBoost.
for report_text in (rf_classification_report_fixed, xgboost_classification_report_fixed):
    print(report_text)

              precision    recall  f1-score   support

         0.0       0.86      0.93      0.90     30396
         1.0       0.74      0.56      0.63     10132

    accuracy                           0.84     40528
   macro avg       0.80      0.74      0.77     40528
weighted avg       0.83      0.84      0.83     40528

              precision    recall  f1-score   support

         0.0       0.84      0.91      0.87     30396
         1.0       0.64      0.49      0.56     10132

    accuracy                           0.80     40528
   macro avg       0.74      0.70      0.72     40528
weighted avg       0.79      0.80      0.80     40528



In [30]:
# Separate one-row summary tables for the metrics of each fixed model.

# Collect the scalar metrics per model (key order becomes the column order).
rf_metrics_fixed = {
    "Accuracy": rf_accuracy_fixed,
    "Precision": rf_precision_fixed,
    "Recall": rf_recall_fixed,
    "F1 Score": rf_f1_fixed,
    "AUC ROC": rf_auc_roc_fixed
}
xgboost_metrics_fixed = {
    "Accuracy": xgboost_accuracy_fixed,
    "Precision": xgboost_precision_fixed,
    "Recall": xgboost_recall_fixed,
    "F1 Score": xgboost_f1_fixed,
    "AUC ROC": xgboost_auc_roc_fixed
}

# Each dict becomes a single labelled row.
rf_metrics_fixed_df = pd.DataFrame([rf_metrics_fixed], index=["Random Forest"])
xgboost_metrics_fixed_df = pd.DataFrame([xgboost_metrics_fixed], index=["XGBoost"])

# Random Forest table, then XGBoost table
print(rf_metrics_fixed_df)
print(xgboost_metrics_fixed_df)

               Accuracy  Precision    Recall  F1 Score   AUC ROC
Random Forest  0.839543   0.737842  0.555567  0.633861  0.896383
         Accuracy  Precision   Recall  F1 Score   AUC ROC
XGBoost  0.804654   0.644074  0.48865  0.555699  0.847894
