In [1]:
import os
import sys
import random
import numpy as np
import pandas as pd
import matplotlib as mpl

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.calibration import CalibrationDisplay

import joblib

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.feature_selection import SelectKBest, mutual_info_classif, VarianceThreshold, RFE, RFECV
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
)

from sklearn.metrics import (
    accuracy_score, roc_auc_score, brier_score_loss, 
    precision_score, recall_score, f1_score
)
from sklearn.calibration import calibration_curve, CalibratedClassifierCV


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from IPython.display import display

# Pandas configuration 
pd.set_option('display.max_rows', 100)

# To plot pretty figures 
%matplotlib inline
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=10)
mpl.rc('ytick', labelsize=10)

### Create a folder and define the save_fig() function which is used through 
### this notebook to save the figures in hig-res ####

PROJECT_ROOT_DIR = r"C:\Users\aleks\OneDrive - Coventry University\Desktop\Project_Data"
CHAPTER_ID = "project_figures"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Write the current matplotlib figure to the IMAGES_PATH folder.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension) of the saved figure.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is called before saving.
    fig_extension : str, default "png"
        File extension; also passed to ``plt.savefig`` as the output format.
    resolution : int, default 300
        Value passed to ``plt.savefig`` as ``dpi``.
    """
    target = os.path.join(IMAGES_PATH, ".".join((fig_id, fig_extension)))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [2]:
# Load the train/test splits persisted by an earlier step (the "_MI" suffix
# presumably refers to the mutual-information feature selection -- confirm).
# The files are read in the order listed.
X_train_final, y_train_final, X_test_final, y_test_final = map(
    joblib.load,
    (
        'X_train_final_MI.joblib',
        'y_train_final_MI.joblib',
        'X_test_final_MI.joblib',
        'y_test_final_MI.joblib',
    ),
)

In [3]:
# Sanity check: print the shape of each loaded split (features first, then targets)
for split in (X_train_final, X_test_final, y_train_final, y_test_final):
    print(split.shape)

(20409, 20)
(5103, 20)
(20409,)
(5103,)


In [4]:
# Candidate classifiers to compare; each one is constructed with random_state=42.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=1000, class_weight='balanced', random_state=42
    ),
    "Random Forest": RandomForestClassifier(
        random_state=42, class_weight='balanced'
    ),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "MLP": MLPClassifier(max_iter=1000, random_state=42),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss'),
}

# A single shuffled, stratified 5-fold splitter is shared by every model.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for name, model in models.items():
    print(f"Cross validation: {name}")

    # Per-fold scores, keyed by the column name they get in the summary row.
    fold_scores = {
        "AUC Mean": [],
        "F1 Mean": [],
        "Precision Mean": [],
        "Recall Mean": [],
        "Accuracy Mean": [],
        "Brier Score Mean": [],
    }

    for train_idx, val_idx in cv.split(X_train_final, y_train_final):
        # Carve the training data into this fold's fit and validation parts
        X_tr, y_tr = X_train_final.iloc[train_idx], y_train_final.iloc[train_idx]
        X_val, y_val = X_train_final.iloc[val_idx], y_train_final.iloc[val_idx]

        # Oversample with SMOTE on the fit part only; the validation part
        # is left untouched so the scores reflect the original class balance
        smote = SMOTE(random_state=42)
        X_tr_resampled, y_tr_resampled = smote.fit_resample(X_tr, y_tr)

        # Fit on the resampled data, then predict labels and the
        # positive-class probability (column 1) for the validation part
        model.fit(X_tr_resampled, y_tr_resampled)
        y_pred = model.predict(X_val)
        y_proba = model.predict_proba(X_val)[:, 1]

        # Probability-based metrics use y_proba, label-based ones use y_pred
        fold_scores["AUC Mean"].append(roc_auc_score(y_val, y_proba))
        fold_scores["F1 Mean"].append(f1_score(y_val, y_pred))
        fold_scores["Precision Mean"].append(precision_score(y_val, y_pred))
        fold_scores["Recall Mean"].append(recall_score(y_val, y_pred))
        fold_scores["Accuracy Mean"].append(accuracy_score(y_val, y_pred))
        fold_scores["Brier Score Mean"].append(brier_score_loss(y_val, y_proba))

    # One summary row per model: the mean of each metric over the folds
    summary_row = {"Model": name}
    summary_row.update(
        {column: np.mean(scores) for column, scores in fold_scores.items()}
    )
    results.append(summary_row)

Cross validation: Logistic Regression
Cross validation: Random Forest
Cross validation: AdaBoost
Cross validation: MLP
Cross validation: XGBoost


In [5]:
# Collect the per-model cross-validation summaries into a single table
results_df = pd.DataFrame(results)
print(results_df)

# Save the table in the project folder. The path is built from
# PROJECT_ROOT_DIR (defined in the setup cell) so the location is declared
# in one place only; the file name carries no leading quote character.
results_df.to_csv(
    os.path.join(PROJECT_ROOT_DIR, "model_cv_results_SMOTE.csv"),
    index=False,
)

                 Model  AUC Mean   F1 Mean  Precision Mean  Recall Mean  \
0  Logistic Regression  0.812711  0.541487        0.445465     0.690531   
1        Random Forest  0.816701  0.503982        0.622524     0.423881   
2             AdaBoost  0.820071  0.558803        0.500375     0.633674   
3                  MLP  0.769147  0.495754        0.413938     0.618462   
4              XGBoost  0.810597  0.485523        0.613798     0.401996   

   Accuracy Mean  Brier Score Mean  
0       0.759076          0.166199  
1       0.828311          0.125295  
2       0.793866          0.205761  
3       0.740947          0.179477  
4       0.824587          0.127050  
