In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, accuracy_score
import numpy as np
import joblib
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [4]:
# Load the modeling dataset; the path is relative to the notebook's working directory.
csv_path = "Modeling_4-23.csv"
df = pd.read_csv(csv_path)

## Encoding

In [7]:
# NOTE(review): `ohe` is instantiated but not used by the cells shown here;
# the actual encoding is done by pandas below.
ohe = OneHotEncoder()

# Expand the categorical 'Degree' column into one 0/1 integer indicator column
# per category; all other columns pass through unchanged.
df = pd.get_dummies(
    data=df,
    columns=['Degree'],
    dtype=int,
)

## Splitting Data

In [10]:
# 'Depression' is the target; every remaining column is used as a feature.
X = df.drop(columns=['Depression'])
y = df['Depression']

# Hold out 20% of the rows for final evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Grid Search (Recall)

In [13]:
# Base estimator for the search. The seed makes bootstrap sampling and feature
# sub-sampling repeatable, so the search result (and every later clone of the
# best estimator) can be reproduced on a fresh kernel.
RFC = RandomForestClassifier(random_state=42)

# Hyper-parameter grid: 2 * 3 * 2 * 2 * 2 * 1 * 1 * 2 = 96 candidate settings.
params = {
    'n_estimators': [100, 300],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'max_features': ['sqrt', 0.8],
    'bootstrap': [True],
    'class_weight': ['balanced'],
    'criterion': ['gini', 'entropy']
}

# Exhaustive search scored by recall (binary recall of the positive class),
# evaluated with 5-fold cross-validation and parallelised over all cores.
grid_search = GridSearchCV(
    estimator=RFC,
    param_grid=params,
    cv=5,
    scoring='recall',
    n_jobs=-1,
    verbose=2
)

In [15]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

print("\nBest Parameters Found: ", grid_search.best_params_)
# best_score_ is the mean cross-validated value of the search's `scoring`
# metric, which is recall here, so it must not be labelled as accuracy.
print("\nBest Cross-Validation Recall: ", grid_search.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits

Best Parameters Found:  {'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best Cross-Validation Accuracy:  0.8844325153374234


In [16]:
# Keep a handle on the estimator built from the best parameter combination
# found by the grid search.
best_RF = grid_search.best_estimator_

# Hard class predictions for the held-out test split.
y_pred_best_RF = best_RF.predict(X_test)

In [17]:
# Per-class precision / recall / F1 of the tuned forest on the held-out test split.
print(classification_report(y_true=y_test, y_pred=y_pred_best_RF))

# Raw counts; per scikit-learn's convention rows are true labels and columns
# are predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred_best_RF)

print("Confusion Matrix (raw array):", cm, sep="\n")

              precision    recall  f1-score   support

           0       0.84      0.77      0.80      2316
           1       0.84      0.89      0.87      3253

    accuracy                           0.84      5569
   macro avg       0.84      0.83      0.83      5569
weighted avg       0.84      0.84      0.84      5569

Confusion Matrix (raw array):
[[1779  537]
 [ 345 2908]]


## Cross Validation (Recall)

In [19]:
# Shuffled, seeded, class-stratified 5-fold splitter.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Recall of the tuned configuration on each fold of the training split;
# cross_val_score fits a fresh clone per fold, so `best_RF` itself is not modified.
best_RF_cv = cross_val_score(
    estimator=best_RF,
    X=X_train,
    y=y_train,
    scoring='recall',
    cv=cv,
)
print(f"Fold Metrics:\n{best_RF_cv}\n")
print(f"Average Score:\n{np.mean(best_RF_cv)}\n")

Fold Metrics:
[0.88842025 0.88343558 0.88995399 0.87730061 0.88305215]

Average Score:
0.8844325153374234



In [20]:
# Manual stratified 5-fold CV on the training split: refit a fresh clone of the
# tuned forest on each fold so that every fold's confusion matrix can be
# inspected, and remember the matrix from the fold with the highest recall.
# recall_score, confusion_matrix, StratifiedKFold, clone and np all come from
# the notebook's top import cell, so they are not re-imported here.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_recall = 0
best_confusion_matrix = None
fold_recalls = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), 1):
    # cv.split yields positional indices, hence .iloc rather than .loc.
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # clone() copies the hyper-parameters only, so each fold starts from an
    # unfitted model and never sees its own validation rows.
    model = clone(best_RF)
    model.fit(X_tr, y_tr)

    y_pred = model.predict(X_val)
    recall = recall_score(y_val, y_pred)
    # NOTE: `cm` rebinds the name that held the test-set confusion matrix
    # earlier in the notebook; after this cell it holds the last fold's matrix.
    cm = confusion_matrix(y_val, y_pred)

    fold_recalls.append(recall)

    print(f"Fold {fold} Recall: {recall:.4f}")
    print(f"Confusion Matrix:\n{cm}\n")

    # The `is None` check guarantees a matrix is recorded even when every
    # fold's recall is 0; a strict `>` against the initial 0 would leave it None.
    if best_confusion_matrix is None or recall > best_recall:
        best_recall = recall
        best_confusion_matrix = cm

print(f"All Fold Recalls: {fold_recalls}")
print(f"Mean Recall: {np.mean(fold_recalls):.4f}")
print(f"Best Fold Recall: {best_recall:.4f}")
print(f"Best Confusion Matrix:\n{best_confusion_matrix}")

Fold 1 Recall: 0.8900
Confusion Matrix:
[[1451  397]
 [ 287 2321]]

Fold 2 Recall: 0.8877
Confusion Matrix:
[[1433  414]
 [ 293 2315]]

Fold 3 Recall: 0.8892
Confusion Matrix:
[[1400  447]
 [ 289 2319]]

Fold 4 Recall: 0.8785
Confusion Matrix:
[[1430  417]
 [ 317 2291]]

Fold 5 Recall: 0.8827
Confusion Matrix:
[[1483  364]
 [ 306 2302]]

All Fold Recalls: [0.8899539877300614, 0.8876533742331288, 0.8891871165644172, 0.8784509202453987, 0.8826687116564417]
Mean Recall: 0.8856
Best Fold Recall: 0.8900
Best Confusion Matrix:
[[1451  397]
 [ 287 2321]]
