In [51]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score
import numpy as np
import joblib  # for saving the best model


## Baseline

In [2]:
df = pd.read_csv("Modeling_4-23.csv")

In [3]:
ohe = OneHotEncoder()
df = pd.get_dummies(df, columns=['Degree'], dtype=int)

In [7]:
X = df.loc[:, df.columns!='Depression']
y = df['Depression']

In [9]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
RFC = RandomForestClassifier()

In [13]:
RFC.fit(X_train, y_train)

In [15]:
y_prediction = RFC.predict(X_test)

In [17]:
print(classification_report(y_test, y_prediction))

              precision    recall  f1-score   support

           0       0.84      0.78      0.80      2316
           1       0.85      0.89      0.87      3253

    accuracy                           0.84      5569
   macro avg       0.84      0.83      0.84      5569
weighted avg       0.84      0.84      0.84      5569



In [21]:
print(classification_report(y_test, y_prediction))

cm = confusion_matrix(y_test, y_prediction)

print("Confusion Matrix (raw array):")
print(cm)

Confusion Matrix (raw array):
[[1799  517]
 [ 355 2898]]


## Binning Degree

In [29]:
df_bin_degree = pd.read_csv("Modeling_4-23.csv")

In [31]:
High_School = ["'Class 12'"]
Bachelor = ['B.Pharm','BSc','BA','BCA','B.Ed','LLB','BE','BHM','B.Com','B.Arch','B.Tech','BBA']
Graduate = ['M.Tech','M.Ed','MSc','M.Pharm','MCA','MA','MBA','MBBS','M.Com','ME','MHM','LLM']
Professional = ['PhD','MD']

In [34]:
def categorize_degree(deg):
    if deg in High_School:
        return 'High School'
    elif deg in Bachelor:
        return 'Bachelor'
    elif deg in Graduate:
        return 'Graduate'
    elif deg in Professional:
        return 'Professional'


df_bin_degree['Education'] = df_bin_degree['Degree'].apply(categorize_degree)

In [36]:
df_bin_degree = df_bin_degree.drop(columns="Degree")

In [42]:
df_bin_degree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27845 entries, 0 to 27844
Data columns (total 31 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   Age                                        27845 non-null  int64  
 1   Academic Pressure                          27845 non-null  int64  
 2   CGPA                                       27845 non-null  float64
 3   Study Satisfaction                         27845 non-null  int64  
 4   Work/Study Hours                           27845 non-null  int64  
 5   Financial Stress                           27845 non-null  int64  
 6   Sleep Duration_encoded                     27845 non-null  int64  
 7   Dietary Habits_encoded                     27845 non-null  int64  
 8   Depression                                 27845 non-null  int64  
 9   Have you ever had suicidal thoughts ?_No   27845 non-null  int64  
 10  Have you ever had suic

In [46]:
ohe_ = OneHotEncoder()
df_bin_degree = pd.get_dummies(df_bin_degree, columns=['Education'], dtype=int)

In [50]:
X_bd = df_bin_degree.loc[:, df_bin_degree.columns!='Depression']
y_bd = df_bin_degree['Depression']

In [52]:
X_train_bd, X_test_bd, y_train_bd, y_test_bd = train_test_split(X_bd, y_bd, test_size=0.2, random_state=42)

In [54]:
RFC = RandomForestClassifier()

In [56]:
RFC.fit(X_train_bd, y_train_bd)

In [58]:
y_prediction_bd = RFC.predict(X_test_bd)

In [60]:
print(classification_report(y_test_bd, y_prediction_bd))

              precision    recall  f1-score   support

           0       0.83      0.78      0.81      2316
           1       0.85      0.89      0.87      3253

    accuracy                           0.84      5569
   macro avg       0.84      0.83      0.84      5569
weighted avg       0.84      0.84      0.84      5569



In [62]:
cm_bd = confusion_matrix(y_test_bd, y_prediction_bd)

print("Confusion Matrix (raw array):")
print(cm_bd)

Confusion Matrix (raw array):
[[1804  512]
 [ 360 2893]]


## Grid Search

In [3]:
df_griddy = pd.read_csv("Modeling_4-23.csv")

In [5]:
ohe = OneHotEncoder()

In [7]:
df_griddy= pd.get_dummies(df_griddy, columns=['Degree'], dtype=int)

In [9]:
X = df_griddy.loc[:, df_griddy.columns!='Depression']
y = df_griddy['Depression']

In [11]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
RFC = RandomForestClassifier()

In [15]:
params = {
    'n_estimators': [100, 300],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'max_features': ['sqrt', 0.8],
    'bootstrap': [True],
    'class_weight': ['balanced'],
    'criterion': ['gini', 'entropy']
}
 
grid_search = GridSearchCV(
    estimator=RFC,
    param_grid=params,
    cv=5,
    scoring='recall',   
    n_jobs=-1,
    verbose=2
)

In [30]:
grid_search.fit(X_train, y_train)

print("\nBest Parameters Found: ", grid_search.best_params_)
print("\nBest Cross-Validation Accuracy: ", grid_search.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits

Best Parameters Found:  {'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best Cross-Validation Accuracy:  0.883972392638037


In [17]:
RFC_Griddy = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced',
    criterion='gini',
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=300,
    random_state=42
)

In [19]:
RFC_Griddy.fit(X_train, y_train)

In [21]:
y_prediction = RFC_Griddy.predict(X_test)

In [23]:
print(classification_report(y_test, y_prediction))

cm = confusion_matrix(y_test, y_prediction)

print("Confusion Matrix (raw array):")
print(cm)

              precision    recall  f1-score   support

           0       0.84      0.77      0.80      2316
           1       0.85      0.89      0.87      3253

    accuracy                           0.84      5569
   macro avg       0.84      0.83      0.84      5569
weighted avg       0.84      0.84      0.84      5569

Confusion Matrix (raw array):
[[1790  526]
 [ 349 2904]]


In [33]:
pipeline_template = Pipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),
    ('RFC_Griddy',RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced',
    criterion='gini',
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=300,
    random_state=42
    ))
])

In [43]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_recall = 0
best_model = None
fold_recalls = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
    print(f"Training fold {fold}...")

    X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

    # Clone a fresh pipeline for each fold
    pipeline = clone(pipeline_template)

    pipeline.fit(X_train, y_train)
    y_prediction = pipeline.predict(X_val)
    recall = recall_score(y_val, y_prediction)
    fold_recalls.append(recall)

    print(f"Fold {fold} Recall: {recall:.4f}")

    # Save the model if it's the best so far
    if recall > best_recall:
        best_recall = recall
        best_model = pipeline

Training fold 1...
Fold 1 Recall: 0.8570
Training fold 2...
Fold 2 Recall: 0.8622
Training fold 3...
Fold 3 Recall: 0.8579
Training fold 4...
Fold 4 Recall: 0.8689
Training fold 5...
Fold 5 Recall: 0.8643


In [53]:
print("\nCross-Validation Recalls:", fold_recalls)
print("Mean Recall:", np.mean(fold_recalls))
print("Best Recall:", best_recall)

# (Optional) Save the best model
joblib.dump(best_model, "best_xgb_model.pkl")
print("Best model saved as 'best_xgb_model.pkl'")


Cross-Validation Recalls: [0.8570113531758208, 0.8622276772015955, 0.8579318809450752, 0.8689379987722529, 0.8643339472068754]
Mean Recall: 0.862088571460324
Best Recall: 0.8689379987722529
Best model saved as 'best_xgb_model.pkl'


In [59]:
# Use the best model from cross-validation to predict on the holdout test set
y_test_pred = best_model.predict(X_test)

# Evaluate properly
from sklearn.metrics import classification_report, confusion_matrix

print("\nClassification Report on Holdout Test Set:")
print(classification_report(y_test, y_test_pred))

cm = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix (raw array):")
print(cm)


Classification Report on Holdout Test Set:
              precision    recall  f1-score   support

           0       0.97      0.96      0.96      2316
           1       0.97      0.98      0.97      3253

    accuracy                           0.97      5569
   macro avg       0.97      0.97      0.97      5569
weighted avg       0.97      0.97      0.97      5569

Confusion Matrix (raw array):
[[2215  101]
 [  70 3183]]


In [61]:
print("X shape:", X.shape)
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

X shape: (27845, 56)
X_train shape: (22276, 56)
X_test shape: (5569, 56)


In [63]:
best_model