In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.base import clone
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import recall_score, accuracy_score
import numpy as np
import joblib
from sklearn.model_selection import cross_val_score, StratifiedKFold
from functions_pt2 import get_plots, val_count, drop_rows_by_values, swap_columns, encode_labeling, plot_against_target


In [3]:
df = pd.read_csv("clean_file.csv")

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [7]:
df['Financial Stress'].unique()

array([1, 2, 5, 3, 4])

In [9]:
df = drop_rows_by_values(df, 'Financial Stress', ['?'])

## Feature Engineering

In [11]:
city_to_state = {
    'Agra': 'Uttar Pradesh',                
    'Ahmedabad': 'Gujarat',
    'Bangalore': 'Karnataka',
    'Bhopal': 'Madhya Pradesh',
    'Chennai': 'Tamil Nadu',
    'Delhi': 'Delhi',
    'Faridabad': 'Haryana',
    'Ghaziabad': 'Uttar Pradesh',
    'Hyderabad': 'Telangana',               
    'Indore': 'Madhya Pradesh',
    'Jaipur': 'Rajasthan',
    'Kalyan': 'Maharashtra',
    'Kanpur': 'Uttar Pradesh',              
    'Kolkata': 'West Bengal',
    'Lucknow': 'Uttar Pradesh',             
    'Ludhiana': 'Punjab',                   
    'Meerut': 'Uttar Pradesh',
    'Mumbai': 'Maharashtra',
    'Nagpur': 'Maharashtra',
    'Nashik': 'Maharashtra',
    'Patna': 'Bihar',
    'Pune': 'Maharashtra',
    'Rajkot': 'Gujarat',
    'Srinagar': 'Jammu and Kashmir',        
    'Surat': 'Gujarat',
    'Thane': 'Maharashtra',                 
    'Vadodara': 'Gujarat',
    'Varanasi': 'Uttar Pradesh',
    'Vasai-Virar': 'Maharashtra',           
    'Visakhapatnam': 'Andhra Pradesh',
}

## Creating a new column called State
df['State'] = df['City'].map(city_to_state)

In [13]:
df = drop_rows_by_values(df, 'City', ["Saanvi", "Bhavna", "Harsha", "Rashi", "Nandini", "Nalini", "Mihir", "Nalyan", 
                                      "Mira", "Kibara", "Reyansh", "Harsh", "Gaurav", "Vaanya"])

In [11]:
df = df.drop(columns="City")

In [35]:
df['Has_Job'] = ((df['Work Pressure'] > 0) |
                         (df['Job Satisfaction'] != 0)).astype(int)


In [37]:
df['Work_Stress'] = 0  # Default to 0
working_mask = df['Has_Job'] == 1
df.loc[working_mask, 'Work_Stress'] = (
    df.loc[working_mask, 'Work Pressure'] -
    df.loc[working_mask, 'Job Satisfaction']
)


In [15]:
degree_map = {
    # Pre-college
    'Class 12': 1,

    # Bachelor's Degrees
    'B.Ed': 2,
    'B.Com': 2,
    'BCA': 2,
    'B.Arch': 2,
    'B.Tech': 2,
    'BHM': 2,
    'B.Pharm': 2,
    'BSc': 2,
    'BBA': 2,
    'BA': 2,
    'BE': 2,
    'LLB': 2,
    'MBBS': 2,  # Bachelor's in Medicine

    # Master's Degrees
    'MSc': 3,
    'M.Tech': 3,
    'MCA': 3,
    'M.Ed': 3,
    'M.Com': 3,
    'MA': 3,
    'MBA': 3,
    'ME': 3,
    'M.Pharm': 3,
    'MHM': 3,
    'LLM': 3,
    'MD': 3,  # Usually considered postgrad specialization in medicine

    # Doctoral
    'PhD': 4
}

df['Degree_Level'] = df['Degree'].map(degree_map)

In [39]:
#Feature engineering
df['Age_Education_Gap'] = df['Age'] / df['Degree_Level']
df['Rest_Balance'] = df['Sleep_Score'] - df['Work/Study Hours']
df['Stress_Load'] = df['Academic Pressure'] + df['Financial Stress']
df['Wellness_Score'] = df['Diet_Score'] + df['Sleep_Score']

In [45]:
pd.set_option('display.max_columns', None)

df.head()

Unnamed: 0,Age,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Degree_Level,Gender_Female,Gender_Male,State_Andhra Pradesh,State_Bihar,State_Delhi,State_Gujarat,State_Haryana,State_Jammu and Kashmir,State_Karnataka,State_Madhya Pradesh,State_Maharashtra,State_Punjab,State_Rajasthan,State_Tamil Nadu,State_Telangana,State_Uttar Pradesh,State_West Bengal,Suicidal_Thoughts,Sleep_Score,Degree_'Class 12',Degree_B.Arch,Degree_B.Com,Degree_B.Ed,Degree_B.Pharm,Degree_B.Tech,Degree_BA,Degree_BBA,Degree_BCA,Degree_BE,Degree_BHM,Degree_BSc,Degree_LLB,Degree_LLM,Degree_M.Com,Degree_M.Ed,Degree_M.Pharm,Degree_M.Tech,Degree_MA,Degree_MBA,Degree_MBBS,Degree_MCA,Degree_MD,Degree_ME,Degree_MHM,Degree_MSc,Degree_PhD,Diet_Score,Has_Job,Work_Stress,Age_Education_Gap,Rest_Balance,Stress_Load,Wellness_Score
0,33.0,5.0,0.0,8.97,2.0,0.0,3.0,1,0,1,2.0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,16.5,-2.0,6.0,2
1,24.0,2.0,0.0,5.9,5.0,0.0,3.0,2,1,0,2.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,12.0,-2.0,4.0,1
2,31.0,3.0,0.0,7.03,5.0,0.0,9.0,1,1,0,2.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,15.5,-9.0,4.0,1
3,28.0,3.0,0.0,5.59,2.0,0.0,4.0,5,1,1,2.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,14.0,-2.0,8.0,2
4,25.0,4.0,0.0,8.13,3.0,0.0,1.0,1,0,0,3.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,8.333333,0.0,5.0,1


## Encoding

In [19]:
ohe = OneHotEncoder()

In [21]:
df = pd.get_dummies(df, columns=['Gender'], dtype=int)

In [23]:
df = pd.get_dummies(df, columns=['State'], dtype=int)

In [25]:
yesno = {
    'Yes': 1,
    'No': 0
}

df['Suicidal_Thoughts'] = df['Have you ever had suicidal thoughts ?'].map(yesno)
df['Family History of Mental Illness'] = df['Family History of Mental Illness'].map(yesno)


In [43]:
# Drop irrelevant columns
df = df.drop(columns=['id', "Profession", "Sleep Duration","Dietary Habits","Have you ever had suicidal thoughts ?","City"])  # Drop identifiers & geodata

In [27]:
sleep_map = {
    "'7-8 hours'": 2,
    "'5-6 hours'": 1,
    "'Less than 5 hours'": 0,
    "'More than 8 hours'": 0
}

df['Sleep_Score'] = df['Sleep Duration'].map(sleep_map)

In [29]:
df = pd.get_dummies(df, columns=['Degree'], dtype=int)

In [31]:
diet_map = {
    'Healthy': 1,
    'Moderate': 0,
    'Unhealthy': -1
}

df['Diet_Score'] = df['Dietary Habits'].map(diet_map)


## Splitting Data

In [47]:
X = df.loc[:, df.columns!='Depression']
y = df['Depression']

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=42)

## Grid Search (Recall)

In [49]:
RFC = RandomForestClassifier()

params = {
    'n_estimators': [100, 300],
    'max_depth': [None, 20, 30],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 4],
    'max_features': ['sqrt', 0.8],
    'bootstrap': [True],
    'class_weight': ['balanced'],
    'criterion': ['gini', 'entropy']
}
 
grid_search = GridSearchCV(
    estimator=RFC,
    param_grid=params,
    cv=5,
    scoring='recall',   
    n_jobs=-1,
    verbose=2
)

In [51]:
grid_search.fit(X_train, y_train)

print("\nBest Parameters Found: ", grid_search.best_params_)
print("\nBest Cross-Validation Accuracy: ", grid_search.best_score_)

Fitting 5 folds for each of 96 candidates, totalling 480 fits

Best Parameters Found:  {'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}

Best Cross-Validation Accuracy:  0.8831598409910326


In [53]:
best_RF = grid_search.best_estimator_

y_pred_best_RF = best_RF.predict(X_test)

In [55]:
print(classification_report(y_test, y_pred_best_RF))

cm = confusion_matrix(y_test,  y_pred_best_RF)

print("Confusion Matrix (raw array):")
print(cm)

              precision    recall  f1-score   support

           0       0.83      0.78      0.80      2304
           1       0.85      0.88      0.87      3273

    accuracy                           0.84      5577
   macro avg       0.84      0.83      0.84      5577
weighted avg       0.84      0.84      0.84      5577

Confusion Matrix (raw array):
[[1803  501]
 [ 378 2895]]


## Cross Validation (Recall)

In [61]:
cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)

best_RF_cv = cross_val_score(best_RF, X_train, y_train, cv=cv, scoring='recall')
print(f"Fold Metrics:\n{best_RF_cv}\n")
print(f"Average Score:\n{best_RF_cv.mean()}\n")

Fold Metrics:
[0.87745098 0.88970588 0.87071078 0.88228081 0.88044145 0.88534641
 0.87553648 0.88357843]

Average Score:
0.8806314035807457



In [59]:
from sklearn.metrics import recall_score, confusion_matrix
from sklearn.model_selection import StratifiedKFold
import numpy as np

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_recall = 0
best_confusion_matrix = None
fold_recalls = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_train, y_train), 1):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    model = clone(best_RF)
    model.fit(X_tr, y_tr)
    
    y_pred = model.predict(X_val)
    recall = recall_score(y_val, y_pred)
    cm = confusion_matrix(y_val, y_pred)
    
    fold_recalls.append(recall)
    
    print(f"Fold {fold} Recall: {recall:.4f}")
    print(f"Confusion Matrix:\n{cm}\n")
    
    if recall > best_recall:
        best_recall = recall
        best_confusion_matrix = cm

print(f"All Fold Recalls: {fold_recalls}")
print(f"Mean Recall: {np.mean(fold_recalls):.4f}")
print(f"Best Fold Recall: {best_recall:.4f}")
print(f"Best Confusion Matrix:\n{best_confusion_matrix}")

Fold 1 Recall: 0.8855
Confusion Matrix:
[[1460  391]
 [ 299 2312]]

Fold 2 Recall: 0.8675
Confusion Matrix:
[[1457  394]
 [ 346 2265]]

Fold 3 Recall: 0.8893
Confusion Matrix:
[[1443  408]
 [ 289 2321]]

Fold 4 Recall: 0.8789
Confusion Matrix:
[[1441  410]
 [ 316 2294]]

Fold 5 Recall: 0.8858
Confusion Matrix:
[[1423  428]
 [ 298 2312]]

All Fold Recalls: [0.8854844887016469, 0.8674837227116048, 0.889272030651341, 0.878927203065134, 0.885823754789272]
Mean Recall: 0.8814
Best Fold Recall: 0.8893
Best Confusion Matrix:
[[1443  408]
 [ 289 2321]]
