In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
# Read the encoded dataset used for model training and preview its first rows
ENCODED_DATA_PATH = '../data/processed/Mental_Health_Dataset_Encoded.csv'
df = pd.read_csv(ENCODED_DATA_PATH)
df.head()

Unnamed: 0,Gender,Country,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,...,Work_Interest,care_options,Occupation_Corporate,Occupation_Housewife,Occupation_Others,Occupation_Student,Social_Weakness_No,Social_Weakness_Yes,mental_health_interview_No,mental_health_interview_Yes
0,0,United States,0,0,1,1,2,0,2,1,...,0,1,1,0,0,0,0,1,1,0
1,0,United States,0,1,1,1,2,0,2,1,...,0,0,1,0,0,0,0,1,1,0
2,0,United States,0,1,1,1,2,0,2,1,...,0,2,1,0,0,0,0,1,1,0
3,0,United States,0,1,1,1,2,0,2,1,...,0,2,1,0,0,0,0,1,0,0
4,0,United States,0,1,1,1,2,0,2,1,...,0,2,1,0,0,0,0,1,1,0


In [None]:
# First pass: give the tree every available column; the features that do not
# matter get trimmed in the next iteration.
# The target is 'treatment' (whether the individual received treatment or not).
# 'Country' is excluded from the feature matrix together with the target.
X = df.drop(columns=['treatment', 'Country'])
features = X.columns
y = df['treatment']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=414,
)

# Decision tree hyperparameters (fit() returns the estimator itself)
dt = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=15,
    min_samples_leaf=30,
    max_features='sqrt',
    random_state=414,
).fit(X_train, y_train)

# Score the held-out rows: hard labels feed the classification report,
# the second predict_proba column (class 1) feeds the AUC
y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:, 1]

print("DecisionTree Classification Report:")
print(classification_report(y_test, y_pred))

# slightly worse than logistic regression when including all features
dt_all_auc = roc_auc_score(y_test, y_prob)
print(f"\nAUC-ROC: {dt_all_auc:.2f}")

DecisionTree Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.70      0.69     14565
           1       0.70      0.68      0.69     14672

    accuracy                           0.69     29237
   macro avg       0.69      0.69      0.69     29237
weighted avg       0.69      0.69      0.69     29237


AUC-ROC: 0.75


In [32]:
# Pair each column of X with the importance the fitted tree assigned to it,
# ordered from most to least important
importance = (
    pd.DataFrame({'Feature': X.columns, 'Importance': dt.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)
print(importance)  # It seems like the most important features are the first 4 features

                        Feature  Importance
2                family_history    0.493969
10                 care_options    0.291519
0                        Gender    0.151046
17   mental_health_interview_No    0.042082
18  mental_health_interview_Yes    0.012733
1                 self_employed    0.006745
6         Mental_Health_History    0.000273
3                  Days_Indoors    0.000226
7                   Mood_Swings    0.000214
8              Coping_Struggles    0.000197
4                Growing_Stress    0.000196
9                 Work_Interest    0.000158
5                Changes_Habits    0.000148
13            Occupation_Others    0.000106
12         Occupation_Housewife    0.000105
15           Social_Weakness_No    0.000084
16          Social_Weakness_Yes    0.000083
11         Occupation_Corporate    0.000059
14           Occupation_Student    0.000056


In [33]:
# Second pass: refit the tree using only the top-ranked features from the
# importance table (this is feature selection, not probability calibration)
n_top_features = 4
important_features = importance['Feature'].iloc[:n_top_features].tolist()

X = df.loc[:, important_features]
y = df['treatment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=414,
)

# Same depth limit as before, with smaller split/leaf minimums
dt = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=414,
).fit(X_train, y_train)

# Evaluate the refitted tree on the held-out rows
y_pred = dt.predict(X_test)
y_prob = dt.predict_proba(X_test)[:, 1]

print("DecisionTree Classification Report:")
print(classification_report(y_test, y_pred))

dt_top_auc = roc_auc_score(y_test, y_prob)
print(f"\nAUC-ROC: {dt_top_auc:.2f}")

DecisionTree Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.66      0.69     14565
           1       0.69      0.77      0.73     14672

    accuracy                           0.71     29237
   macro avg       0.72      0.71      0.71     29237
weighted avg       0.71      0.71      0.71     29237


AUC-ROC: 0.77


In [None]:
# See if a random forest performs better than the single decision tree.
# It is trained and scored on the same X_train / X_test split as the tree above.
dt2 = RandomForestClassifier(
    n_estimators=1000,     # number of trees in the forest
    max_depth=10,          # limit the depth of every tree
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=414,      # fixed seed keeps the forest repeatable
    n_jobs=-1,             # build and query the 1000 trees on all available cores;
                           # with random_state fixed the fitted trees are the same
)
dt2.fit(X_train, y_train)

# Hard labels for the report, class-1 probabilities for the AUC
y_pred2 = dt2.predict(X_test)
y_prob2 = dt2.predict_proba(X_test)[:, 1]

print("RandomForest Classification Report:")
print(classification_report(y_test, y_pred2))
print(f"\nAUC-ROC: {roc_auc_score(y_test, y_prob2):.2f}")

RandomForest Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.66      0.69     14565
           1       0.69      0.77      0.73     14672

    accuracy                           0.71     29237
   macro avg       0.72      0.71      0.71     29237
weighted avg       0.71      0.71      0.71     29237


AUC-ROC: 0.77


In [35]:
# See if gradient boosting performs better.
# It is trained and scored on the same X_train / X_test split as the models above.
dt3 = GradientBoostingClassifier(
    n_estimators=1000,     # number of boosting stages
    learning_rate=0.05,    # shrinks the contribution of each tree
    subsample=0.8,         # fraction of samples used to fit each individual tree
    max_depth=10,          # limit tree depth
    min_samples_split=10,
    min_samples_leaf=4,
    max_features='sqrt',
    random_state=414,
).fit(X_train, y_train)

# NOTE: y_pred / y_prob are rebound here, replacing the decision-tree values
y_pred = dt3.predict(X_test)
y_prob = dt3.predict_proba(X_test)[:, 1]

print("GradientBoost Classification Report:")
print(classification_report(y_test, y_pred))

gb_auc = roc_auc_score(y_test, y_prob)
print(f"\nAUC-ROC: {gb_auc:.2f}")


GradientBoost Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.66      0.69     14565
           1       0.69      0.77      0.73     14672

    accuracy                           0.71     29237
   macro avg       0.72      0.71      0.71     29237
weighted avg       0.71      0.71      0.71     29237


AUC-ROC: 0.77
