In [32]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff
import warnings
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

import xgboost as xgb
import lightgbm as lgb

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

import shap
import lime
from lime import lime_tabular

warnings.filterwarnings("ignore", category=FutureWarning)

In [19]:
df = pd.read_csv('../Data/Cleaned_dataset.csv')

### Train-test split | Normalizing | initial benchmark training

In [34]:
X = df.drop("Depression", axis=1).values
y = df["Depression"].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
with open('../Model/scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

score = model.score(X_test_scaled, y_test)
print(f"Accuracy: {score*100:.2f}%")

Accuracy: 84.61%


In [21]:
y_pred = model.predict(X_test_scaled)
cm = confusion_matrix(y_test, y_pred)

labels = ["Safe", "Depressed"]

fig = ff.create_annotated_heatmap(
    z=cm,
    x=labels,
    y=labels,
    colorscale="Viridis", 
    showscale=True,
)

fig.update_layout(
    title="Confusion Matrix",
    xaxis_title="Predicted Values",
    yaxis_title="True Values",
    xaxis=dict(tickmode="array", tickvals=[0, 1], ticktext=labels),
    yaxis=dict(tickmode="array", tickvals=[0, 1], ticktext=labels),
)

# Show the figure
fig.show()

In [22]:
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "XGBoost": xgb.XGBClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42, verbose=-1)
}

accuracy_results = {}

for name, model in models.items():
    model.fit(X_train_scaled, y_train)  
    y_pred = model.predict(X_test_scaled) 
    accuracy = accuracy_score(y_test, y_pred)  
    accuracy_results[name] = accuracy 

accuracy_results_ordered = dict(sorted(accuracy_results.items(), key=lambda item: item[1], reverse=True))

print("Model Accuracies:")
for model_name, accuracy in accuracy_results_ordered.items():
    print(f"{model_name}: {accuracy:.4f}")

fig = go.Figure()

fig.add_trace(go.Bar(
    y=list(accuracy_results_ordered.keys()),
    x=list(accuracy_results_ordered.values()),
    orientation='h',
    marker=dict(
        color=[i for i in range(len(accuracy_results_ordered))],
        colorscale='Plasma',  
        showscale=False
    ),
    text=[f'{v*100:.2f}%' for v in accuracy_results_ordered.values()],
    textposition='outside'
))

fig.update_layout(
    title='Model Accuracy Comparison',
    xaxis_title='Accuracy',
    yaxis_title='Model',
    height=600,
    bargap=0.2,
    bargroupgap=0.1,
    yaxis={'categoryorder':'total ascending'}
)

fig.show()

Model Accuracies:
Gradient Boosting: 0.8495
SVM: 0.8481
LightGBM: 0.8479
Logistic Regression: 0.8461
Random Forest: 0.8430
XGBoost: 0.8419
Naive Bayes: 0.8378
K-Nearest Neighbors: 0.8253
Decision Tree: 0.7692


#### As we can see, Gradient boosting was the winner here. Nevertheless we now try to utilise neural networks as well to see if there is any improvement

In [23]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
rf_score = rf_model.score(X_test_scaled, y_test)

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train_scaled, y_train)
xgb_score = xgb_model.score(X_test_scaled, y_test)

# Neural Network with Keras
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y_train)), activation='softmax')
])

nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train_scaled, y_train_encoded, epochs=50, batch_size=32, validation_split=0.2, verbose=0)
nn_score = nn_model.evaluate(X_test_scaled, y_test_encoded, verbose=0)[1]

# Print Accuracy Scores
print(f"Random Forest Accuracy: {rf_score:.4f}")
print(f"XGBoost Accuracy: {xgb_score:.4f}")
print(f"Neural Network Accuracy: {nn_score:.4f}")




Parameters: { "use_label_encoder" } are not used.



Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



Random Forest Accuracy: 0.8430
XGBoost Accuracy: 0.8419
Neural Network Accuracy: 0.8465


In [30]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import label_binarize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)

nn_model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y_train)), activation='softmax')
])

nn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
nn_model.fit(X_train_scaled, y_train_encoded, epochs=50, batch_size=32, validation_split=0.2, verbose=0)

y_pred = nn_model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred, axis=1)

accuracy = accuracy_score(y_test, y_pred_classes)
precision = precision_score(y_test, y_pred_classes, average='weighted')
recall = recall_score(y_test, y_pred_classes, average='weighted')
f1 = f1_score(y_test, y_pred_classes, average='weighted')

y_test_binarized = label_binarize(y_test, classes=np.unique(y_test))
if y_test_binarized.shape[1] == 1:
    y_test_binarized = np.hstack((1-y_test_binarized, y_test_binarized))

roc_auc_values = []
for i in range(len(np.unique(y_test))):
    try:
        roc_auc = roc_auc_score(y_test_binarized[:, i], y_pred[:, i])
        roc_auc_values.append(roc_auc)
    except ValueError:
        roc_auc_values.append(np.nan)

roc_auc = np.nanmean(roc_auc_values) if np.any(np.isfinite(roc_auc_values)) else np.nan

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")
print(f"ROC-AUC: {roc_auc:.4f}")

nn_model.save('../Model/mental_health_model.h5')
print("Model saved as ../Model/mental_health_model.h5")


Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead.



[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  




Accuracy: 0.8481
Precision: 0.8471
Recall: 0.8481
F1-score: 0.8455
ROC-AUC: 0.9138
Model saved as ../Model/mental_health_model.h5


#### Neural Network accuracy came same as the gradient boosting one. So we will take into account the Neural Network with a minute jump in accuracy.