# Library and other import

In [17]:
from controller import fetchAll

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,recall_score,precision_score

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping # Import EarlyStopping
from sklearn.utils import class_weight as sk_class_weight # Để tránh nhầm tên

# Load Data

In [40]:
data = fetchAll()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   id                             2556 non-null   int64 
 1   age                            2556 non-null   int64 
 2   depression                     2556 non-null   bool  
 3   dietary_habits                 2556 non-null   object
 4   family_mental_illness_history  2556 non-null   bool  
 5   financial_stress               2556 non-null   int64 
 6   gender                         2556 non-null   object
 7   sleep_duration                 2556 non-null   object
 8   suicidal_thoughts              2556 non-null   bool  
 9   work_hours                     2556 non-null   int64 
 10  work_pressure                  2556 non-null   int64 
 11  work_satisfaction              2556 non-null   int64 
 12  from_source                    2556 non-null   object
dtypes: 

# Transform

Category Data

In [41]:
data = pd.get_dummies(data, columns=['dietary_habits', 'gender', 'sleep_duration','from_source'], drop_first=False)
data.columns = data.columns.str.replace(' ', '_')

Numerical Data

In [42]:
scaler = StandardScaler()
numerical_cols = ['age','financial_stress','work_hours','work_pressure','work_satisfaction']

data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

data.drop(['id'], axis=1, inplace=True)

In [43]:
data.head()

Unnamed: 0,age,depression,family_mental_illness_history,financial_stress,suicidal_thoughts,work_hours,work_pressure,work_satisfaction,dietary_habits_Healthy,dietary_habits_Moderate,dietary_habits_Unhealthy,gender_Female,gender_Male,sleep_duration_5-6_hours,sleep_duration_7-8_hours,sleep_duration_Less_than_5_hours,sleep_duration_More_than_8_hours,from_source_Professional,from_source_Student
0,-0.900869,False,True,-0.684473,True,0.789215,-0.721512,0.690376,False,True,False,False,True,False,True,False,False,False,True
1,-0.900869,False,True,-1.391062,True,0.258853,0.695457,1.399907,True,False,False,False,True,True,False,False,False,False,True
2,-1.145604,True,False,0.728704,True,1.054397,-1.429997,-0.019154,False,False,True,False,True,True,False,False,False,False,True
3,-1.30876,False,True,-0.684473,True,0.258853,-1.429997,0.690376,False,False,True,False,True,False,False,False,True,False,True
4,-0.656135,False,True,-0.684473,True,-0.536691,-1.429997,1.399907,True,False,False,True,False,False,False,False,True,False,True


# Model

In [44]:
X = data.drop('depression', axis=1)
y= data['depression']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Utils

In [45]:
def getResult(y_test,y_pred):
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  print('Accuracy:',accuracy)
  print('Precision:',precision)
  print('Recall:', recall)
  print('F1 score:',f1)
  print('Confusion matrix:')
  print(confusion_matrix(y_test, y_pred))
  return [accuracy,precision,recall,f1]

## Logistic Regression

In [46]:
lr = LogisticRegression(random_state=0, class_weight='balanced', max_iter=1000)
lr.fit(X_train,y_train)
y_pred=lr.predict(X_test)

In [47]:
lr_result = ['Logistic Regression'] + getResult(y_test,y_pred)

Accuracy: 0.9778357235984355
Precision: 0.8924050632911392
Recall: 1.0
F1 score: 0.9431438127090301
Confusion matrix:
[[609  17]
 [  0 141]]


## Naive Bayes

In [48]:
NB = GaussianNB()
NB.fit(X_train, y_train)
y_pred = NB.predict(X_test)

In [49]:
nb_result = ['Naive Bayes'] + getResult(y_test,y_pred)

Accuracy: 0.81877444589309
Precision: 0.5061728395061729
Recall: 0.5815602836879432
F1 score: 0.5412541254125413
Confusion matrix:
[[546  80]
 [ 59  82]]


## Decision Tree

In [50]:
dstree = DecisionTreeClassifier(criterion='gini', max_depth=10, min_samples_split=2, random_state=42, class_weight='balanced')
dstree.fit(X_train, y_train)
y_pred = dstree.predict(X_test)

In [51]:
dstree_result = ['Decision Tree'] + getResult(y_test,y_pred)

Accuracy: 0.9126466753585397
Precision: 0.7890625
Recall: 0.7163120567375887
F1 score: 0.7509293680297398
Confusion matrix:
[[599  27]
 [ 40 101]]


## Random Forest

In [52]:
rf = RandomForestClassifier(criterion='gini', max_depth=10, min_samples_split=2, n_estimators = 10, random_state=42,  class_weight='balanced')
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

In [53]:
rf_result = ['Random Forest'] + getResult(y_test,y_pred)

Accuracy: 0.9113428943937418
Precision: 0.8924731182795699
Recall: 0.5886524822695035
F1 score: 0.7094017094017094
Confusion matrix:
[[616  10]
 [ 58  83]]


## XGBoost

In [54]:
neg_count = np.sum(y_train == 0)
pos_count = np.sum(y_train == 1)
if pos_count > 0:
    scale_pos_weight_value = neg_count / pos_count
    print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.2f}")
else:
    scale_pos_weight_value = 1 # Giá trị mặc định nếu không có lớp dương
    print("Warning: No positive samples found in y_train. Setting scale_pos_weight to 1.")

Calculated scale_pos_weight: 4.70


In [55]:
xgb = XGBClassifier(random_state=42,
                    scale_pos_weight=scale_pos_weight_value,
                    n_estimators=100, # Có thể tăng/giảm để thử nghiệm
                    max_depth=5,      # Có thể thay đổi
                    # use_label_encoder=False,
                    eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

In [56]:
xgb_result = ['XGBoost'] + getResult(y_test, y_pred_xgb)

Accuracy: 0.9621903520208605
Precision: 0.8943661971830986
Recall: 0.900709219858156
F1 score: 0.8975265017667845
Confusion matrix:
[[611  15]
 [ 14 127]]


## MLP

In [88]:
numerical_cols_mlp = ['age','financial_stress','work_hours','work_pressure','work_satisfaction'] # Kiểm tra lại
categorical_cols_mlp = [col for col in data.columns if col not in numerical_cols_mlp and col != 'depression']

data_mlp = data.copy()

# Tách X và y cho MLP
X_mlp_df = data_mlp.drop('depression', axis=1) # Giữ dạng DataFrame
y_mlp = data_mlp['depression']

# Tách train/test cho MLP (sử dụng cùng random_state và stratify)
X_train_mlp_df, X_test_mlp_df, y_train_mlp, y_test_mlp = train_test_split(X_mlp_df, y_mlp, test_size=0.30, random_state=42, stratify=y_mlp)

# Định dạng lại X thành danh sách các input cho Keras
X_train_list = [X_train_mlp_df[numerical_cols_mlp].values]
X_test_list = [X_test_mlp_df[numerical_cols_mlp].values]
for col in categorical_cols_mlp:
    X_train_list.append(X_train_mlp_df[col].values)
    X_test_list.append(X_test_mlp_df[col].values)

# Tính class_weight cho Keras
class_weights_mlp = sk_class_weight.compute_class_weight(
    'balanced', classes=np.unique(y_train_mlp), y=y_train_mlp
)
class_weight_dict_mlp = dict(enumerate(class_weights_mlp))
print(f"Calculated class weights for Keras: {class_weight_dict_mlp}")

# Xây dựng mô hình MLP (Giữ kiến trúc cơ bản, thêm Dropout)
input_layers = []
embedding_layers = []
input_numeric = layers.Input(shape=(len(numerical_cols_mlp),), name='input_numeric')
input_layers.append(input_numeric)
embedding_layers.append(input_numeric)

for col in categorical_cols_mlp:
    num_unique_values = data_mlp[col].nunique() + 1
    embedding_dim = min(50, int(num_unique_values / 2))
    input_cat = layers.Input(shape=(1,), name=f'input_{col}')
    input_layers.append(input_cat)
    embedding = layers.Embedding(input_dim=num_unique_values, output_dim=embedding_dim, name=f'embedding_{col}')(input_cat)
    embedding = layers.Flatten(name=f'flatten_{col}')(embedding)
    embedding_layers.append(embedding)

merged_inputs = layers.concatenate(embedding_layers, name='concatenate_embeddings')
dense1 = layers.Dense(128, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001), name='dense_1')(merged_inputs) # Thêm L2 reg
bn1 = layers.BatchNormalization(name='batchnorm_1')(dense1)
dropout1 = layers.Dropout(0.4, name='dropout_1')(bn1) # Tăng nhẹ Dropout
dense2 = layers.Dense(64, activation='relu', kernel_regularizer=keras.regularizers.l2(0.001), name='dense_2')(dropout1) # Thêm L2 reg
bn2 = layers.BatchNormalization(name='batchnorm_2')(dense2)
dropout2 = layers.Dropout(0.4, name='dropout_2')(bn2) # Tăng nhẹ Dropout
output_layer = layers.Dense(1, activation='sigmoid', name='output')(dropout2)
model_mlp = keras.Model(inputs=input_layers, outputs=output_layer)
model_mlp.summary()

# Biên dịch mô hình
model_mlp.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), # Giữ Adam, lr mặc định
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(name='precision'), tf.keras.metrics.Recall(name='recall')])

# ----- BỔ SUNG: Early Stopping -----
early_stopping = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=5,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
    start_from_epoch=0,
)
# ----------------------------------

# Huấn luyện mô hình với Early Stopping
EPOCHS = 1000 # Tăng số epochs vì có EarlyStopping
BATCH_SIZE = 64
history = model_mlp.fit(
    X_train_list,
    y_train_mlp,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.2,
    class_weight=class_weight_dict_mlp,
    callbacks=[early_stopping], # Thêm callback EarlyStopping
    verbose=1
)


Calculated class weights for Keras: {0: np.float64(0.6080897348742352), 1: np.float64(2.8128930817610063)}


Epoch 1/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.5792 - loss: 0.8044 - precision: 0.2453 - recall: 0.7497 - val_accuracy: 0.8101 - val_loss: 0.7034 - val_precision: 0.5000 - val_recall: 0.8971
Epoch 2/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7024 - loss: 0.5818 - precision: 0.3600 - recall: 0.9537 - val_accuracy: 0.8743 - val_loss: 0.6309 - val_precision: 0.6237 - val_recall: 0.8529
Epoch 3/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7247 - loss: 0.5627 - precision: 0.3654 - recall: 0.9289 - val_accuracy: 0.8994 - val_loss: 0.5852 - val_precision: 0.7105 - val_recall: 0.7941
Epoch 4/1000
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7921 - loss: 0.4994 - precision: 0.4583 - recall: 0.9431 - val_accuracy: 0.9078 - val_loss: 0.5348 - val_precision: 0.7612 - val_recall: 0.7500
Epoch 5/1000
[1m23/23

In [None]:
y_pred_proba_mlp = model_mlp.predict(X_test_list)
y_pred_mlp = (y_pred_proba_mlp > 0.5).astype(int)
# print(y_pred_mlp)
mlp_result = ['MLP (Keras, ES)'] + getResult(y_test_mlp, y_pred_mlp)

[1m56/56[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Accuracy: 0.9972051425377306
Precision: 0.9905956112852664
Recall: 0.9937106918238994
F1 score: 0.9921507064364207
Confusion matrix:
[[1468    3]
 [   2  316]]


# Summary

In [64]:
all_results_data = [
    lr_result,
    nb_result,
    dstree_result,
    rf_result,
    xgb_result,     # Kết quả XGBoost đã cân bằng
    mlp_result,              # Kết quả MLP với Early Stopping
]

final_results = pd.DataFrame(data=all_results_data,
                             columns=['Model','Accuracy','Precision','Recall','F1'])

print("\n--- Final Comparison Results (Updated) ---")
display(final_results)


--- Final Comparison Results (Updated) ---


Unnamed: 0,Model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.977836,0.892405,1.0,0.943144
1,Naive Bayes,0.818774,0.506173,0.58156,0.541254
2,Decision Tree,0.912647,0.789062,0.716312,0.750929
3,Random Forest,0.911343,0.892473,0.588652,0.709402
4,XGBoost,0.96219,0.894366,0.900709,0.897527
5,"MLP (Keras, ES)",0.777053,0.083333,0.021277,0.033898
