In [1]:
import pandas as pd
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.metrics import root_mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import mlflow

In [2]:
# Set up MLflow tracking
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mental-health-experiment")

2025/07/13 19:05:27 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/13 19:05:27 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/home/binks/ML_Mental_Health_Project/experiment_tracking/mlruns/2', creation_time=1752352793130, experiment_id='2', last_update_time=1752352793130, lifecycle_stage='active', name='mental-health-experiment', tags={}>

In [3]:
# Data cleaning function
def clean_data(df):
    # Fill missing values for 'Severity' and 'Consultation_History' with 'Unknown'
    df['Severity'] = df['Severity'].fillna('Unknown')
    df['Consultation_History'] = df['Consultation_History'].fillna('Unknown')

    # Handle missing 'Stress_Level' by filling with 'Unknown'
    df['Stress_Level'] = df['Stress_Level'].fillna('Unknown')

    # Convert categorical columns to string types
    categorical_columns = ['Gender', 'Occupation', 'Country', 'Mental_Health_Condition', 'Severity', 'Consultation_History', 'Stress_Level']
    df[categorical_columns] = df[categorical_columns].astype(str)

    # Convert categorical columns to numerical using LabelEncoder
    le = LabelEncoder()
    for col in categorical_columns:
        df[col] = le.fit_transform(df[col])

    # Handle missing numerical values by filling with the median
    df['Sleep_Hours'] = df['Sleep_Hours'].fillna(df['Sleep_Hours'].median())
    df['Work_Hours'] = df['Work_Hours'].fillna(df['Work_Hours'].median())
    df['Physical_Activity_Hours'] = df['Physical_Activity_Hours'].fillna(df['Physical_Activity_Hours'].median())

    # Convert numerical columns to appropriate types
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

    return df

In [4]:
# Load and clean data
df = pd.read_csv('../data/mental_health_dataset.csv')
df_cleaned = clean_data(df)

In [5]:
# Select features and target variable
categorical = ['Gender', 'Occupation', 'Country', 'Mental_Health_Condition', 'Severity', 'Consultation_History', 'Stress_Level']
numerical = ['Age', 'Sleep_Hours', 'Work_Hours', 'Physical_Activity_Hours']
target = 'Mental_Health_Condition'

In [6]:
# Split data into train/test BEFORE vectorization (🟢 CHANGED)
train_df, test_df = train_test_split(df_cleaned, test_size=0.2, random_state=42)

In [7]:
# Combine features into a single dictionary for DictVectorizer
train_df['features'] = train_df[categorical + numerical].apply(lambda x: x.to_dict(), axis=1)
test_df['features'] = test_df[categorical + numerical].apply(lambda x: x.to_dict(), axis=1)

In [8]:
# Prepare features and target for modeling
dv = DictVectorizer()

X_train = dv.fit_transform(train_df['features'].tolist())  # 🟢 Fit on training data
y_train = train_df[target].values

X_test = dv.transform(test_df['features'].tolist())        # 🟢 Transform test data
y_test = test_df[target].values

In [9]:
# Model training with Lasso
lr = Lasso(alpha=0.1)
lr.fit(X_train, y_train)

In [17]:
# Prediction
y_pred = lr.predict(X_test)

In [18]:
# Evaluation (RMSE)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"RMSE on test set: {rmse}")  # 🟢 Updated to show test RMSE


RMSE on test set: 0.20080083969147652


In [19]:
# Save the model and vectorizer
with open('models/mental_health_model_2.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [20]:
# Log experiment in MLflow
with mlflow.start_run():
    mlflow.set_tag("developer", "Yubin")
    mlflow.log_param("train-data-path", "../data/mental_health_dataset.csv") 
    mlflow.log_param("alpha", 0.01)
    mlflow.log_param("train_size", len(y_train))  
    mlflow.log_param("test_size", len(y_test))   

    mlflow.log_metric("rmse", rmse)
    mlflow.log_artifact(local_path="models/mental_health_model.bin", artifact_path="models_pickle")

In [21]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [22]:
# Prepare DMatrix for XGBoost
train_dmatrix = xgb.DMatrix(X_train, label=y_train)
test_dmatrix = xgb.DMatrix(X_test, label=y_test)

In [27]:
def objective(params):
    """Objective function for Hyperopt tuning"""

    # 🟢 Force correct types for XGBoost
    params['max_depth'] = int(params['max_depth'])  # XGBoost requires int
    params['min_child_weight'] = float(params['min_child_weight'])
    params['reg_alpha'] = float(params['reg_alpha'])
    params['reg_lambda'] = float(params['reg_lambda'])
    params['learning_rate'] = float(params['learning_rate'])

    with mlflow.start_run(nested=True):  # 🟢 Nested runs for each trial
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train_dmatrix,
            num_boost_round=1000,
            evals=[(test_dmatrix, 'validation')],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        y_pred = booster.predict(test_dmatrix)
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [28]:
# -------------------------------
# 🎯 Hyperopt Search Space
# -------------------------------
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 20, 1)),  # 🟢 Smaller search space
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',  # 🟢 Updated deprecated 'reg:linear'
    'seed': 42
}


# -------------------------------
# 🚀 Run Hyperopt
# -------------------------------
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=30,  # 🟢 Reduced for faster testing
    trials=Trials()
)
print("✅ Best XGBoost Hyperopt Params:", best_result)

100%|██████████| 30/30 [00:03<00:00,  8.14trial/s, best loss: 2.2063595679355785e-05]
✅ Best XGBoost Hyperopt Params: {'learning_rate': np.float64(0.9477136465696124), 'max_depth': np.float64(8.0), 'min_child_weight': np.float64(4.94465806978135), 'reg_alpha': np.float64(0.007258751375563831), 'reg_lambda': np.float64(0.13869944407674337)}


In [29]:
# -------------------------------
# 🏆 Train Final XGBoost Model
# -------------------------------
best_params = {
    'objective': 'reg:squarederror',
    'seed': 42,
    **best_result
}

# 🟢 Force int for final model too
best_params['max_depth'] = int(best_params['max_depth'])

with mlflow.start_run():
    mlflow.set_tag("model", "xgboost-best")
    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train_dmatrix,
        num_boost_round=1000,
        evals=[(test_dmatrix, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )

    y_pred = booster.predict(test_dmatrix)
    rmse = root_mean_squared_error(y_test, y_pred)
    mlflow.log_metric("rmse", rmse)
    print(f"🏁 Final XGBoost RMSE: {rmse}")

    # 🟢 Save preprocessor and booster
    with open("models/preprocessor_xgb.b", "wb") as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact("models/preprocessor_xgb.b", artifact_path="preprocessor")
    mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")




🏁 Final XGBoost RMSE: 2.2063595679355785e-05




In [31]:
# -----------------------------
# 🎯 OTHER SKLEARN MODELS
# -----------------------------
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):
    with mlflow.start_run():
        model_name = model_class.__name__
        mlflow.set_tag("model", model_name)
        mlflow.log_param("train_size", len(y_train))
        mlflow.log_param("test_size", len(y_test))
        mlflow.log_artifact("models/preprocessor_xgb.b", artifact_path="preprocessor")

        model = model_class()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = root_mean_squared_error(y_test, y_pred)
        mlflow.log_metric("rmse", rmse)
        print(f"{model_name} RMSE: {rmse}")

RandomForestRegressor RMSE: 0.0
GradientBoostingRegressor RMSE: 1.3303961797030009e-05
ExtraTreesRegressor RMSE: 0.0




LinearSVR RMSE: 0.00013956828909661757
