In [1]:
!python -V

Python 3.9.23


In [2]:
import pandas as pd

In [3]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("mental-health-experiment")

2025/07/09 04:04:35 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/09 04:04:35 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


<Experiment: artifact_location='/home/binks/ML_Mental_Health_Project/notebooks/mlruns/1', creation_time=1752029757507, experiment_id='1', last_update_time=1752029757507, lifecycle_stage='active', name='mental-health-experiment', tags={}>

In [4]:
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [5]:
df = pd.read_csv('../data/mental_health_dataset.csv')

In [6]:
df

Unnamed: 0,User_ID,Age,Gender,Occupation,Country,Mental_Health_Condition,Severity,Consultation_History,Stress_Level,Sleep_Hours,Work_Hours,Physical_Activity_Hours
0,1,36,Non-binary,Sales,Canada,No,Medium,Yes,Medium,7.1,46,5
1,2,34,Female,Education,UK,Yes,,No,Low,7.5,47,8
2,3,65,Non-binary,Sales,USA,Yes,High,No,Low,8.4,58,10
3,4,34,Male,Other,Australia,No,Low,No,Medium,9.8,30,2
4,5,22,Female,Healthcare,Canada,Yes,Low,No,Medium,4.9,62,5
...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,46,Non-binary,Other,Germany,No,,Yes,Low,9.3,50,0
996,997,31,Male,Sales,Canada,No,,No,High,4.5,37,6
997,998,33,Female,IT,Germany,No,High,Yes,Medium,9.7,73,10
998,999,42,Female,Finance,Australia,Yes,,No,Medium,6.5,79,8


In [7]:
print(df.dtypes)

User_ID                      int64
Age                          int64
Gender                      object
Occupation                  object
Country                     object
Mental_Health_Condition     object
Severity                    object
Consultation_History        object
Stress_Level                object
Sleep_Hours                float64
Work_Hours                   int64
Physical_Activity_Hours      int64
dtype: object


In [8]:
# Drop User_ID (not a predictor)
df = df.drop('User_ID', axis=1)

In [9]:
# Encode target column (Yes=1, No=0)
le = LabelEncoder()
df['Mental_Health_Condition'] = le.fit_transform(df['Mental_Health_Condition'])

In [10]:
# Identify categorical columns (object type)
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns to encode:", categorical_cols)

Categorical columns to encode: ['Gender', 'Occupation', 'Country', 'Severity', 'Consultation_History', 'Stress_Level']


In [11]:
# One-hot encode categorical features
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

In [12]:
# Split features and target
X = df.drop('Mental_Health_Condition', axis=1)
y = df['Mental_Health_Condition']

In [13]:
# Train-test split (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [14]:
# Sanity check: all features are numeric
print("\nFeature data types after encoding:\n", X_train.dtypes)


Feature data types after encoding:
 Age                           int64
Sleep_Hours                 float64
Work_Hours                    int64
Physical_Activity_Hours       int64
Gender_Male                    bool
Gender_Non-binary              bool
Gender_Prefer not to say       bool
Occupation_Engineering         bool
Occupation_Finance             bool
Occupation_Healthcare          bool
Occupation_IT                  bool
Occupation_Other               bool
Occupation_Sales               bool
Country_Canada                 bool
Country_Germany                bool
Country_India                  bool
Country_Other                  bool
Country_UK                     bool
Country_USA                    bool
Severity_Low                   bool
Severity_Medium                bool
Consultation_History_Yes       bool
Stress_Level_Low               bool
Stress_Level_Medium            bool
dtype: object


In [15]:
with mlflow.start_run() as run:
    run_id = run.info.run_id
    print(f"🔗 MLflow run ID: {run_id}")

    # -------------------------------
    # 🔥 Train Logistic Regression model
    # -------------------------------
    model = LogisticRegression(max_iter=200, random_state=42)
    model.fit(X_train, y_train)

    # -------------------------------
    # 📊 Evaluate model
    # -------------------------------
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)

    print(f"\n✅ Accuracy: {acc:.4f}, Precision: {prec:.4f}, Recall: {rec:.4f}")

    # -------------------------------
    # 📌 Log parameters and metrics
    # -------------------------------
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", 200)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)

    # -------------------------------
    # 📊 Log confusion matrix as artifact
    # -------------------------------
    cm = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')

    # Save plot locally and log as artifact
    cm_path = "confusion_matrix.png"
    plt.savefig(cm_path)
    mlflow.log_artifact(cm_path)
    plt.close()

    # -------------------------------
    # 💾 Log trained model
    # -------------------------------
    mlflow.sklearn.log_model(model, "model")

    # ==========================================
    # 📦 Register model in Model Registry
    # ==========================================
    model_uri = f"runs:/{run_id}/model"
    registered_model_name = "MentalHealthPredictionModel"

    result = mlflow.register_model(model_uri, registered_model_name)
    print(f"\n📦 Registered model: {registered_model_name} (version {result.version})")

    # ==========================================
    # 🚦 Promote model to Production
    # ==========================================
    client = MlflowClient()
    client.transition_model_version_stage(
        name=registered_model_name,
        version=result.version,
        stage="Production",
        archive_existing_versions=True  # Archive older production models
    )
    print(f"🚀 Promoted model version {result.version} to Production")

🔗 MLflow run ID: 6917559b0518430980c9f1116feaf573

✅ Accuracy: 0.5100, Precision: 0.5000, Recall: 0.6735


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
2025/07/09 04:05:38 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/07/09 04:05:38 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
Successfully registered model 'MentalHealthPredictionModel'.
Created version '1' of model 'MentalHealthPredictionModel'.
  client.transition_model_version_stage(



📦 Registered model: MentalHealthPredictionModel (version 1)
🚀 Promoted model version 1 to Production
