In [38]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import pickle
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from mlflow.models.signature import infer_signature


In [39]:
# Step 2: Load the Excel dataset
data = pd.read_excel("Dataset.xlsx")  # Change path if needed

# Quick view
data.head()


Unnamed: 0,Timestamp,Gender,Age,Your major,"Have you ever been diagnosed with a mental health condition by a professional (doctor, therapist, etc.)?",Have you ever received treatment/support for a mental health problem?,When have you usually gone to bed in the past month?,How long has it taken you to fall asleep each night in the past month?,What time have you usually gotten up in the morning in the past month?,How many hours of actual sleep did you get on an average for the past month? (maybe different from the number of hours spent in bed),...,"During the past month, how often have you had trouble sleeping because you: (choose an option that indicate the most accurate reply for the majority of days and nights in the past month) [Feel too hot]","During the past month, how often have you had trouble sleeping because you: (choose an option that indicate the most accurate reply for the majority of days and nights in the past month) [Have bad dreams]","During the past month, how often have you had trouble sleeping because you: (choose an option that indicate the most accurate reply for the majority of days and nights in the past month) [Have pain]",What is your stress level in these given situations [You have to submit an assignment in less than a day],What is your stress level in these given situations [A week before exams],What is your stress level in these given situations [Asking for an extra ketchup packet at a restaurant],What is your stress level in these given situations [Meeting a new person ],What is your stress level in these given situations [Asking for help],What is your stress level in these given situations [Confronting someone],What is your stress level in these given situations [Doing something without help]
0,2021-11-06 21:55:50.465000,Male,18-20,Mechanical,No,No,9pm-11pm,30 minutes,8 -10 am,7-8 hours,...,Less than once a week,Not during the past month,Not during the past month,moderate,mild,not stressed,not stressed,not stressed,moderate,not stressed
1,2021-11-06 22:00:18.873000,Non-binary,18-20,EEE/ECE,Yes,Yes,1am-3am,More time than 2 hours,after 10 am,7-8 hours,...,Once or twice a week,Three or more times a week,Three or more times a week,severe,moderate,severe,severe,severe,severe,moderate
2,2021-11-06 22:04:49.692000,Non-binary,18-20,Computer Science,Yes,Yes,11pm-1am,30 minutes,before 8 am,7-8 hours,...,Not during the past month,Once or twice a week,Once or twice a week,moderate,moderate,moderate,moderate,moderate,moderate,moderate
3,2021-11-06 22:05:30.780000,Female,18-20,Biotech,No,No,11pm-1am,1 hour,after 10 am,7-8 hours,...,Once or twice a week,Less than once a week,Not during the past month,severe,mild,not stressed,mild,mild,moderate,not stressed
4,2021-11-06 22:07:40.036000,Female,18-20,Computer Science,Yes,No,1am-3am,30 minutes,8 -10 am,more than 8 hours,...,Less than once a week,Less than once a week,Once or twice a week,very severe,very severe,moderate,severe,very severe,very severe,very severe


In [40]:
# Step 3: Clean & map frequency/stress levels

frequency_mapping = {
    'Not during the past month': 0,
    'Less than once a week': 1,
    'Once or twice a week': 2,
    'Three or more times a week': 3
}

stress_level_mapping = {
    'not stressed': 0,
    'mild': 1,
    'moderate': 2,
    'severe': 3,
    'very severe': 4
}
# Convert age ranges to numeric midpoints
age_mapping = {
    '18-20': 19,
    '21-25': 23,
    '26-30': 28,
    '31-35': 33,
    '36-40': 38,
    '41-45': 43,
    '46-50': 48,
    '51-55': 53,
    '56-60': 58,
    '61-65': 63,
    '65+': 68
}

data['Age'] = data['Age'].map(age_mapping)


# Detect columns automatically
frequency_columns = [col for col in data.columns if "During the past month" in col]
stress_columns = [col for col in data.columns if "What is your stress level" in col]

# Map values
for col in frequency_columns:
    data[col] = data[col].map(frequency_mapping)

for col in stress_columns:
    data[col] = data[col].map(stress_level_mapping)


In [41]:
# Step 4: Create target column from stress average

data["average_stress"] = data[stress_columns].mean(axis=1)
data["stress_category"] = pd.cut(
    data["average_stress"],
    bins=[-1, 0.5, 1.5, 2.5, 3.5, 5],
    labels=["very low", "low", "medium", "high", "very high"]
)

# Drop any missing values
data.dropna(inplace=True)


In [42]:
# Step 5: Define features and target

categorical_features = [
    'Gender', 'Your major',
    'Have you ever been diagnosed with a mental health condition by a professional (doctor, therapist, etc.)?',
    'Have you ever received treatment/support for a mental health problem?',
    'When have you usually gone to bed in the past month?',
    'How long has it taken you to fall asleep each night in the past month?',
    'What time have you usually gotten up in the morning in the past month?',
    'How many hours of actual sleep did you get on an average for the past month? (maybe different from the number of hours spent in bed)',
]

numeric_features = ['Age'] + frequency_columns
target = "stress_category"

X = data[categorical_features + numeric_features]
y = data[target]

# Encode target labels
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)


In [43]:
# Step 6: Preprocessing and training pipeline

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", StandardScaler(), numeric_features)
])

pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(class_weight="balanced", random_state=42))
])


In [44]:
# Step 7: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


In [45]:
# Step 8: Train and log using MLflow

mlflow.set_experiment("Mental_Health_Stress_Prediction")

with mlflow.start_run():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # Log parameters & metrics
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_metric("accuracy", acc)
    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1_score", f1)

    # Signature & example for reproducibility
    input_example = X_test.iloc[:5]
    signature = infer_signature(X_test, y_pred)

    mlflow.sklearn.log_model(pipeline, "model", signature=signature, input_example=input_example)

    print("✅ Accuracy:", acc)
    print("✅ Precision:", prec)
    print("✅ Recall:", rec)
    print("✅ F1 Score:", f1)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

✅ Accuracy: 0.6291666666666667
✅ Precision: 0.45751422250316054
✅ Recall: 0.6291666666666667
✅ F1 Score: 0.5185250659630607


In [46]:
# Step 9: Save the final model and encoders

with open("model.pkl", "wb") as f:
    pickle.dump({
        "model": pipeline,
        "label_encoder": le_target,
        "target_labels": le_target.classes_,
        "stress_columns": stress_columns
    }, f)

print("🎉 Model and pipeline saved to model.pkl")


🎉 Model and pipeline saved to model.pkl
