In [5]:
# Print the version of the `python` executable found on PATH (shell escape).
# NOTE(review): this may not be the interpreter the kernel runs on — confirm with sys.version if it matters.
!python -V

Python 3.9.23


In [6]:
import pickle

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [7]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory.
dataset_path = '../data/mental_health_dataset.csv'
df = pd.read_csv(dataset_path)

In [8]:
# Preview only the first rows instead of emitting the whole frame as cell output.
df.head()

In [9]:
# Data cleaning: rows with no recorded severity get the explicit label 'Unknown'
severity_filled = df['Severity'].fillna('Unknown')
df['Severity'] = severity_filled

In [10]:
# Encode the Yes/No columns as 1/0.
# The mapping also sends 1 -> 1 and 0 -> 0 so that re-running this cell is a no-op:
# Series.map yields NaN for any value missing from the mapping, so with only the
# 'Yes'/'No' keys a second run would turn every already-encoded value into NaN.
yes_no_to_int = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
for binary_col in ['Mental_Health_Condition', 'Consultation_History']:
    df[binary_col] = df[binary_col].map(yes_no_to_int)

In [11]:
# Columns fed to the model, grouped by kind, plus the column to predict.
# Both groups are plain lists because later cells concatenate them with `+`.
categorical = [
    "Gender",
    "Occupation",
    "Country",
    "Severity",
]
numerical = [
    "Age",
    "Sleep_Hours",
    "Work_Hours",
    "Physical_Activity_Hours",
]
target = "Stress_Level"

In [12]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
split_parts = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_test = split_parts

In [13]:
# Build the training inputs: labels straight from the target column, features via a
# DictVectorizer fitted on one dict per training row.
feature_columns = categorical + numerical
y_train = df_train[target].values

train_dicts = df_train[feature_columns].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [14]:
# Fit a random-forest classifier (100 trees) on the vectorised training features.
# RandomForestClassifier lives in sklearn.ensemble and must be imported in the
# notebook's import cell; without that import this cell raises NameError on a fresh kernel.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Vectorise the held-out rows with the vectoriser already fitted on the training data
# (transform only, no refit) and pull out their labels.
feature_columns = categorical + numerical
test_dicts = df_test[feature_columns].to_dict(orient='records')
y_test = df_test[target].values
X_test = dv.transform(test_dicts)

# Predict a label for every held-out row
y_pred = model.predict(X_test)

In [None]:
# Text summary of prediction quality on the held-out split
report = classification_report(y_test, y_pred)
print(report)

# Pair each vectorised feature name with the forest's importance score, highest first
importance_table = pd.DataFrame({
    'feature': dv.get_feature_names_out(),
    'importance': model.feature_importances_,
})
feature_importances = importance_table.sort_values('importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features
top_features = feature_importances.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=top_features, ax=ax)
ax.set_title('Top 10 Important Features for Stress Level Prediction')
plt.show()