In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

df = pd.read_csv('../data/cleaned_survey.csv')

In [8]:
#Features
FEATURES = [
  'Age','Gender','self_employed','family_history','no_employees',
  'remote_work','tech_company','benefits','mental_health_consequence'
]
X = pd.get_dummies(df[FEATURES], drop_first=True)
y = df['RiskLevel']

scaler = StandardScaler()
X['Age'] = scaler.fit_transform(X[['Age']])

In [9]:
#Train/Test Split & Grid Search
X_train, X_test, y_train, y_test = train_test_split(
  X, y, stratify=y, test_size=0.2, random_state=42
)

# Hyperparameter tuning
param_grid = {
  'n_estimators': [100, 200],
  'max_depth': [None, 10, 20],
  'min_samples_leaf': [1, 2, 4]
}
rf = RandomForestClassifier(random_state=42)
grid = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print(classification_report(y_test, grid.predict(X_test)))

Best params: {'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 100}
              precision    recall  f1-score   support

        High       0.00      0.00      0.00        29
         Low       0.80      0.10      0.17        42
    Moderate       0.65      0.99      0.79       128

    accuracy                           0.66       199
   macro avg       0.48      0.36      0.32       199
weighted avg       0.59      0.66      0.54       199



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
best_rf = grid.best_estimator_
joblib.dump(best_rf, '../models/rf_model.pkl')
joblib.dump(scaler,  '../models/scaler.pkl')
print("Models saved.")

Models saved.


In [11]:
# After grid.best_estimator_ is assigned to best_rf
import numpy as np

importances = best_rf.feature_importances_           # array of length n_features
feature_names = X_train.columns                      # the corresponding column names

# Sort features by importance (highest first)
indices = np.argsort(importances)[::-1]

# Print the top 4
for rank in range(4):
    feat = feature_names[indices[rank]]
    imp  = importances[indices[rank]]
    print(f"{rank+1}. {feat} ({imp:.3f})")

1. family_history_Yes (0.175)
2. remote_work_Yes (0.070)
3. mental_health_consequence_Yes (0.068)
4. benefits_Yes (0.066)
