In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

df = pd.read_csv('mental_health_workplace_survey.csv')



X = df.drop(columns=['BurnoutRisk', 'EmployeeID'])
y = df['BurnoutRisk']

# Select categorical columns for one-hot encoding
categorical_cols = ['Gender', 'Country', 'JobRole', 'Department', 'SalaryRange', 'RemoteWork']

# Apply one-hot encoding
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

for col in X.columns:
    if X[col].dtype == 'object':
        X[col] = pd.to_numeric(X[col], errors='coerce')



X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest for feature importances
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Get feature importances
importances = pd.Series(model.feature_importances_, index=X.columns)
top3 = importances.sort_values(ascending=False).head(3)
print("Top 3 Features:\n", top3)

# Now train a model with ONLY those 3
X_train_top3 = X_train[top3.index]
X_test_top3 = X_test[top3.index]

minimal_model = RandomForestClassifier(random_state=42)
minimal_model.fit(X_train_top3, y_train)
y_pred = minimal_model.predict(X_test_top3)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with 3 features: {accuracy:.4f}")

Top 3 Features:
 BurnoutLevel           0.757049
ManagerSupportScore    0.016488
ProductivityScore      0.016377
dtype: float64
Accuracy with 3 features: 1.0000
