In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import mutual_info_classif


df=pd.read_csv('mental_health_workplace_survey.csv')
df.head()

Unnamed: 0,EmployeeID,Age,Gender,Country,JobRole,Department,YearsAtCompany,WorkHoursPerWeek,RemoteWork,BurnoutLevel,...,CommuteTime,HasMentalHealthSupport,ManagerSupportScore,HasTherapyAccess,MentalHealthDaysOff,SalaryRange,WorkLifeBalanceScore,TeamSize,CareerGrowthScore,BurnoutRisk
0,1001,50,Male,UK,Sales Associate,HR,14,47,No,3.37,...,117,No,3.15,Yes,8,40K-60K,8.82,6,9.2,0
1,1002,36,Male,Germany,Software Engineer,IT,1,59,Hybrid,7.39,...,8,Yes,4.4,Yes,4,80K-100K,2.8,45,8.46,1
2,1003,29,Non-binary,India,IT Admin,IT,13,59,Hybrid,7.1,...,75,No,3.63,No,6,80K-100K,7.28,7,7.96,1
3,1004,42,Male,Australia,HR Specialist,IT,15,31,Yes,4.18,...,43,Yes,4.5,Yes,9,60K-80K,1.31,11,8.9,0
4,1005,40,Male,Brazil,Customer Support,Support,6,34,Yes,8.28,...,58,Yes,5.51,Yes,6,<40K,1.17,18,8.88,1


In [17]:
from sklearn.preprocessing import LabelEncoder

df = df.drop(columns=["EmployeeID"], errors='ignore')


df.fillna(df.mode().iloc[0], inplace=True)

le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

In [18]:

X = df.drop("BurnoutRisk", axis=1)
y = df["BurnoutRisk"]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()


dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
knn.fit(X_train, y_train)


dt_pred = dt.predict(X_test)
rf_pred = rf.predict(X_test)
knn_pred = knn.predict(X_test)


print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))
print("k-NN Accuracy:", accuracy_score(y_test, knn_pred))


Decision Tree Accuracy: 1.0
Random Forest Accuracy: 1.0
k-NN Accuracy: 0.72


In [20]:

importances = rf.feature_importances_
feature_names = X.columns
feat_imp_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feat_imp_df = feat_imp_df.sort_values(by='Importance', ascending=False)

print(feat_imp_df)


                   Feature  Importance
8             BurnoutLevel    0.826187
11       ProductivityScore    0.013076
10             StressLevel    0.012589
22       CareerGrowthScore    0.012126
16     ManagerSupportScore    0.011938
13     PhysicalActivityHrs    0.011664
14             CommuteTime    0.011395
9          JobSatisfaction    0.010861
20    WorkLifeBalanceScore    0.010691
21                TeamSize    0.010284
12              SleepHours    0.010244
0                      Age    0.008624
6         WorkHoursPerWeek    0.008028
5           YearsAtCompany    0.007899
2                  Country    0.005593
18     MentalHealthDaysOff    0.005592
3                  JobRole    0.005464
19             SalaryRange    0.004947
4               Department    0.004447
1                   Gender    0.003291
7               RemoteWork    0.002477
17        HasTherapyAccess    0.001417
15  HasMentalHealthSupport    0.001166


In [21]:

top3_features = feat_imp_df['Feature'].head(3).tolist()
print("Top 3 Important Features:", top3_features)

X_train_top3 = X_train[top3_features]
X_test_top3 = X_test[top3_features]


Top 3 Important Features: ['BurnoutLevel', 'ProductivityScore', 'StressLevel']


In [22]:

dt.fit(X_train_top3, y_train)
rf.fit(X_train_top3, y_train)
knn.fit(X_train_top3, y_train)

dt_pred_top3 = dt.predict(X_test_top3)
rf_pred_top3 = rf.predict(X_test_top3)
knn_pred_top3 = knn.predict(X_test_top3)


print("\nAFTER FEATURE SELECTION")
print("Decision Tree Accuracy:", accuracy_score(y_test, dt_pred_top3))
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred_top3))
print("k-NN Accuracy:", accuracy_score(y_test, knn_pred_top3))



AFTER FEATURE SELECTION
Decision Tree Accuracy: 1.0
Random Forest Accuracy: 1.0
k-NN Accuracy: 0.9866666666666667


In [23]:
print(feat_imp_df.head(10))


                 Feature  Importance
8           BurnoutLevel    0.826187
11     ProductivityScore    0.013076
10           StressLevel    0.012589
22     CareerGrowthScore    0.012126
16   ManagerSupportScore    0.011938
13   PhysicalActivityHrs    0.011664
14           CommuteTime    0.011395
9        JobSatisfaction    0.010861
20  WorkLifeBalanceScore    0.010691
21              TeamSize    0.010284
