# **General Imports:**

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


# **Load Dataset:**

In [8]:
data = pd.read_excel('/employee_burnout_analysis-AI.xlsx')
display(data.head())

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3,7.0,6.9,0.52


In [9]:
data.head()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,2008-09-30,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,2008-11-30,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,2008-03-10,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,2008-11-03,Male,Service,Yes,1,1.0,2.6,0.2
4,fffe31003900340031003600,2008-07-24,Female,Service,No,3,7.0,6.9,0.52


In [10]:
data.tail()

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
22745,fffe31003500370039003100,2008-12-30,Female,Service,No,1,3.0,,0.41
22746,fffe33003000350031003800,2008-01-19,Female,Product,Yes,3,6.0,6.7,0.59
22747,fffe390032003000,2008-11-05,Male,Service,Yes,3,7.0,,0.72
22748,fffe33003300320036003900,2008-01-10,Female,Service,No,2,5.0,5.9,0.52
22749,fffe3400350031003800,2008-01-06,Male,Product,No,3,6.0,7.8,0.61


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   Employee ID           22750 non-null  object        
 1   Date of Joining       22750 non-null  datetime64[ns]
 2   Gender                22750 non-null  object        
 3   Company Type          22750 non-null  object        
 4   WFH Setup Available   22750 non-null  object        
 5   Designation           22750 non-null  int64         
 6   Resource Allocation   21369 non-null  float64       
 7   Mental Fatigue Score  20633 non-null  float64       
 8   Burn Rate             21626 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(4)
memory usage: 1.6+ MB


In [13]:
data.describe()

Unnamed: 0,Date of Joining,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
count,22750,22750.0,21369.0,20633.0,21626.0
mean,2008-07-01 09:28:05.274725120,2.178725,4.481398,5.728188,0.452005
min,2008-01-01 00:00:00,0.0,1.0,0.0,0.0
25%,2008-04-01 00:00:00,1.0,3.0,4.6,0.31
50%,2008-07-02 00:00:00,2.0,4.0,5.9,0.45
75%,2008-09-30 00:00:00,3.0,6.0,7.1,0.59
max,2008-12-31 00:00:00,5.0,10.0,10.0,1.0
std,,1.135145,2.047211,1.920839,0.198226


In [14]:
data.isnull().sum()

Unnamed: 0,0
Employee ID,0
Date of Joining,0
Gender,0
Company Type,0
WFH Setup Available,0
Designation,0
Resource Allocation,1381
Mental Fatigue Score,2117
Burn Rate,1124


# **Prepare target Variable:**

In [15]:
for col in data.columns:
    if np.issubdtype(data[col].dtype, np.datetime64):
        data.drop(col, axis=1, inplace=True)


In [16]:
data = data.dropna()

In [17]:
data = pd.get_dummies(data, drop_first=True)

In [18]:
target_col = 'Burn Rate'
median_val = data[target_col].median()
data[target_col] = (data[target_col] > median_val).astype(int)


# **Train-Test Splitting:**

In [19]:
X = data.drop(target_col, axis=1)
y = data[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


# **Model-1: Linear Regression:**

In [20]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [21]:
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

In [22]:
y_pred_lr = (lr.predict(X_test_scaled) > 0.5).astype(int)

print("Linear Regression Accuracy:")
print(accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Linear Regression Accuracy:
0.8821947283485745
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      1880
           1       0.88      0.88      0.88      1838

    accuracy                           0.88      3718
   macro avg       0.88      0.88      0.88      3718
weighted avg       0.88      0.88      0.88      3718



# **Model-2: Random Forest**

In [23]:
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    random_state=42
)

rf.fit(X_train, y_train)


In [24]:
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:")
print(accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy:
0.8466917697686929
              precision    recall  f1-score   support

           0       0.85      0.84      0.85      1880
           1       0.84      0.85      0.85      1838

    accuracy                           0.85      3718
   macro avg       0.85      0.85      0.85      3718
weighted avg       0.85      0.85      0.85      3718



In [25]:
print("Linear Regression vs Random Forest")
print("Linear Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Linear Regression vs Random Forest
Linear Regression Accuracy: 0.8821947283485745
Random Forest Accuracy: 0.8466917697686929
