In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer

In [2]:
data=pd.read_csv("/content/employee_burnout_analysis-AI (1).csv")

In [3]:
data


Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,9/30/2008,Female,Service,No,2,3.0,3.8,0.16
1,fffe3700360033003500,11/30/2008,Male,Service,Yes,1,2.0,5.0,0.36
2,fffe31003300320037003900,3/10/2008,Female,Product,Yes,2,,5.8,0.49
3,fffe32003400380032003900,11/3/2008,Male,Service,Yes,1,1.0,2.6,0.20
4,fffe31003900340031003600,7/24/2008,Female,Service,No,3,7.0,6.9,0.52
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,12/30/2008,Female,Service,No,1,3.0,,0.41
22746,fffe33003000350031003800,1/19/2008,Female,Product,Yes,3,6.0,6.7,0.59
22747,fffe390032003000,11/5/2008,Male,Service,Yes,3,7.0,,0.72
22748,fffe33003300320036003900,1/10/2008,Female,Service,No,2,5.0,5.9,0.52


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22750 entries, 0 to 22749
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Employee ID           22750 non-null  object 
 1   Date of Joining       22750 non-null  object 
 2   Gender                22750 non-null  object 
 3   Company Type          22750 non-null  object 
 4   WFH Setup Available   22750 non-null  object 
 5   Designation           22750 non-null  int64  
 6   Resource Allocation   21369 non-null  float64
 7   Mental Fatigue Score  20633 non-null  float64
 8   Burn Rate             21626 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 1.6+ MB


In [5]:
data.isna().sum()

Employee ID                0
Date of Joining            0
Gender                     0
Company Type               0
WFH Setup Available        0
Designation                0
Resource Allocation     1381
Mental Fatigue Score    2117
Burn Rate               1124
dtype: int64

In [6]:
numeric_cols=['Resource Allocation', 'Mental Fatigue Score', 'Burn Rate']

In [11]:
# Assuming 'data' is your original DataFrame before imputation
imputer = KNNImputer(n_neighbors=5)
data[numeric_cols] = imputer.fit_transform(data[numeric_cols])

In [12]:
data.isna().sum()

Employee ID             0
Date of Joining         0
Gender                  0
Company Type            0
WFH Setup Available     0
Designation             0
Resource Allocation     0
Mental Fatigue Score    0
Burn Rate               0
dtype: int64

In [13]:
data

Unnamed: 0,Employee ID,Date of Joining,Gender,Company Type,WFH Setup Available,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate
0,fffe32003000360033003200,9/30/2008,Female,Service,No,2,3.0,3.80,0.16
1,fffe3700360033003500,11/30/2008,Male,Service,Yes,1,2.0,5.00,0.36
2,fffe31003300320037003900,3/10/2008,Female,Product,Yes,2,4.8,5.80,0.49
3,fffe32003400380032003900,11/3/2008,Male,Service,Yes,1,1.0,2.60,0.20
4,fffe31003900340031003600,7/24/2008,Female,Service,No,3,7.0,6.90,0.52
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,12/30/2008,Female,Service,No,1,3.0,5.42,0.41
22746,fffe33003000350031003800,1/19/2008,Female,Product,Yes,3,6.0,6.70,0.59
22747,fffe390032003000,11/5/2008,Male,Service,Yes,3,7.0,7.74,0.72
22748,fffe33003300320036003900,1/10/2008,Female,Service,No,2,5.0,5.90,0.52


In [14]:
# Check if the columns exist before applying get_dummies
if all(col in data.columns for col in ['Company Type', 'WFH Setup Available', 'Gender']):
    data = pd.get_dummies(data, columns=['Company Type', 'WFH Setup Available','Gender'], drop_first=True)
    data.head()
    encoded_columns = data.columns
else:
    print("Error: One or more of the specified columns are not present in the DataFrame.")
    # Add debugging steps here to investigate why the columns are missing.
    # For example, print the existing columns:
    print(data.columns)




In [15]:
data

Unnamed: 0,Employee ID,Date of Joining,Designation,Resource Allocation,Mental Fatigue Score,Burn Rate,Company Type_Service,WFH Setup Available_Yes,Gender_Male
0,fffe32003000360033003200,9/30/2008,2,3.0,3.80,0.16,True,False,False
1,fffe3700360033003500,11/30/2008,1,2.0,5.00,0.36,True,True,True
2,fffe31003300320037003900,3/10/2008,2,4.8,5.80,0.49,False,True,False
3,fffe32003400380032003900,11/3/2008,1,1.0,2.60,0.20,True,True,True
4,fffe31003900340031003600,7/24/2008,3,7.0,6.90,0.52,True,False,False
...,...,...,...,...,...,...,...,...,...
22745,fffe31003500370039003100,12/30/2008,1,3.0,5.42,0.41,True,False,False
22746,fffe33003000350031003800,1/19/2008,3,6.0,6.70,0.59,False,True,False
22747,fffe390032003000,11/5/2008,3,7.0,7.74,0.72,True,True,True
22748,fffe33003300320036003900,1/10/2008,2,5.0,5.90,0.52,True,False,False


In [16]:
data=data.drop("Employee ID",axis=1)

In [17]:
data['Date of Joining']=pd.to_datetime(data['Date of Joining'])
data['join day']=data['Date of Joining'].apply(lambda x:x.day)
data['join month']=data['Date of Joining'].apply(lambda x:x.month)
data=data.drop('Date of Joining',axis=1)

In [19]:
x=data.drop('Burn Rate',axis=1)
y=data['Burn Rate']

In [20]:
x_train , x_test , y_train , y_test = train_test_split(x,y,train_size=0.7, shuffle=True,random_state=1)

scaler=StandardScaler()
scaler.fit(x_train)
x_train=pd.DataFrame(scaler.transform(x_train),index=x_train.index,columns=x_train.columns)
x_test=pd.DataFrame(scaler.transform(x_test),index=x_test.index,columns=x_test.columns)


In [21]:
x_train

Unnamed: 0,Designation,Resource Allocation,Mental Fatigue Score,Company Type_Service,WFH Setup Available_Yes,Gender_Male,join day,join month
18275,-0.159082,0.258017,0.299098,-1.368697,0.919924,-0.954561,0.483902,-1.308890
13833,0.722958,0.258017,0.668279,0.730622,-1.087046,-0.954561,0.483902,1.016663
11163,0.722958,0.258017,0.984721,0.730622,-1.087046,1.047603,-1.454341,-1.308890
9522,-1.041122,-0.727745,-1.335849,-1.368697,0.919924,-0.954561,1.053973,-1.599584
15832,0.722958,-0.234864,-0.281045,0.730622,0.919924,-0.954561,0.711931,0.435274
...,...,...,...,...,...,...,...,...
10955,0.722958,0.258017,0.140877,-1.368697,0.919924,1.047603,1.510030,-1.018196
17289,-0.159082,-0.727745,-0.228304,0.730622,0.919924,1.047603,-1.112298,0.725969
5192,0.722958,0.258017,0.088137,0.730622,0.919924,-0.954561,-0.086169,0.725969
12172,0.722958,1.243779,1.934044,0.730622,-1.087046,-0.954561,0.825945,0.435274


In [22]:
y_train

18275    0.410
13833    0.580
11163    0.560
9522     0.260
15832    0.470
         ...  
10955    0.410
17289    0.380
5192     0.450
12172    0.696
235      0.546
Name: Burn Rate, Length: 15924, dtype: float64

In [23]:
linear_regression_model=LinearRegression()
linear_regression_model.fit(x_train,y_train)

In [24]:
print("Linear regression evaluation")

y_pred=linear_regression_model.predict(x_test)

mse= mean_squared_error(y_test,y_pred)
print("mean squared error:",mse)

rmse=mean_squared_error(y_test,y_pred,squared=False)
print("root mean squared error:",rmse)

mae=mean_absolute_error(y_test,y_pred)
print("mean absolute error:",mae)

r2=r2_score(y_test,y_pred)
print("r2 score:",r2)

Linear regression evaluation
mean squared error: 0.003463769240078876
root mean squared error: 0.058853795460266416
mean absolute error: 0.046946637750718334
r2 score: 0.9116693383581672


In [31]:
models = {
    "Linear Regression": LinearRegression(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Random Forest Regressor": RandomForestRegressor()
}

In [34]:
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{name} evaluation:")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R2 Score: {r2}")
    print("-" * 30)

Training Linear Regression...
Linear Regression evaluation:
Mean Squared Error: 0.003463769240078876
Root Mean Squared Error: 0.058853795460266416
Mean Absolute Error: 0.046946637750718334
R2 Score: 0.9116693383581672
------------------------------
Training K-Neighbors Regressor...
K-Neighbors Regressor evaluation:
Mean Squared Error: 0.004432255092267623
Root Mean Squared Error: 0.06657518375691969
Mean Absolute Error: 0.05255148235448919
R2 Score: 0.8869716780392488
------------------------------
Training Random Forest Regressor...
Random Forest Regressor evaluation:
Mean Squared Error: 0.0033890741771931184
Root Mean Squared Error: 0.0582157554034397
Mean Absolute Error: 0.046029275273634165
R2 Score: 0.9135741604952005
------------------------------
