<a href="https://colab.research.google.com/github/Charvik143/Employees-Burnout-Analysis-and-Prediction/blob/main/TOMPALA_MOUNIKA_Employees_Burnout_Analaysis_and_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below, which can mask upcoming breakage.
warnings.filterwarnings('ignore')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only) so the dataset can be read by path.
drive.mount('/content/drive')

Loading Dataset

In [None]:
# Do not truncate columns when a DataFrame is rendered.
pd.options.display.max_columns = None
# Read the burnout CSV from the mounted Drive folder and display it.
burnoutDf = pd.read_csv(
    '/content/drive/MyDrive/employee_burnout_analysis.csv'
)
burnoutDf


In [None]:
# Parse the "Date of Joining" column into pandas datetime values
joining_dates = pd.to_datetime(burnoutDf["Date of Joining"])
burnoutDf["Date of Joining"] = joining_dates

In [None]:
# Number of rows and columns, as a (rows, columns) tuple
burnoutDf.shape

In [None]:
# General information: column names, non-null counts and dtypes
burnoutDf.info()

In [None]:
# Show the first 5 rows (the default for head())
burnoutDf.head()

In [None]:
# List all column names of the dataset
burnoutDf.columns

In [None]:
# Count missing values per column
burnoutDf.isna().sum()

In [None]:
# Count rows that are exact duplicates of an earlier row
burnoutDf.duplicated().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
burnoutDf.describe()

In [None]:
# For each column, print its distinct values followed by their frequencies
for column_name in burnoutDf.columns:
  column_values = burnoutDf[column_name]
  print(f"\n\n{column_values.unique()}")
  print(f"\n{column_values.value_counts()}\n\n")

In [None]:
# Drop the identifier column.
# errors='ignore' keeps this cell re-runnable: once the column is gone,
# running the cell again no longer raises KeyError.
burnoutDf = burnoutDf.drop(columns=['Employee ID'], errors='ignore')


In [None]:
# Classify every numeric column by the sign and size of its skewness.
# The `np.int` / `np.float` aliases were removed in NumPy 1.24 and raise
# AttributeError there, so the numeric columns are selected with "number".
intFloatburnoutDf = burnoutDf.select_dtypes(include="number")
for col in intFloatburnoutDf.columns:
  skewness = intFloatburnoutDf[col].skew()  # computed once per column
  if skewness >= 0.1:
    # "\n" (not "/n") so the message starts with a newline like the other branches
    print("\n", col, "feature is positivity skewed and values is:", skewness)
  elif skewness <= -0.1:
    print("\n", col, "feature is negatively skewed and value is:", skewness)
  else:
    # |skew| < 0.1 is reported as approximately symmetric
    print("\n", col, "feature is normally distributed skewed and value is:", skewness)

In [None]:
# Replace the null values of each listed column with that column's mean.
# The result is assigned back to the frame instead of calling
# fillna(..., inplace=True) on a selected column: that form acts on an
# intermediate object and leaves the frame unchanged under pandas Copy-on-Write.
# NOTE(review): 'Burn Rate' is later used as the prediction target; mean-filling
# a target is questionable - consider dropping those rows instead.
mean_imputed_columns = ['Resource Allocation', 'Mental Fatigue Score', 'Burn Rate']
burnoutDf[mean_imputed_columns] = burnoutDf[mean_imputed_columns].fillna(
    burnoutDf[mean_imputed_columns].mean()
)

In [None]:
# Count missing values per column again, after the mean imputation above
burnoutDf.isna().sum()

In [None]:
# Pairwise correlation of the numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where corr() no longer drops
# non-numeric columns silently and raises on them instead.
burnoutDf.corr(numeric_only=True)

Data Visualization

In [None]:
# Bar chart of the number of rows per "Gender" value
gender_fig, gender_ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=burnoutDf, x="Gender", palette="magma", ax=gender_ax)
gender_ax.set_title("plot Distribution of Gender")
plt.show()

In [None]:
# Bar chart of the number of rows per "Company Type" value
company_fig, company_ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=burnoutDf, x="Company Type", palette="Spectral", ax=company_ax)
company_ax.set_title("plot distribution of company type")
plt.show()

In [None]:
# Bar chart of the number of rows per "WFH Setup Available" value
wfh_fig, wfh_ax = plt.subplots(figsize=(10, 8))
sns.countplot(data=burnoutDf, x="WFH Setup Available", palette="dark:salmon_r", ax=wfh_ax)
wfh_ax.set_title("plot distribution of WFH_Setup_Availble")
plt.show()

In [None]:
# Interactive histogram for every int/float column located between
# 'Date of Joining' and 'Burn Rate' (both ends included in the label slice)
burn_st = (
    burnoutDf
    .loc[:, 'Date of Joining':'Burn Rate']
    .select_dtypes([int, float])
)
for column_name in burn_st.columns:
  fig = px.histogram(
      burn_st,
      x=column_name,
      title=f"Plot Distribution of {column_name}",
      color_discrete_sequence=["indianred"],
  )
  fig.update_layout(bargap=0.2)  # leave a gap between neighbouring bars
  fig.show()

In [None]:
from IPython.utils import text  # NOTE(review): not used in this cell; looks like an accidental auto-import
# Interactive heat map of the correlations between numeric columns.
# numeric_only=True is needed on pandas >= 2.0, where corr() raises on
# non-numeric columns instead of dropping them silently.
Corr = burnoutDf.corr(numeric_only=True)
# NOTE(review): this sets seaborn/matplotlib defaults only; it does not size
# the Plotly figure created below.
sns.set(rc={'figure.figsize':(14,12)})
fig = px.imshow(Corr, text_auto=True, aspect='auto')
fig.show()

In [None]:
# Line chart of "Burn Rate", one coloured trace per "Designation" value
# (no x is given, so Plotly uses the DataFrame index for the x axis)
fig = px.line(
    burnoutDf,
    y="Burn Rate",
    color="Designation",
    title="Burn rate on the basis of Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)
fig.show()

In [None]:
# Line chart of "Burn Rate", one coloured trace per "Gender" value
# (no x is given, so Plotly uses the DataFrame index for the x axis)
fig = px.line(
    burnoutDf,
    y="Burn Rate",
    color="Gender",
    title="Burn rate on the basis of Gender",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)
fig.show()

In [None]:
# Line chart of "Mental Fatigue Score", one coloured trace per "Designation" value
# (no x is given, so Plotly uses the DataFrame index for the x axis)
fig = px.line(
    burnoutDf,
    y="Mental Fatigue Score",
    color="Designation",
    title="Mental Fatigue vs Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)
fig.show()

In [None]:
# Scatter of "Designation" vs "Mental Fatigue Score", one panel per company type;
# marker colour = company type, marker size = burn rate, marker style = gender
sns.relplot(
    data=burnoutDf,
    x="Designation",
    y="Mental Fatigue Score",
    col="Company Type",
    hue="Company Type",
    style="Gender",
    size="Burn Rate",
    sizes=(50, 200),
    palette=["g", "r"],
)

Label Encoding

In [None]:
# Create the LabelEncoder instance used to encode the categorical columns
from sklearn import preprocessing
Label_encode = preprocessing.LabelEncoder()

In [None]:
# Encode each categorical column into a new integer label column.
# The same encoder is refitted for every column, in the order listed.
label_columns = {
    'GenderLabel': 'Gender',
    'Company_TypeLabel': 'Company Type',
    'WFH_Setup_AvailableLabel': 'WFH Setup Available',
}
for label_name, source_name in label_columns.items():
  burnoutDf[label_name] = Label_encode.fit_transform(burnoutDf[source_name].values)

In [None]:
# Show which integer label was assigned to each "Gender" value
gn = burnoutDf.groupby('Gender')['GenderLabel']
gn.first()

In [None]:
# Show which integer label was assigned to each "Company Type" value
ct = burnoutDf.groupby('Company Type')['Company_TypeLabel']
ct.first()

In [None]:
# Show which integer label was assigned to each "WFH Setup Available" value
wsa = burnoutDf.groupby('WFH Setup Available')['WFH_Setup_AvailableLabel']
wsa.first()

In [None]:
# Show the last 10 rows, including the new label columns
burnoutDf.tail(10)

Feature selection

In [None]:
# Feature selection
# 'Burn Rate' is the prediction target (Y), so it must not also appear among
# the features: with it in X the models are handed the answer and the
# reported scores are meaningless (target leakage).
Columns = [
    'Designation',
    'Resource Allocation',
    'Mental Fatigue Score',
    'GenderLabel',
    'Company_TypeLabel',
    'WFH_Setup_AvailableLabel',
]
X = burnoutDf[Columns]
Y = burnoutDf['Burn Rate']

In [None]:
# Print the feature frame X as plain text
print(X)

In [None]:
# Print the target series Y as plain text
print(Y)

Implementing PCA

In [None]:
# Principal component analysis
from sklearn.decomposition import PCA
# A float in (0, 1) asks PCA to keep as many components as are needed to
# explain that fraction of the variance (here 95%).
# NOTE(review): the features are not scaled before PCA and the projection is
# fitted on the full data before the train/test split - confirm this is intended.
pca = PCA(0.95)
X_pca = pca.fit_transform(X)

print("PCA shape of X:",X_pca.shape,"and original shape is:",X.shape)
print("% of importance of selected feature is:",pca.explained_variance_ratio_)
# n_components_ is the fitted number of components; n_components (no trailing
# underscore) is just the 0.95 argument passed to the constructor.
print("The number of features selected through PCA is :",pca.n_components_)

Data Splitting

In [None]:
# Split the PCA-transformed features and the target: 75% train / 25% test, fixed seed
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X_pca, Y, test_size=0.25, random_state=10)
X_train_pca, X_test, Y_train, Y_test = split_parts

In [None]:
# Print the shapes of the four split parts on one line
print(*(part.shape for part in (X_train_pca, X_test, Y_train, Y_test)))

Model Implementation

Random Forest Regression

In [None]:
from sklearn.metrics import r2_score


In [None]:
# Random forest regression
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the fitted forest, and therefore both scores, are identical on
# every run (same seed as the train/test split).
rf_model = RandomForestRegressor(random_state=10)
rf_model.fit(X_train_pca, Y_train)

# Score the model on the data it was fitted on and on the held-out data
train_pred_rf = rf_model.predict(X_train_pca)
train_r2 = r2_score(Y_train, train_pred_rf)
test_pred_rf = rf_model.predict(X_test)
test_r2 = r2_score(Y_test, test_pred_rf)

# The printed values are R^2 scores expressed as percentages, not
# classification accuracy
print("Accuracy score of train data:"+str(round(100*train_r2,4))+"%")
print("Accuracy score of test data:"+str(round(100*test_r2,4))+"%")

In [None]:
# AdaBoost regressor
from sklearn.ensemble import AdaBoostRegressor

# Fixed seed so the boosted model, and therefore both scores, are identical on
# every run (same seed as the train/test split).
abr_model = AdaBoostRegressor(random_state=10)
abr_model.fit(X_train_pca, Y_train)

# Score the model on the data it was fitted on and on the held-out data
train_pred_adboost = abr_model.predict(X_train_pca)
train_r2 = r2_score(Y_train, train_pred_adboost)
test_pred_adaboost = abr_model.predict(X_test)
test_r2 = r2_score(Y_test, test_pred_adaboost)

# The printed values are R^2 scores expressed as percentages, not
# classification accuracy. The second line reports the held-out score, so it
# is labelled "test data" (it previously said "train data").
print("Accuracy score of train data:"+str(round(100*train_r2,4))+"%")
print("Accuracy score of test data:"+str(round(100*test_r2,4))+"%")

