Importing Libraries

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
!pip3 install pandas numpy seaborn matplotlib plotly #For python 3.x
#pip install pandas numpy seaborn matplotlib plotly 
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px



Loading the dataset

In [None]:
# Never truncate the column list when a frame is rendered
pd.options.display.max_columns = None

# Read the raw burnout survey data (path is relative to the notebook's working directory)
burnoutDf = pd.read_csv(filepath_or_buffer='data/employee_burnout_analysis-AI.csv')
burnoutDf

In [None]:
# Parse the 'Date of Joining' column into pandas datetime values
joining_dates = pd.to_datetime(burnoutDf['Date of Joining'])
burnoutDf['Date of Joining'] = joining_dates

In [None]:
# Print the column names, non-null counts and dtypes of the frame
burnoutDf.info()

In [None]:
# Preview the first rows (head() defaults to five)
burnoutDf.head()

In [None]:
# Count the missing entries in each column
burnoutDf.isna().sum()

In [None]:
#check for duplicate values: number of rows that exactly repeat an earlier row
burnoutDf.duplicated().sum()

In [None]:
#summary statistics (count, mean, std, min, quartiles, max).
#With default arguments describe() skips object (string) columns on a mixed frame,
#so this does not cover every attribute.
burnoutDf.describe()

In [None]:
# For every column, print its distinct values followed by their frequencies
for position, column_name in enumerate(burnoutDf.columns, start=1):
    column_values = burnoutDf[column_name]
    print(f"\n\n{position}. {column_name}: {column_values.unique()}")
    print(f"\n{column_values.value_counts()}\n\n")

In [None]:
# 'Employee ID' is dropped here as an irrelevant column
burnoutDf = burnoutDf.drop(columns=['Employee ID'])

In [None]:
# Report the sample skewness of each int64/float64 column and label it using a
# +/-0.1 threshold.
intFloatburnoutDf = burnoutDf.select_dtypes([np.int64, np.float64])
for col in intFloatburnoutDf.columns:
    skew_value = intFloatburnoutDf[col].skew()
    if skew_value >= 0.1:
        message = "feature is Positive Skewed and the value is: "
    elif skew_value <= -0.1:
        message = "feature is Negative Skewed and the value is: "
    else:
        message = "feature is Normally Distributed and the value is: "
    print("\n", col, message, skew_value)
    

In [None]:
#Replace the null values with the mean of the column.
# The filled columns are assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection (burnoutDf[col]) is chained
# in-place mutation, which pandas deprecates and which does not update the
# parent frame once Copy-on-Write is active.
# NOTE(review): 'Burn Rate' is later used as the target y; mean-imputing a
# target is questionable - consider dropping those rows instead.
impute_columns = ['Resource Allocation', 'Mental Fatigue Score', 'Burn Rate']
burnoutDf[impute_columns] = burnoutDf[impute_columns].fillna(
    burnoutDf[impute_columns].mean()
)

In [None]:
#check for null values again, now that the mean imputation above has run
burnoutDf.isna().sum()

Data Visualization

In [None]:
# Keep only the numeric columns and show their pairwise correlation table
numeric_data = burnoutDf.select_dtypes(include='number')
numeric_data.corr()

In [None]:
# Draw the correlation matrix of the numeric columns as an annotated seaborn heatmap
Corr = numeric_data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
sns.heatmap(Corr, annot=True, cmap="coolwarm", fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
# Bar chart of how many rows fall into each "Gender" category
gender_fig, gender_ax = plt.subplots(figsize=(10, 8))
sns.countplot(x="Gender", data=burnoutDf, palette="magma", ax=gender_ax)
gender_ax.set_title("Plot Distribution of Gender")
plt.show()

In [None]:
# Bar chart of how many rows fall into each "Company Type" category
company_fig, company_ax = plt.subplots(figsize=(10, 8))
sns.countplot(x="Company Type", data=burnoutDf, palette="viridis", ax=company_ax)
company_ax.set_title("Plot Distribution of Company Type")
plt.show()

In [None]:
# Bar chart of how many rows fall into each "WFH Setup Available" category
wfh_fig, wfh_ax = plt.subplots(figsize=(10, 8))
sns.countplot(x="WFH Setup Available", data=burnoutDf, palette="dark:salmon_r", ax=wfh_ax)
wfh_ax.set_title("Plot Distribution of Work From Home")
plt.show()


In [None]:
# Diagnostic only: print the installed nbformat version.
# NOTE(review): nothing else in this notebook uses nbformat directly - presumably
# an environment check for the plotly cells; confirm whether it is still needed.
import nbformat
print(nbformat.__version__)


In [None]:
# One histogram (20 bins, KDE overlay) per numeric column located between
# 'Date of Joining' and 'Burn Rate' in the frame.
burn_st = (
    burnoutDf
    .loc[:, 'Date of Joining':'Burn Rate']
    .select_dtypes(include=[np.number])
)

for col in burn_st.columns:
    hist_fig, hist_ax = plt.subplots(figsize=(8, 4))
    sns.histplot(burn_st[col], bins=20, kde=True, color='skyblue', ax=hist_ax)
    hist_ax.set_title(f"Histogram of {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Count")
    hist_ax.grid(True)
    hist_fig.tight_layout()
    plt.show()


In [None]:
#plot distribution of Burn rate on the basis of Designation
# A histogram is used so the figure really shows a distribution: a line trace
# with only `y` given plots each row's value against its row index, and the
# `bargap` layout setting below has no effect on line traces.
fig = px.histogram(
    burnoutDf,
    x="Burn Rate",
    color="Designation",
    title="Burn rate on the basis of Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1
)

# Leave a small gap between neighbouring bars
fig.update_layout(bargap=0.1)

# The figure is not rendered inline; it is exported as a standalone HTML page
fig.write_html("burn_rate_by_designation.html")
print("Plot saved as 'burn_rate_by_designation.html'. Open it in your browser.")


In [None]:
#plot distribution of Burn Rate on the basis of Gender
# A histogram is used so the figure really shows a distribution: a line trace
# with only `y` given plots each row's value against its row index, and the
# `bargap` layout setting below has no effect on line traces.
fig = px.histogram(
    burnoutDf,
    x="Burn Rate",
    color="Gender",
    title="Burn Rate on the basis of Gender",
    color_discrete_sequence=px.colors.qualitative.Pastel2
)
# Leave a gap between neighbouring bars
fig.update_layout(bargap=0.2)

# The figure is not rendered inline; it is exported as a standalone HTML page
fig.write_html("burn_rate_by_gender.html")
print("Plot saved as 'burn_rate_by_gender.html'. Open it in your browser.")

In [None]:
#plot distribution of mental fatigue score on the basis of Designation
# A histogram is used so the figure really shows a distribution: a line trace
# with only `y` given plots each row's value against its row index, and the
# `bargap` layout setting below has no effect on line traces.
fig = px.histogram(
    burnoutDf,
    x="Mental Fatigue Score",
    color="Designation",
    title="mental_fatigue_vs_Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel2
)
# Leave a gap between neighbouring bars
fig.update_layout(bargap=0.2)

# The figure is not rendered inline; it is exported as a standalone HTML page
fig.write_html("mental_fatigue_vs_Designation.html")
print("Plot saved as 'mental_fatigue_vs_Designation.html'. Open it in your browser.")

In [None]:
# Scatter of Designation (x) against Mental Fatigue Score (y); marker colour
# encodes Company Type, marker size encodes Burn Rate and marker symbol encodes Gender.
scatter_options = dict(
    x="Designation",
    y="Mental Fatigue Score",
    color="Company Type",
    size="Burn Rate",
    symbol="Gender",
    size_max=60,
    color_discrete_sequence=px.colors.qualitative.Set2,
    title="Designation vs Mental Fatigue Score by Company Type, Burn Rate, and Gender",
)
fig = px.scatter(burnoutDf, **scatter_options)

# Export as a standalone HTML page instead of rendering inline
fig.write_html("Designation-vs-mental-fatigue-score.html")
print("Plot saved as 'Designation-vs-mental-fatigue-score.html'. Open it in your browser.")



Label Encoding

In [None]:
!pip3 install scikit-learn

#label encoding and assign in the new variable
from sklearn import preprocessing
Label_encode = preprocessing.LabelEncoder()

In [None]:
# Add an integer-coded '<column>Label' companion for each categorical column.
# The shared encoder is re-fitted on every pass, so afterwards it only
# remembers the classes of the last column processed.
for source_col in ('Gender', 'Company Type', 'WFH Setup Available'):
    burnoutDf[source_col + 'Label'] = Label_encode.fit_transform(burnoutDf[source_col].values)

In [None]:
# Show the integer code paired with each Gender category (first row per group)
gn = burnoutDf.groupby('Gender')['GenderLabel']
gn.first()

In [None]:
# Show the integer code paired with each Company Type category (first row per group)
ct = burnoutDf.groupby('Company Type')['Company TypeLabel']
ct.first()

In [None]:
# Show the integer code paired with each WFH Setup Available category (first row per group)
wsa = burnoutDf.groupby('WFH Setup Available')['WFH Setup AvailableLabel']
wsa.first()

In [None]:
#show last 10 rows of the dataframe (now including the new *Label columns)
burnoutDf.tail(10)

In [None]:
#Feature Selection
# 'Burn Rate' is the prediction target (y), so it must not also be part of the
# feature matrix: leaving it in leaks the answer into the inputs and makes
# every downstream score meaninglessly high.
Columns = ['Designation','Resource Allocation', 'Mental Fatigue Score', 'GenderLabel', 'Company TypeLabel', 'WFH Setup AvailableLabel']
x = burnoutDf[Columns]
y = burnoutDf['Burn Rate']

In [None]:
# Plain-text dump of the feature matrix
print(x)

In [None]:
# Plain-text dump of the target column ('Burn Rate')
print(y)

Implementing PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# PCA follows the directions of largest variance, so columns measured on a
# wider numeric scale would dominate the components. Standardise every feature
# to zero mean / unit variance first.
# NOTE(review): scaler and PCA are fitted on all rows before the train/test
# split - consider fitting them on the training portion only.
x_scaled = StandardScaler().fit_transform(x)

# A float in (0, 1) asks PCA to keep the fewest components whose cumulative
# explained variance reaches that fraction (here 95%).
pca = PCA(0.95)
X_pca = pca.fit_transform(x_scaled)

print("PCA shape of X: ", X_pca.shape, "and original shape: ", x.shape)
print("% importance of selected features: ", pca.explained_variance_ratio_)
print("The number of features selected through PCA: ", pca.n_components_)
 

Data Splitting

In [None]:
# Hold out 25% of the PCA-transformed rows for testing; the fixed seed keeps
# the partition identical between runs.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X_pca, y, test_size=0.25, random_state=10)
X_train_pca, X_test, y_train, y_test = split_parts


In [None]:
# Shapes of the four split parts, in order: train X, test X, train y, test y
print(*(part.shape for part in (X_train_pca, X_test, y_train, y_test)))

MODEL IMPLEMENTATION

In [None]:
from sklearn.metrics import r2_score

Random Forest Regressor

In [None]:
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so its bootstrap samples and feature sub-sampling - and hence
# the scores printed below - are reproducible between runs. The seed matches
# the one used for the train/test split.
rf_model = RandomForestRegressor(random_state=10)
rf_model.fit(X_train_pca, y_train)

# Score on the data the model was fitted on and on the held-out part.
# r2_score is the coefficient of determination, not a classification accuracy.
train_pred_rf = rf_model.predict(X_train_pca)
train_r2 = r2_score(y_train, train_pred_rf)
test_pred_rf = rf_model.predict(X_test)
test_r2_rf = r2_score(y_test, test_pred_rf)

#Accuracy Score of Random Forest Regressor (R^2 expressed as a percentage)
print("Accuracy score of train data using Random Forest Regressor: "+str(round(100 * train_r2, 4))+"%")
print("Accuracy score of test data using Random Forest Regressor: "+str(round(100 * test_r2_rf, 4))+"%")

AdaBoost Regressor

In [None]:
#AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor
# Seed the booster so the scores printed below are reproducible between runs.
# The seed matches the one used for the train/test split.
abr_model = AdaBoostRegressor(random_state=10)
abr_model.fit(X_train_pca, y_train)

# Score on the data the model was fitted on and on the held-out part.
# r2_score is the coefficient of determination, not a classification accuracy.
# Note: train_r2 is reassigned here and replaces the Random Forest value.
train_pred_adboost = abr_model.predict(X_train_pca)
train_r2 = r2_score(y_train, train_pred_adboost)
test_pred_adaboost = abr_model.predict(X_test)
test_r2 = r2_score(y_test, test_pred_adaboost)

#Accuracy Score of AdaBoost Regressor (R^2 expressed as a percentage)
print("Accuracy score of train data using AdaBoost Regressor: "+str(round(100 * train_r2, 4))+"%")
print("Accuracy score of test data using AdaBoost Regressor: "+str(round(100 * test_r2, 4))+"%")