In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px



In [None]:
from pathlib import Path

# Load the employee burnout workbook into `burnoutDF`.
# The workbook is looked up in the current working directory first and then in
# the original Colab upload location, so the notebook is no longer tied to a
# single machine. A descriptive error is raised when neither location has it.
DATA_FILENAME = "employee_burnout_analysis-AI.xlsx"
_data_candidates = [Path(DATA_FILENAME), Path("/content") / DATA_FILENAME]
_data_path = next((p for p in _data_candidates if p.exists()), None)
if _data_path is None:
    raise FileNotFoundError(
        f"{DATA_FILENAME} not found; looked in: "
        + ", ".join(str(p) for p in _data_candidates)
    )

burnoutDF = pd.read_excel(_data_path)

FileNotFoundError: ignored

In [None]:
# Convert the "Date of Joining" column to pandas datetime values in place of
# whatever representation read_excel produced.
burnoutDF["Date of Joining"] = pd.to_datetime(burnoutDF["Date of Joining"])

In [None]:
# (rows, columns) of the loaded frame.
burnoutDF.shape

In [None]:
# Column dtypes and non-null counts.
burnoutDF.info()

In [None]:
# Preview the first rows.
burnoutDF.head()

In [None]:
# Column labels available for the analysis.
burnoutDF.columns

In [None]:
# Number of missing values per column.
burnoutDF.isna().sum()

In [None]:
# Number of fully duplicated rows.
burnoutDF.duplicated().sum()

In [None]:
# Summary statistics as computed by DataFrame.describe().
burnoutDF.describe()

In [None]:
# Remove the "Employee ID" column.
# Re-binding the result (instead of inplace=True) together with
# errors="ignore" keeps this cell idempotent: running it a second time no
# longer raises KeyError because the column is already gone.
burnoutDF = burnoutDF.drop(columns=["Employee ID"], errors="ignore")

In [None]:
# Print the skewness of every numeric column and classify it with a +/-0.1
# threshold.
# `np.int` / `np.float` were removed from NumPy (1.24+) and raise
# AttributeError, so the numeric columns are selected through the "number"
# dtype family instead.
intfloatburnoutDF = burnoutDF.select_dtypes(include="number")
for col in intfloatburnoutDF.columns:
    s = intfloatburnoutDF[col].skew()
    if s > 0.1:
        print(f"\n{col} feature is positively skewed and value is {s}")
    elif s < -0.1:
        print(f"\n{col} feature is negatively skewed and value is {s}")
    else:
        # "normally distributed" here only means |skewness| <= 0.1.
        print(f"\n{col} feature is normally distributed and value is {s}")


In [None]:
# Replace missing values in the three numeric columns by the column mean.
# The result is assigned back explicitly: calling fillna(..., inplace=True) on
# `burnoutDF[col]` is a chained in-place operation that acts on a temporary
# object and does not update the frame under pandas Copy-on-Write.
# NOTE(review): the means are computed on the full data set (before the
# train/test split) and "Burn Rate" is later used as the target - confirm that
# imputing it this way is intended.
for impute_col in ("Resource Allocation", "Mental Fatigue Score", "Burn Rate"):
    burnoutDF[impute_col] = burnoutDF[impute_col].fillna(burnoutDF[impute_col].mean())

In [None]:
# Pairwise correlation of the numeric columns.
# The frame still holds string columns (e.g. "Gender"); since pandas 2.0
# DataFrame.corr() no longer drops them silently and raises instead, so the
# numeric columns are selected explicitly.
burnoutDF.select_dtypes(include="number").corr()

In [None]:
# Interactive heatmap of the correlations between numeric columns.
# Non-numeric columns are excluded explicitly because DataFrame.corr() raises
# on them since pandas 2.0.
corr = burnoutDF.select_dtypes(include="number").corr()
# Sets the seaborn/matplotlib default figure size for later static plots; the
# plotly figure below takes its size from its own layout.
sns.set(rc={"figure.figsize":(14,12)})
fig = px.imshow(corr, text_auto=True, aspect="auto")
fig.show()

In [None]:
 # Bar chart with the number of rows per "Gender" value.
 gender_fig, gender_ax = plt.subplots(figsize=(10, 8))
 sns.countplot(data=burnoutDF, x="Gender", palette="magma", ax=gender_ax)
 gender_ax.set_title("Plot distribution of gender")
 plt.show()

In [None]:
 # Bar chart with the number of rows per "Company Type" value.
 company_fig, company_ax = plt.subplots(figsize=(10, 8))
 sns.countplot(data=burnoutDF, x="Company Type", palette="magma", ax=company_ax)
 company_ax.set_title("Plot distribution of Company Type")
 plt.show()

In [None]:
 # Bar chart with the number of rows per "WFH Setup Available" value.
 wfh_fig, wfh_ax = plt.subplots(figsize=(10, 8))
 sns.countplot(data=burnoutDF, x="WFH Setup Available", palette="magma", ax=wfh_ax)
 wfh_ax.set_title("Plot distribution of WFH Setup Available")
 plt.show()

In [None]:
# One plotly histogram per int/float column found between "Date of Joining"
# and "Burn Rate" (label-based column slice, both ends inclusive).
burn_st = (
    burnoutDF
    .loc[:, "Date of Joining": "Burn Rate"]
    .select_dtypes([int, float])
)
for col in burn_st.columns:
    fig = px.histogram(
        burn_st,
        x=col,
        title=f"plot distribution of {col}",
        color_discrete_sequence=["indianred"],
    )
    fig.update_layout(bargap = 0.2)
    fig.show()

In [None]:
# "Burn Rate" plotted against the row index, one coloured line per
# "Designation" value.
# NOTE(review): `bargap` is a bar-chart layout setting; presumably a leftover
# with no visible effect on a line chart - confirm.
fig = px.line(
    burnoutDF,
    y="Burn Rate",
    color="Designation",
    title="Burn rate on the basis of Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.1)
fig.show()

In [None]:
# "Burn Rate" plotted against the row index, one coloured line per "Gender"
# value.
# NOTE(review): `bargap` is a bar-chart layout setting; presumably a leftover
# with no visible effect on a line chart - confirm.
fig = px.line(
    burnoutDF,
    y="Burn Rate",
    color="Gender",
    title="Burn rate on the basis of Gender",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)
fig.show()

In [None]:
# "Mental Fatigue Score" plotted against the row index, one coloured line per
# "Designation" value.
# NOTE(review): `bargap` is a bar-chart layout setting; presumably a leftover
# with no visible effect on a line chart - confirm.
fig = px.line(
    burnoutDF,
    y="Mental Fatigue Score",
    color="Designation",
    title="Mental fatigue vs Designation",
    color_discrete_sequence=px.colors.qualitative.Pastel1,
).update_layout(bargap=0.2)
fig.show()

In [None]:
# Scatter of "Mental Fatigue Score" against "Designation", one panel per
# "Company Type"; marker size encodes "Burn Rate", marker style "Gender".
sns.relplot(
    data=burnoutDF,
    x="Designation",
    y="Mental Fatigue Score",
    col="Company Type",
    hue="Company Type",
    size="Burn Rate",
    style="Gender",
    palette=["g", "r"],
    sizes=(50, 200),
)

In [None]:
from sklearn import preprocessing
# Single LabelEncoder instance shared by the encoding cell below; every
# fit_transform call re-fits it, so it only remembers the last column encoded.
Label_encode = preprocessing.LabelEncoder()

In [None]:
# Add an integer-coded companion column for each categorical column.
# The shared encoder is re-fitted on every iteration, so after the loop it
# stays fitted on the last column only.
for source_col, label_col in (
    ("Gender", "GenderLabel"),
    ("Company Type", "Company_TypeLabel"),
    ("WFH Setup Available", "WFH_Setup_AvailableLabel"),
):
    burnoutDF[label_col] = Label_encode.fit_transform(burnoutDF[source_col].values)

In [None]:
# Show which integer code was assigned to each "Gender" value
# (first GenderLabel seen in every group).
gn = burnoutDF.groupby("Gender")["GenderLabel"]
gn.first()

In [None]:
# Show which integer code was assigned to each "Company Type" value
# (first Company_TypeLabel seen in every group).
ct = burnoutDF.groupby("Company Type")["Company_TypeLabel"]
ct.first()

In [None]:
# Last ten rows, including the label columns added above.
burnoutDF.tail(10)

In [None]:
# Feature columns fed to the models; "Burn Rate" is the regression target.
# The features are used unscaled: an earlier z-score standardisation of
# "Mental Fatigue Score" and "Resource Allocation" is disabled.
Columns = [
    "Designation",
    "Resource Allocation",
    "Mental Fatigue Score",
    "GenderLabel",
    "Company_TypeLabel",
    "WFH_Setup_AvailableLabel",
]
X = burnoutDF.loc[:, Columns]
y = burnoutDF.loc[:, "Burn Rate"]

In [None]:
# Preview the feature matrix with the notebook's rich table display instead of
# printing the whole frame as plain text.
X.head()

In [None]:
from sklearn.decomposition import PCA

# Keep as many principal components as needed to explain 95% of the variance
# of the feature matrix, then report what was retained.
pca = PCA(n_components=0.95)
x_pca = pca.fit_transform(X)

print(
    f"PCA Shape of x is: {x_pca.shape} and original shape is: {X.shape}",
    f"% of importance of selected features is: {pca.explained_variance_ratio_}",
    f"The number of features selected through PCA is: {pca.n_components_}",
    sep="\n",
)

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): re-binding `x_pca` to `X` discards the PCA projection computed
# earlier, so despite the `*_pca` names the models below are trained on the raw
# feature columns - confirm this is intended.
x_pca = X

# 75% / 25% split with a fixed seed.
x_train_pca, x_test, y_train, y_test = train_test_split(
    x_pca,
    y,
    test_size=0.25,
    random_state=10,
)

In [None]:
from sklearn.metrics import r2_score

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyper-parameters.
# random_state is fixed (same seed as the train/test split) so that a
# Restart & Run All reproduces the reported scores.
rf_model = RandomForestRegressor(random_state=10)
rf_model.fit(x_train_pca, y_train)

train_pred_rf = rf_model.predict(x_train_pca)
train_r2 = r2_score(y_train, train_pred_rf)
test_pred_rf = rf_model.predict(x_test)
test_r2 = r2_score(y_test, test_pred_rf)

# The metric is the coefficient of determination (R2) of a regressor, not a
# classification accuracy, so it is labelled as such.
print(f"R2 score of train data : {round(100*train_r2, 4)} %")
print(f"R2 score of test data : {round(100*test_r2, 4)} %")


In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost regressor with 100 boosting rounds.
# random_state is fixed (same seed as the train/test split) so that a
# Restart & Run All reproduces the reported scores.
abr_model = AdaBoostRegressor(n_estimators=100, random_state=10)
abr_model.fit(x_train_pca, y_train)

train_pred_adboost = abr_model.predict(x_train_pca)
train_r2 = r2_score(y_train, train_pred_adboost)
test_pred_adboost = abr_model.predict(x_test)
test_r2 = r2_score(y_test, test_pred_adboost)

# The metric is the coefficient of determination (R2) of a regressor, not a
# classification accuracy, so it is labelled as such.
print(f"R2 score of train data : {round(100*train_r2, 4)} %")
print(f"R2 score of test data : {round(100*test_r2, 4)} %")

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate random-forest hyper-parameters for the grid search:
# 1 bootstrap setting x 2 depths x 4 forest sizes = 8 combinations.
param_grid = dict(
    bootstrap=[True],
    max_depth=[30, 50],
    n_estimators=[30, 50, 80, 100],
)



In [None]:
# Exhaustive grid search over `param_grid` for the random forest.
# Base estimator to tune; random_state is fixed (same seed as the train/test
# split) so the search result is reproducible.
rf = RandomForestRegressor(random_state=10)
# GridSearchCV evaluates every combination in `param_grid` with 3-fold cross
# validation, using all available cores (n_jobs=-1) and verbose progress
# output.
rf_grid = GridSearchCV(estimator = rf, param_grid = param_grid,  cv = 3, verbose=2, n_jobs = -1)
# Fit the grid search on the training split.
rf_grid.fit(x_train_pca, y_train)

In [None]:
# Score the fitted grid search on both splits.
train_pred_rf = rf_grid.predict(x_train_pca)
train_r2 = r2_score(y_train, train_pred_rf)
test_pred_rf = rf_grid.predict(x_test)
test_r2 = r2_score(y_test, test_pred_rf)

# The metric is the coefficient of determination (R2) of a regressor, not a
# classification accuracy, so it is labelled as such.
print(f"R2 score of train data : {round(100*train_r2, 4)} %")
print(f"R2 score of test data : {round(100*test_r2, 4)} %")