# Employee attrition predictor

Author: [@AbrarShakhi](https://www.kaggle.com/abrarshakhi)

---

## Import necessary libraries

In [None]:
try:
    import pandas as pd
    import numpy as np
    
    import matplotlib.pyplot as plt
    import plotly.express as px
    import seaborn as sns
    import plotly.graph_objects as go
    import plotly.figure_factory as ff
    
    import kagglehub
    from kagglehub import KaggleDatasetAdapter
    
    from os import path
    from sys import exit
    import math
except:
    %pip install ipywidgets matplotlib numpy pandas plotly scikit-learn seaborn scipy ipykernel jupyterlab kagglehub dash

## A helper class to fetch the dataset
Each loader method returns `None` if it fails to fetch the dataset.

In [None]:
class DatasetLoader:
    """Fetch the HR attrition dataset from a local file, GitHub, or Kaggle.

    Every ``load_from_*`` method is best-effort: it returns the loaded
    DataFrame on success and ``None`` when loading raises an ordinary
    exception, so callers can chain the sources as fallbacks.
    """

    def __init__(self):
        # Kaggle dataset handle passed to kagglehub.
        self.kaggle_url = "thedevastator/employee-attrition-and-factors"

        # NOTE(review): the doubled ".csv" suffix is kept exactly as in the
        # original notebook; presumably it matches the published file name.
        self.file_name = "HR_Analytics.csv.csv"
        self.dir_name = "dataset"

        # Local copy is looked up one directory above the working directory.
        self.file_path = path.join("..", self.dir_name, self.file_name)
        self.github_url = (
            "https://raw.githubusercontent.com/AbrarShakhi/"
            f"employee-attrition-predictor/main/{self.dir_name}/{self.file_name}"
        )

    def load_from_kaggle(self):
        """Load the dataset through kagglehub; return ``None`` on failure."""
        try:
            return kagglehub.load_dataset(
                KaggleDatasetAdapter.PANDAS, self.kaggle_url, self.file_name
            )
        except Exception:
            # Deliberately broad (best-effort fallback), but no longer a bare
            # ``except`` so KeyboardInterrupt/SystemExit still propagate.
            return None

    def load_from_github(self):
        """Read the CSV from the raw GitHub URL; return ``None`` on failure."""
        try:
            return pd.read_csv(self.github_url)
        except Exception:
            return None

    def load_from_local(self):
        """Read the CSV from ``self.file_path``; return ``None`` on failure."""
        try:
            return pd.read_csv(self.file_path)
        except Exception:
            return None

## Fetch the dataset and show 5 rows
* fetch from local. in case of failure, fetch from `github`
* in case of failure, fetch from `kaggle`
* in case of failure, exit

In [None]:
loader = DatasetLoader()

# Try each source in order of preference: local file, then GitHub, then Kaggle.
data = None
for fetch in (loader.load_from_local, loader.load_from_github, loader.load_from_kaggle):
    data = fetch()
    if data is not None:
        break
else:
    # Every source returned None, so there is nothing to analyse.
    print("Unable to find dataset..")
    exit(1)

data.head(5)

## Number of null values in the dataset

In [None]:
# Count every missing cell across the whole frame in one reduction.
null_total = data.isnull().to_numpy().sum()
print("total number of null values in dataset: ", null_total)

## Drop rows containing null values

In [None]:
# Discard any row that has at least one missing value (the dropna defaults, spelled out).
data = data.dropna(axis=0, how="any")

## Shape of the dataset

In [None]:
# Show the frame's dimensions as a (rows, columns) tuple.
data.shape

## Target column `Attrition`
value that we want to predict

In [None]:
# Name of the label column; the plotting cells below colour and group by it.
target_column_name = "Attrition"
# Preview the first rows of the label column.
data.loc[:, target_column_name].head()

## Defining my copyright 😢

In [None]:
# Annotation list handed to Plotly's ``update_layout(annotations=...)``;
# positioned in paper coordinates with no arrow.
copyright_ply = [
    {
        "text": '© Shakhiul Abrar',
        "xref": 'paper',
        "yref": 'paper',
        "x": 0.9,
        "y": -0.1,
        "showarrow": False,
        "font": {"size": 12},
    }
]


def copyright_plt(plt):
    """Write the copyright notice onto the current Matplotlib figure.

    ``plt`` is any object exposing ``figtext``; the notebook passes the
    ``matplotlib.pyplot`` module. The text is placed at (0.5, -0.1) and
    horizontally centred on that point.
    """
    notice_position = (0.5, -0.1)
    plt.figtext(*notice_position, '© Shakhiul Abrar', ha='center', fontsize=12)

## Check if the target column `Attrition` is balanced or not
Since the target column holds a two-valued (boolean-like) label, we will use a **PIE** chart.

In [None]:
# Tally each label value and shape the result as a two-column frame
# (label, 'Count') for plotly express.
attrition_counts = (
    data[target_column_name]
    .value_counts()
    .rename_axis(target_column_name)
    .reset_index(name='Count')
)

fig = px.pie(
    attrition_counts,
    values='Count',
    names=target_column_name,
    title='Attrition Pie chart',
    width=400,
    height=400,
)
fig.update_layout(annotations=copyright_ply)
fig.show()

## Remove columns that have only one value


In [None]:
# Find every column holding a single distinct value, then drop them in one
# in-place call and report each one.
constant_columns = [
    name for name in data.columns if data[name].nunique(dropna=False) == 1
]
data.drop(columns=constant_columns, inplace=True)
for col in constant_columns:
    print(f"Removed column: {col}")

## Split Numerical and Categorical columns
and save them in a list

In [None]:
# Hand-listed categorical columns (presumably stored as numeric codes, which
# is why the dtype scan below would not find them) plus every object-dtype
# column discovered in the frame.
categorical_columns = sorted(
    [
        "Education",
        "EnvironmentSatisfaction",
        "JobInvolvement",
        "JobLevel",
        "JobSatisfaction",
        "PerformanceRating",
        "RelationshipSatisfaction",
        "WorkLifeBalance",
    ]
    + [name for name in data.columns if data[name].dtype == "object"]
)

# Everything not classified as categorical is treated as numerical.
categorical_lookup = set(categorical_columns)
numerical_columns = sorted(
    name for name in data.columns if name not in categorical_lookup
)

print("numerical columns:", numerical_columns, "\n")
print("categorical columns:", categorical_columns, "\n")

## A quick summary of the numerical columns using a function call

In [None]:
# Summary statistics for the numerical columns, one row per column.
data[numerical_columns].describe().T

## A quick summary of the object-type columns using a function call

In [None]:
# Summary of the categorical selection, restricted by ``include`` to the
# object-dtype columns; one row per column.
data[categorical_columns].describe(include="object").T

## Dataset info for categorical columns
* Categorical column:
    - Frequency
    - Percentage

In [None]:
# Print a frequency / percentage table for every categorical column.
row_total = len(data)
for col in categorical_columns:
    frequency = data[col].value_counts()
    summary = frequency.to_frame("Frequency")
    # Share of all rows that fall in each category.
    summary["Percentage"] = frequency / row_total * 100
    print(summary)

## Convert categorical data into numbers
`data` is the original dataset,

`data_norm` is the converted dataset

In [None]:
# Work on an independent deep copy. ``pd.DataFrame(data)`` does not copy
# DataFrame input by default, so the converted frame could share storage with
# ``data``; the plotting cells rely on ``data`` keeping its original string
# labels.
data_norm = data.copy()


def convert_categorical_to_numerical(col):
    """Return the integer category codes for column ``col`` of ``data_norm``."""
    return data_norm[col].astype("category").cat.codes


def convert_categorical_to_numerical_all(columns):
    """Replace each object-dtype column in ``columns`` with its category codes.

    Columns of any other dtype are left untouched. ``data_norm`` is modified
    in place and also returned.
    """
    for col in columns:
        if data_norm[col].dtype == "object":
            data_norm[col] = convert_categorical_to_numerical(col)
    return data_norm


convert_categorical_to_numerical_all(data_norm.columns)
data_norm

## histogram: `number of employees` in `Age` range group by `Attrition` color
### Age
The age of the employee. **Numerical**

In [None]:
# Grouped histogram of Age, one bar colour per Attrition value.
fig = px.histogram(data, x="Age", color=target_column_name, barmode="group")
# Semi-transparent bars with a thin black outline.
fig.update_traces(
    opacity=0.75,
    marker=dict(line=dict(width=1, color="black")),
)
fig.update_layout(
    title="Age Distribution",
    xaxis_title="Age",
    yaxis_title="number of employees",
    title_x=0.5,
    annotations=copyright_ply,
)
fig.show()


## HeatMap:`Age`, `Attrition`, `OverTime`, `TotalWorkingYears`

In [None]:
# Pairwise correlation of four selected columns, drawn as an annotated heatmap.
heatmap_columns = ["Age", "Attrition", "OverTime", "TotalWorkingYears"]
corr_mat = data_norm[heatmap_columns].corr()

sns.heatmap(corr_mat, cmap="coolwarm", annot=True, fmt=".2f")
plt.title("Correlation Heatmap")
copyright_plt(plt)
plt.show()


## Pie Chart

### BusinessTravel
The frequency of business travel for the employee. **Categorical**

In [None]:
# Count employees per BusinessTravel value as a (label, 'Count') frame.
business_travel_counts = (
    data['BusinessTravel']
    .value_counts()
    .rename_axis('BusinessTravel')
    .reset_index(name='Count')
)

fig = px.pie(
    business_travel_counts,
    values='Count',
    names='BusinessTravel',
    title='Business Travel Pie chart',
    width=400,
    height=400,
)
fig.update_layout(annotations=copyright_ply)
fig.show()

## Box Plot

### DailyRate
The daily rate of pay for the employee. **Numerical**

In [None]:
# Horizontal box plot of DailyRate; the copyright notice is placed above the
# plot area (y=1.2) instead of using the shared annotation.
fig = px.box(data, x="DailyRate", title="Box plot of DailyRate")
fig.update_layout(
    xaxis_title="DailyRate",
    height=300,
    title_x=0.50,
    annotations=[
        {
            "text": '© Shakhiul Abrar',
            "xref": 'paper',
            "yref": 'paper',
            "x": 0.9,
            "y": 1.2,
            "showarrow": False,
            "font": {"size": 12},
        }
    ],
)
fig.show()

## Histogram: Number of employee count in each `department` group by `Attrition`
### Department
The department the employee works in. **Categorical**

In [None]:
# Employee count per Department, grouped by Attrition value.
fig = px.histogram(
    data,
    x="Department",
    title="Department Histogram",
    color=target_column_name,
    barmode="group",
)
fig.update_layout(
    xaxis_title="Department",
    height=400,
    width=400,
    # Copyright notice placed above the plot area for this figure.
    annotations=[
        {
            "text": '© Shakhiul Abrar',
            "xref": 'paper',
            "yref": 'paper',
            "x": 0.9,
            "y": 1.15,
            "showarrow": False,
            "font": {"size": 12},
        }
    ],
)
fig.show()

## Density Curve of `Distance from home`
### DistanceFromHome
The distance from home in miles for the employee. **Numerical**

In [None]:

# Histogram of DistanceFromHome in 10 bins with a KDE curve overlaid.
sns.histplot(data=data, x="DistanceFromHome", kde=True, color="blue", bins=10)
# Title spelling corrected ("dencity" -> "density"); it is shown on the figure.
plt.title("DistanceFromHome Histogram and density line")
plt.ylabel("number of employees")
copyright_plt(plt)
plt.show()


## 3D scatter plot: `Education`, `Age`, `EmployeeNumber`
### Education
The level of education achieved by the employee. **Categorical**

In [None]:
# 3D scatter of Education / Age / EmployeeNumber, coloured by Attrition.
fig = px.scatter_3d(
    data,
    x='Education',
    y='Age',
    z='EmployeeNumber',
    color='Attrition',
    opacity=0.5,
    title="3D Scatter plot of Education, Age and EmployeeNumber",
)
# Centre the title and stamp the shared copyright annotation in one update.
fig.update_layout(title_x=0.5, annotations=copyright_ply)
fig.show()

## Histogram: Number of employee count in each `Environment Satisfaction` level group by `Attrition`
### EnvironmentSatisfaction
The employee's satisfaction with their work environment. **Categorical**

In [None]:
# Employee count per EnvironmentSatisfaction level, grouped by Attrition value.
env_fig = px.histogram(
    data,
    x="EnvironmentSatisfaction",
    color=target_column_name,
    barmode="group",
)
env_fig.update_layout(
    title="Environment Satisfaction",
    width=300,
    height=300,
    # Copyright notice placed above the plot area for this small figure.
    annotations=[
        {
            "text": '© Shakhiul Abrar',
            "xref": 'paper',
            "yref": 'paper',
            "x": 0.9,
            "y": 1.15,
            "showarrow": False,
            "font": {"size": 12},
        }
    ],
)
env_fig.show()

## Histogram: Number of employee count in each `job Satisfaction` level group by `Attrition`

### JobSatisfaction
The employee's satisfaction with their job. **Categorical**

In [None]:
# Employee count per JobSatisfaction level, grouped by Attrition value.
fig = px.histogram(
    data,
    x="JobSatisfaction",
    color=target_column_name,
    title="Job Satisfaction Histogram",
    barmode='group',
)
fig.update_traces(textfont=dict(size=12, color="white"))
fig.update_layout(
    xaxis_title="Job Satisfaction",
    height=300,
    width=700,
    # Overrides the title given to px.histogram above.
    title_text='Job Satisfaction vs number of employee',
    title_x=.5,
    # Copyright notice placed below the plot area for this figure.
    annotations=[
        {
            "text": '© Shakhiul Abrar',
            "xref": 'paper',
            "yref": 'paper',
            "x": 0.9,
            "y": -0.4,
            "showarrow": False,
            "font": {"size": 12},
        }
    ],
)
fig.show()


## Bar Chart: Total `Working Years` vs `Monthly Income` group by `Attrition`
### MonthlyIncome
The monthly income of the employee. **Numerical**

In [None]:
# MonthlyIncome against TotalWorkingYears as grouped bars, coloured by Attrition.
income_fig = px.bar(
    data,
    x="TotalWorkingYears",
    y="MonthlyIncome",
    color=target_column_name,
    title="Total Working Years vs Monthly Income",
    barmode="group",
)
income_fig.update_layout(
    title_x=0.5,
    xaxis_title="Total Working Years",
    yaxis_title="Monthly Income",
    annotations=copyright_ply,
)
income_fig.show()

## Density HeatMap: Monthly Income vs Attrition

In [None]:
# 2D density heatmap of MonthlyIncome against the Attrition label.
fig = px.density_heatmap(
    data,
    x="MonthlyIncome",
    y=target_column_name,
    color_continuous_scale="Viridis",
)
# Title spelling corrected ("Dencity Heamap" -> "Density Heatmap"); it is
# shown on the figure.
fig.update_layout(title="Density Heatmap with MonthlyIncome and Attrition", title_x=0.5)
fig.update_layout(annotations=copyright_ply)
fig.show()

## Scatter `Employee number` vs `Years At Company`
### YearsAtCompany
The number of years the employee has been with the company. **Numerical**

In [None]:
# Scatter of YearsAtCompany against EmployeeNumber using the numerically
# encoded frame, coloured by the Attrition column.
fig = px.scatter(
    data_norm,
    x='EmployeeNumber',
    y="YearsAtCompany",
    color=target_column_name,
    title='P',  # placeholder; replaced by the layout update below
    color_continuous_scale='viridis',
)
fig.update_layout(
    title="EmployeeNumber Vs YearsAtCompany With attrition",
    annotations=copyright_ply,
)
fig.show()


## Violin diagram `StockOptionLevel` and `Attrition`
### StockOptionLevel
The stock option level of the employee. **Numerical**

In [None]:
# Violin plot of StockOptionLevel per Attrition value, with an inner box
# and all individual points drawn.
fig = px.violin(
    data,
    x=target_column_name,
    y="StockOptionLevel",
    color=target_column_name,
    box=True,
    points="all",
    title="Stock Option Level vs Attrition",
)
fig.update_layout(
    title_x=0.5,
    xaxis_title="Attrition",
    yaxis_title="Stock Option Level",
    annotations=copyright_ply,
)
fig.show()

## Line Plot: Employee number vs `YearsSinceLastPromotion`
### YearsSinceLastPromotion
The number of years since the employee's last promotion. **Numerical**

In [None]:
# YearsSinceLastPromotion along EmployeeNumber, one line per Attrition value.
sns.lineplot(
    x="EmployeeNumber",
    y="YearsSinceLastPromotion",
    hue=target_column_name,
    data=data,
)
copyright_plt(plt)
plt.show()

## Parallel Coordinates
Shows parallel coordinates graph for all numerical columns

In [None]:
# Parallel-coordinates view of the numerical columns, coloured by the
# Attrition column of the encoded frame.
fig = px.parallel_coordinates(
    data_norm,
    dimensions=numerical_columns,
    color=target_column_name,
    color_continuous_scale=px.colors.sequential.Aggrnyl,
    title="Parallel Coordinates Plot",
)
# Width is left to Plotly's default; only the height is fixed.
fig.update_layout(
    title_text="Parallel Coordinates Plot",
    title_x=0.5,
    height=800,
    annotations=copyright_ply,
)
fig.show()

## Polar scatter plot
* radius: `Age`
* theta: `EmployeeNumber`
* color `Attrition`

In [None]:
# Polar scatter: radius is Age, angle is EmployeeNumber, colour is Attrition.
polar_fig = px.scatter_polar(
    data,
    r="Age",
    theta="EmployeeNumber",
    color=target_column_name,
    title="Scatter Polar Plot",
    color_continuous_scale=px.colors.sequential.Inferno,
)
polar_fig.update_layout(
    title_text="Scatter Polar Plot",
    title_x=0.5,
    width=800,
    height=600,
    annotations=copyright_ply,
)
polar_fig.show()

## Sankey diagram: `Attrition`, and `MaritalStatus`

In [None]:
columns = ['Attrition', 'MaritalStatus']

# Node labels: every distinct value found in the selected columns.
labels = pd.unique(data[columns].values.ravel('K')).tolist()
# Position of each label in the node list, for constant-time lookup.
label_position = {label: position for position, label in enumerate(labels)}

source, target, value = [], [], []

# One set of links per pair of adjacent columns: the size of each
# (left value, right value) group becomes the link weight.
for left_col, right_col in zip(columns, columns[1:]):
    df_grouped = data.groupby([left_col, right_col]).size().reset_index(name='count')
    source.extend(label_position[name] for name in df_grouped[left_col])
    target.extend(label_position[name] for name in df_grouped[right_col])
    value.extend(df_grouped['count'].tolist())

link = {"source": source, "target": target, "value": value}
node = {
    "label": labels,
    "pad": 15,
    "thickness": 20,
    "line": {"color": "black", "width": 0.5},
}

sankey_trace = go.Sankey(
    link=link,
    node=node,
    arrangement="snap",
    textfont=dict(size=10, color="black"),
)
fig = go.Figure(sankey_trace)
fig.update_layout(
    title_text="Sankey Diagram of Attrition, and MaritalStatus",
    title_x=0.5,
    annotations=copyright_ply,
)
fig.show()

## Pair plot `Age`, `DistanceFromHome`, `TotalWorkingYears`

In [None]:
# Pairwise scatter grid for three columns with KDE curves on the diagonal,
# coloured by Attrition.
pair_vars = ["Age", "DistanceFromHome", "TotalWorkingYears"]
sns.pairplot(data, vars=pair_vars, hue=target_column_name, diag_kind="kde")
copyright_plt(plt)
plt.show()

## Histogram: PercentSalaryHike, YearsSinceLastPromotion, TotalWorkingYears
The percentage of salary hike for the employee. **Numerical**

Years passed since the last promotion. **Numerical**

Total Working Years **Numerical**

In [None]:
columns = ["PercentSalaryHike", "YearsSinceLastPromotion", "TotalWorkingYears"]
titles = ["Percent Salary Hike Histogram", "Years Since Last Promotion Histogram", "Total Working Years Histogram"]
xlabels = ["Percent Salary Hike", "Years Since Last Promotion", "Total Working Years"]
colors = ["g", "b", "r"]

plt.figure(figsize=(10, 7))
# One density histogram (with KDE curve) per column, filling a 2x2 grid in
# order; subplot positions are 1-based.
panels = zip(columns, titles, xlabels, colors)
for position, (column, title, xlabel, color) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(
        data[column],
        bins=10,
        color=color,
        edgecolor='black',
        alpha=0.5,
        stat='density',
        kde=True,
    )
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Density")

plt.tight_layout()
copyright_plt(plt)
plt.show()


## Correlation HeatMap
Correlation heatmap across all columns

In [None]:
# Pearson correlation between every pair of columns in the encoded frame.
corr_mat = data_norm.corr(method="pearson")

heatmap_trace = go.Heatmap(
    z=corr_mat.values,
    x=corr_mat.columns,
    y=corr_mat.index,
    colorscale="Viridis",
)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    title="Correlation HeatMap",
    height=600,
    title_x=0.5,
    # Copyright notice placed above the plot area for this figure.
    annotations=[
        {
            "text": '© Shakhiul Abrar',
            "xref": 'paper',
            "yref": 'paper',
            "x": 0.9,
            "y": 1.1,
            "showarrow": False,
            "font": {"size": 12},
        }
    ],
)
fig.show()

## Bar chart: correlation of `Attrition` with every other column

In [None]:
# Pearson correlation of every column with the target, sorted from highest
# to lowest.
corr_mat = data_norm.corr(method="pearson")
corr_mat = corr_mat[target_column_name].sort_values(ascending=False)

# Correlation values go on x and column names on y.
fig = px.bar(
    x=corr_mat.values,
    y=corr_mat.index,
    title="Correlation with Attrition",
    color=corr_mat.values,
    color_continuous_scale=px.colors.sequential.Plasma
)
fig.update_layout(
    title_x=0.5,
    # Axis titles now match the plotted data: the original had them swapped
    # (x was labelled "Columns" although it carries the correlation values).
    xaxis_title="Correlation with Attrition",
    yaxis_title="Columns",
    height=700
)
fig.update_layout(annotations=copyright_ply)
fig.show()