In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import plotly.graph_objects as go
import plotly.figure_factory as ff

import kagglehub
from kagglehub import KaggleDatasetAdapter

from os import path
from sys import exit
import math

In [None]:
class DatasetLoader:
    """Load the HR attrition CSV from a local file, GitHub, or Kaggle.

    Every ``load_from_*`` method is best-effort: it returns the loaded
    ``DataFrame`` on success and ``None`` on failure, so callers can try the
    sources one after another.
    """

    def __init__(self):
        # Dataset handle passed to kagglehub.
        self.kaggle_url = "thedevastator/employee-attrition-and-factors"

        # NOTE(review): the doubled ".csv" suffix presumably matches the
        # published file name -- confirm before "fixing" it.
        self.file_name = "HR_Analytics.csv.csv"
        self.dir_name = "dataset"

        # Local copy is looked up relative to the parent of the working directory.
        self.file_path = path.join("..", self.dir_name, self.file_name)
        self.github_url = f'https://raw.githubusercontent.com/AbrarShakhi/employee-attrition-predictor/main/{self.dir_name}/{self.file_name}'

    @staticmethod
    def _try_load(source, fetch):
        """Call ``fetch()`` and return its result, or ``None`` if it raises.

        Only ``Exception`` is caught, so ``KeyboardInterrupt`` and
        ``SystemExit`` still propagate (a bare ``except:`` would swallow a
        kernel interrupt during a slow download). The failure reason is
        printed so a fallback to the next source is not silent.
        """
        try:
            return fetch()
        except Exception as err:
            print(f"Could not load dataset from {source}: {type(err).__name__}: {err}")
            return None

    def load_from_kaggle(self):
        """Return the dataset fetched through kagglehub, or ``None`` on failure."""
        return self._try_load(
            "Kaggle",
            lambda: kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, self.kaggle_url, self.file_name),
        )

    def load_from_github(self):
        """Return the dataset read from ``self.github_url``, or ``None`` on failure."""
        return self._try_load("GitHub", lambda: pd.read_csv(self.github_url))

    def load_from_local(self):
        """Return the dataset read from ``self.file_path``, or ``None`` on failure."""
        return self._try_load("local file", lambda: pd.read_csv(self.file_path))

In [None]:
# Try each source in order of preference (local file, then GitHub, then
# Kaggle) and stop at the first one that yields a frame.
loader = DatasetLoader()
df = None
for fetch in (loader.load_from_local, loader.load_from_github, loader.load_from_kaggle):
    df = fetch()
    if df is not None:
        break
else:
    # Every source returned None: nothing to analyse.
    print("Unable to find data..")
    exit(1)

df.head(5)

In [None]:
# Count missing cells over the whole frame (all rows, all columns).
null_total = df.isnull().sum().sum()
print("total number of null values in dataset: ", null_total)

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Name of the label column; the plotting cells refer to it through this variable.
target_column_name = "Attrition"
# Preview the first few label values.
df[target_column_name].head()

In [None]:
# Tally each label into a two-column frame (label, Count) for the pie chart.
attrition_counts = (
    df[target_column_name]
    .value_counts()
    .rename_axis(target_column_name)
    .reset_index(name='Count')
)

fig = px.pie(
    attrition_counts,
    names=target_column_name,
    values='Count',
    title='Attrition Pie chart',
    width=400,
    height=400,
)
fig.show()


# Remove the columns that have only one value


In [None]:
# A column with a single distinct value cannot help separate the classes.
# Collect such columns first and drop them in one step, instead of mutating
# the frame in place while looping over its own column index.
# dropna=False keeps the original semantics: NaN counts as a value.
constant_columns = [col for col in df.columns if df[col].nunique(dropna=False) == 1]
df = df.drop(columns=constant_columns)
for col in constant_columns:
    print(f"Removed column: {col}")

In [None]:
# Split the columns into three groups used by the rest of the notebook:
#   ordinal_columns - text columns plus the integer-coded categorical columns
#                     listed below (summarised with frequency tables)
#   nominal_columns - the remaining int64/float64 columns (summarised with
#                     mean/median/variance)
#   other_columns   - anything that is neither text nor int64/float64
# NOTE(review): "nominal" here means numeric and "ordinal" means categorical;
# the names are kept because later cells reference them.
nominal_columns = []
ordinal_columns = []
other_columns = []

# Columns treated as categorical even when stored as integer codes.
categorical_by_definition = [
    "Attrition",
    "BusinessTravel",
    "Department",
    "EducationField",
    "EnvironmentSatisfaction",
    "Gender",
    "JobInvolvement",
    "JobLevel",
    "JobRole",
    "JobSatisfaction",
    "MaritalStatus",
    "OverTime",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "WorkLifeBalance",
]

for col in df.columns:
    if df[col].dtype == "object":
        ordinal_columns.append(col)
    elif df[col].dtype != "int64" and df[col].dtype != "float64":
        other_columns.append(col)

for col in categorical_by_definition:
    # Skip names that are not in the frame (e.g. removed as constant):
    # listing them would break the per-column summaries with a KeyError and
    # make the totals below disagree.
    if col in df.columns and col not in ordinal_columns:
        ordinal_columns.append(col)

for col in df.columns:
    # Every object-dtype column is already in ordinal_columns, so whatever is
    # left unclassified here is numeric.
    if col not in ordinal_columns and col not in other_columns:
        nominal_columns.append(col)

print("Nominal columns: ", nominal_columns)
print("Ordinal columns: ", ordinal_columns)
print("Other columns: ", other_columns)

print("Total nominal columns: ", len(nominal_columns))
print("Total ordinal columns: ", len(ordinal_columns))
print("Total other columns: ", len(other_columns))
# Sanity check: the three groups together should account for every column.
print("Nominal + ordinal + other columns: ", len(nominal_columns) + len(ordinal_columns) + len(other_columns))
print("Total columns: ", len(df.columns))

In [None]:
def print_nominal_info(col):
    """Print the name, mean, median, variance and standard deviation of ``df[col]``.

    Reads the notebook-level ``df``; each statistic is computed right before
    it is printed, one line per statistic.
    """
    print("Column name: ", col)
    series = df[col]
    for label, method_name in (
        ("Mean: ", "mean"),
        ("Median: ", "median"),
        ("varience: ", "var"),
        ("std-dev: ", "std"),
    ):
        print(label, getattr(series, method_name)())

def print_ordinal_info(col):
    """Print a frequency table for ``df[col]``.

    The table has one row per distinct value with its count ("Frequency")
    and its share of all rows of ``df`` in percent ("Percentage").
    """
    frequency = df[col].value_counts()
    percentage = frequency / len(df) * 100
    summary = pd.DataFrame({"Frequency": frequency, "Percentage": percentage})
    print(summary)

In [None]:
# Summary statistics of the numeric columns, shown with one row per column.
df.describe().T

In [None]:
# Summary of the text (object-dtype) columns, shown with one row per column.
df.describe(include="object").T

In [None]:
# Print mean/median/variance/std-dev for every column in nominal_columns,
# separated by a blank line.
for col in nominal_columns:
    print_nominal_info(col)
    print()

In [None]:
# Print the frequency/percentage table for every column in ordinal_columns,
# separated by a blank line.
for col in ordinal_columns:
    print_ordinal_info(col)
    print()

# Convert categorical data into numbers

In [None]:
# Encode on an independent copy. `pd.DataFrame(df)` does not copy the data,
# and depending on the pandas version the column assignments below could
# overwrite the text labels in `df`, which the plotting cells still need.
df_norm = df.copy()


def convert_categorical_to_numerical(col):
    """Return the integer category codes of ``df_norm[col]``.

    The column is cast to pandas' ``category`` dtype and its ``cat.codes``
    are returned (pandas uses -1 for missing values).
    """
    return df_norm[col].astype("category").cat.codes


def convert_categorical_to_numerical_all(columns):
    """Replace every object-dtype column of ``df_norm`` named in ``columns`` by its codes.

    Columns with any other dtype are left untouched. Returns ``df_norm``.
    """
    for col in columns:
        if df_norm[col].dtype == "object":
            df_norm[col] = convert_categorical_to_numerical(col)
    return df_norm


convert_categorical_to_numerical_all(df_norm.columns)
df_norm

### Age
The age of the employee. **Numerical**

In [None]:
# Age distribution with one bar series per attrition label, side by side.
px.histogram(
    df,
    x="Age",
    color=target_column_name,
    barmode="group",
).show()


In [None]:
# 2-D density of Age against the attrition label.
px.density_heatmap(
    df,
    x="Age",
    y=target_column_name,
).update_layout(
    title="Dencity Heamap with age and Attrition",
).show()

### BusinessTravel
The frequency of business travel for the employee. **Categorical**

In [None]:
# Tally each travel category into a two-column frame (category, Count).
business_travel_counts = (
    df['BusinessTravel']
    .value_counts()
    .rename_axis('BusinessTravel')
    .reset_index(name='Count')
)

fig = px.pie(
    business_travel_counts,
    names='BusinessTravel',
    values='Count',
    title='Business Travel Pie chart',
    width=400,
    height=400,
)
fig.show()

### DailyRate
The daily rate of pay for the employee. **Numerical**

In [None]:
# Horizontal box plot showing the spread of DailyRate.
px.box(
    df,
    x="DailyRate",
    title="Box plot of DailyRate",
).update_layout(
    xaxis_title="DailyRate",
    height=300,
).show()

### Department
The department the employee works in. **Categorical**

In [None]:
# Employees per department, one bar series per attrition label.
fig = px.histogram(df, x="Department", title="Department Histogram", color=target_column_name, barmode="group")
fig.update_layout(xaxis_title="Department", height=400, width=400)
# Show the histogram before `fig` is reused: without this call the figure was
# replaced by the heatmap below and never rendered.
fig.show()

# Joint density of attrition label and department.
fig = px.density_heatmap(df, x="Attrition", y="Department")
fig.show()

### DistanceFromHome
The distance from home in miles for the employee. **Numerical**

In [None]:

# Histogram of commuting distance with a kernel-density curve on top.
distance_axes = sns.histplot(
    data=df,
    x="DistanceFromHome",
    bins=10,
    kde=True,
    color="blue",
)
distance_axes.set_title("DistanceFromHome Histogram and dencity line")
plt.show()


### Education
The level of education achieved by the employee. **Categorical**

In [None]:
# 3-D scatter of Education, Age and EmployeeNumber, coloured by attrition.
axis_columns = dict(x='Education', y='Age', z='EmployeeNumber')
px.scatter_3d(df, color='Attrition', opacity=0.5, **axis_columns).show()

### EnvironmentSatisfaction
The employee's satisfaction with their work environment. **Categorical**

In [None]:
# Environment satisfaction levels, one bar series per attrition label.
fig = px.histogram(df, x="EnvironmentSatisfaction", color=target_column_name, barmode="group")
fig.update_layout(title="Environment Satisfaction", width=300, height=300)
fig.show()

### JobSatisfaction
The employee's satisfaction with their job. **Categorical**

In [None]:
# Job satisfaction levels, one bar series per attrition label.
fig = px.histogram(
    df,
    x="JobSatisfaction",
    color=target_column_name,
    barmode='group',
    title="Job Satisfaction Histogram",
)
# The layout title set here replaces the one passed to px.histogram above.
fig.update_layout(
    xaxis_title="Job Satisfaction",
    height=300,
    width=700,
    title_text='Job Satisfaction vs number of employee',
    title_x=.5,
)
fig.update_traces(textfont_size=12, textfont_color="white")
fig.show()


### MonthlyIncome
The monthly income of the employee. **Numerical**

In [None]:
# Monthly income distribution, one bar series per attrition label.
fig = px.histogram(df, x="MonthlyIncome", color=target_column_name, barmode="group")

# Fixed tick positions with abbreviated ("10k") labels on the income axis.
income_axis = dict(
    tickmode="array",
    tickvals=[0, 10000, 20000, 30000, 40000, 50000],
    ticktext=["0", "10k", "20k", "30k", "40k", "50k"],
)
fig.update_layout(
    title_text="Monthly Income vs number of employee",
    title_x=0.5,
    xaxis_title="Monthly Income",
    height=300,
    width=700,
    xaxis=income_axis,
)
fig.show()

### OverTime
Whether or not the employee works overtime. **Categorical**

In [None]:
# Scatter of EmployeeNumber against Age on the encoded frame, so the
# attrition code is drawn on a continuous colour scale.
# The title passed to px.scatter is replaced by the update_layout call.
px.scatter(
    df_norm,
    x='EmployeeNumber',
    y="Age",
    color=target_column_name,
    color_continuous_scale='viridis',
    title='P',
).update_layout(
    title="EmployeeNumber Vs Age With attrition",
).show()

In [None]:
# Years since last promotion along EmployeeNumber, one line per attrition label.
sns.lineplot(
    x="EmployeeNumber",
    y="YearsSinceLastPromotion",
    hue=target_column_name,
    data=df,
)
plt.show()

In [None]:
# Parallel-coordinates view of the numeric columns of the encoded frame,
# with each line coloured by the attrition code.
px.parallel_coordinates(
    df_norm,
    dimensions=nominal_columns,
    color=target_column_name,
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Parallel Coordinates Plot",
).update_layout(
    title_text="Parallel Coordinates Plot",
    title_x=0.5,
).show()

In [None]:
# Polar scatter: Age as radius, EmployeeNumber as angle, coloured by attrition.
fig = px.scatter_polar(
    df,
    r="Age",
    theta="EmployeeNumber",
    color=target_column_name,
    color_continuous_scale=px.colors.sequential.Inferno,
    title="Scatter Polar Plot",
)
fig.update_layout(title_text="Scatter Polar Plot", title_x=0.5, width=800, height=600)
fig.show()



In [None]:

# Columns compared pairwise; the diagonal shows a KDE per attrition label.
pairplot_columns = [
    "Age",
    "DistanceFromHome",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "NumCompaniesWorked",
    "TotalWorkingYears",
]
sns.pairplot(df, vars=pairplot_columns, hue=target_column_name, diag_kind="kde")
plt.show()

### PercentSalaryHike, YearsSinceLastPromotion, TotalWorkingYears
The percentage of salary hike for the employee. **Numerical**

The number of years passed since the employee's last promotion. **Numerical**

Total Working Years **Numerical**

In [None]:
plt.figure(figsize=(10, 5))

# (column, title, x-axis label) for each panel of the 2x2 grid, in drawing order.
panels = [
    ("PercentSalaryHike", "Percent Salary Hike Histogram", "Percent Salary Hike"),
    ("YearsSinceLastPromotion", "Years SinceLast Promotion Histogram", "Years SinceLast Promotion"),
    ("TotalWorkingYears", "Total Working Years Histogram", "Total Working Years"),
]
for position, (column, panel_title, x_label) in enumerate(panels, start=1):
    plt.subplot(2, 2, position)
    plt.hist(df[column], bins=20)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel("Number of Employees")

plt.tight_layout()
plt.show()

### Although there are no NULL values, any rows containing them would be dropped here

In [None]:
# Drop every row that contains at least one missing value.
# NOTE(review): df_norm was derived from df in an earlier cell, so this
# reassignment does not reach it -- confirm whether the correlation computed
# from df_norm should also use the filtered rows.
df = df.dropna()

# Correlation HeatMap

In [None]:
# Pairwise correlation between all columns of the numerically encoded frame.
corr_mat = df_norm.corr()

# One heatmap cell per column pair; both axes carry the column names.
heatmap = go.Heatmap(
    z=corr_mat.values,
    x=corr_mat.columns,
    y=corr_mat.index,
    colorscale="Viridis",
)
fig = go.Figure(data=heatmap)
fig.update_layout(title="Corr HeatMap", height=600)
fig.show()