
# Introduction

Employee attrition is a major challenge for many businesses. It can lead to lost productivity, increased costs, and a decrease in morale. Understanding the factors that contribute to employee attrition is essential for businesses that want to retain their top talent.

# Importing Libraries

In [None]:
# importing some important library
import pandas as pd
import numpy as np
import matplotlib 
import matplotlib.pyplot as plt
import seaborn as sns

# Data Processing

In [None]:
# Load the HR attrition CSV into the pandas DataFrame hr_df
csv_path = "/kaggle/input/ibm-hr-analytics-attrition-dataset/WA_Fn-UseC_-HR-Employee-Attrition.csv"
hr_df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of hr_df
hr_df.head(n=5)

In [None]:
# Preview the last five rows of hr_df
hr_df.tail(n=5)

In [None]:
# List the column labels of the DataFrame hr_df
hr_df.columns

In [None]:
# Print a concise summary of the DataFrame hr_df
hr_df.info()

In [None]:
# Keep only the object-dtype columns in a separate DataFrame: hr_df1
hr_df1 = hr_df.select_dtypes(include='object')
hr_df1

In [None]:
# Keep only the int64-dtype columns in a separate DataFrame: hr_df2
hr_df2 = hr_df.select_dtypes(include='int64')
hr_df2

In [None]:
# Statistical summary of the object-dtype columns held in hr_df1
hr_df1.describe()

In [None]:
# Statistical summary of the int64 columns held in hr_df2,
# transposed so that each described column becomes a row
hr_df2.describe().transpose()

In [None]:
# Statistical summary of hr_df, transposed so that each described
# column becomes a row
hr_df.describe().transpose()

# Cleaning Data

In [None]:
# Flag, per column, whether hr_df contains any missing value
null_mask = hr_df.isna()
null_mask.any()

In [None]:
# Count the missing values in each column of hr_df
null_counts = hr_df.isna().sum()
null_counts

In [None]:
# True when at least one row of hr_df is an exact duplicate of an earlier row
duplicate_mask = hr_df.duplicated()
have_duplicated_row = duplicate_mask.any()
have_duplicated_row

**Some columns can be removed because their values do not affect the analysis results.**
* Over18: all values are Y

* EmployeeCount: all values are 1.0

* StandardHours: all values are 80.0

* EmployeeNumber: the employee ID, whose values do not affect the analysis results

In [None]:
# Remove the columns that are excluded from the analysis
columns_to_drop = ['Over18', 'EmployeeCount', 'StandardHours', 'EmployeeNumber']
hr_df = hr_df.drop(columns=columns_to_drop)

# Exploratory Data Analysis

In [None]:
# Draw the histograms produced by DataFrame.hist on a 15x15-inch figure
hist_figsize = (15, 15)
hr_df.hist(figsize=hist_figsize)

# Tighten the spacing between the subplots, then render
plt.tight_layout()
plt.show()

In [None]:
# Overlay the Age density of current employees (Attrition == 'No') and
# former employees (Attrition == 'Yes') on the same axes
for attrition_flag, curve_label in (('No', 'Active Employee'), ('Yes', 'Ex Employee')):
    ages = hr_df.loc[hr_df['Attrition'] == attrition_flag, 'Age']
    sns.kdeplot(ages, label=curve_label)

# The legend tells the two curves apart
plt.legend()
plt.show()

In [None]:
# Box plot of MonthlyIncome for each JobRole, with the mean marked on each box
ax = sns.boxplot(data=hr_df, x="JobRole", y="MonthlyIncome", showmeans=True)

# Vertical tick labels keep the job-role names from overlapping
plt.xticks(rotation=90)

# Fully opaque grid lines
plt.grid(True, alpha=1)

plt.tight_layout()
plt.show()

In [None]:
# Split the column labels of hr_df by dtype: object columns are treated as
# categorical, every numeric dtype as numerical
categorical_columns = hr_df.select_dtypes(include=['object']).columns
numerical_columns = hr_df.select_dtypes(include=['number']).columns

# Show both label sets, categorical first
for column_labels in (categorical_columns, numerical_columns):
    print(column_labels)

In [None]:
# White axes on a #9ed9cd figure background, drawn with the pastel palette
sns.set(rc={'axes.facecolor': 'White', 'figure.facecolor': '#9ed9cd'})
sns.set_palette('pastel')

# For every categorical column, draw a count plot next to a donut chart
for col in categorical_columns:
    fig, axes = plt.subplots(1, 2, figsize=(13, 5))

    # Left panel: number of rows per category
    ax = sns.countplot(data=hr_df, x=col, ax=axes[0])

    # Rotate the tick labels countplot already assigned instead of replacing
    # them: overwriting them with an alphabetically sorted list can attach the
    # wrong category name to a bar, because the bars are not necessarily drawn
    # in sorted order.
    ax.tick_params(axis='x', labelrotation=90)

    # Print the count on top of each bar
    for container in ax.containers:
        ax.bar_label(container)

    # Right panel: share of each category. Count once and reuse the result
    # for both the wedge sizes and their labels so the two always line up.
    counts = hr_df[col].value_counts()
    axes[1].pie(
        counts.values,
        labels=counts.index,
        autopct='%1.1f%%',
        pctdistance=0.85,
        colors=sns.color_palette('pastel'),
    )

    # A white disc placed explicitly on the pie's axes turns it into a donut
    axes[1].add_artist(plt.Circle((0, 0), 0.70, fc='White'))

    # Title the whole figure with the column name
    fig.suptitle(col, backgroundcolor='Black', color='white', fontsize=15)

    plt.show()

In [None]:
# One count plot per categorical column, with the bars split by Attrition
for col in categorical_columns:
    fig = plt.figure(figsize=(10, 5))

    ax = sns.countplot(x=hr_df[col], data=hr_df, hue='Attrition')

    # Print the count on top of each bar
    for bar_group in ax.containers:
        ax.bar_label(bar_group)

    # Title, axis labels and grid, set through the Axes returned by countplot
    ax.set_title(col, backgroundcolor='black', color='white', fontsize=20)
    plt.xticks(rotation=90)
    ax.set_xlabel(col, fontsize=20)
    ax.set_ylabel('Count', fontsize=20)
    ax.grid(True)

    plt.show()

In [None]:
# Lay the box plots out on a grid with a fixed number of columns. The row
# count is derived from the number of numerical columns so the grid cannot
# run out of slots when the column set changes.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(numerical_columns) // n_grid_cols))  # ceiling division

# 5 inches of height per grid row (6 rows give a 15x30 figure)
fig = plt.figure(figsize=(15, 5 * n_grid_rows))

# One horizontal box plot per numerical column
for idx, column in enumerate(numerical_columns):
    # Subplot positions are 1-based
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, idx + 1)

    sns.boxplot(x=column, data=hr_df, ax=ax)

    # Title, axis label and grid for this subplot
    ax.set_title(column, backgroundcolor='black', color='white', fontsize=20)
    ax.set_xlabel(column, fontsize=12)
    ax.grid(True)

# Tighten the spacing between the subplots, then render
plt.tight_layout()
plt.show()

In [None]:
# Count plot of JobLevel with the bars split by Attrition
fig = plt.figure(figsize=(10, 5))
ax = sns.countplot(data=hr_df, x="JobLevel", hue="Attrition")

# Print the count on top of each bar
for bar_group in ax.containers:
    ax.bar_label(bar_group)

# Title, axis labels and grid, set through the Axes returned by countplot
ax.set_title("Job level", backgroundcolor='black', color='white', fontsize=20)
ax.set_xlabel("Job level", fontsize=20)
ax.set_ylabel('Count', fontsize=20)
ax.grid(True)

plt.show()

In [None]:
# Scatter MonthlyIncome against Age, coloured by Attrition, one panel per Gender.
# relplot is a figure-level function that builds its own figure, so no
# plt.figure() call precedes it: that would only add an empty extra figure
# to the output.
sns.relplot(
    data=hr_df,
    y="MonthlyIncome",
    x="Age",
    hue="Attrition",
    col="Gender",
    kind="scatter",
    col_wrap=2,
)

# Show the plot
plt.show()


In [None]:
# Columns to plot MonthlyIncome against, one relational plot each
variables = ["MaritalStatus", "Department", "Education", "JobRole", "BusinessTravel"]

for variable in variables:
    # relplot is figure-level and creates its own figure, so no plt.figure()
    # call is made here (it would only emit an empty extra figure). Keep the
    # returned grid to reach the axes of every Gender panel.
    grid = sns.relplot(
        data=hr_df,
        y="MonthlyIncome",
        x=variable,
        hue="Attrition",
        col="Gender",
        kind="scatter",
        col_wrap=2,
    )

    # Rotate the tick labels on every panel. plt.xticks / plt.yticks act on
    # the current axes only, which would leave the other panel untouched.
    for facet_ax in grid.axes.flat:
        facet_ax.tick_params(axis='x', labelrotation=95)
        facet_ax.tick_params(axis='y', labelrotation=95)

    plt.show()

 # Analysis of graphs

* Attrition is highest for both men and women between 18 and 35 years of age, and gradually decreases with age.

* As income increases, attrition decreases.

* Attrition is much lower among divorced women.

* Attrition is higher for employees who usually travel than others, and this rate is higher for women than for men.

* Attrition is the highest for those in level 1 jobs.

* Women in the manager, research director, and laboratory technician roles have almost no attrition.

* Men with the position of sales expert have a lot of attrition.
