In [None]:
## data source: https://www.kaggle.com/pavansubhasht/ibm-hr-analytics-attrition-dataset
## This is a fictional dataset created by IBM Scientists. 
## EDA and Data Manipulation analysis around reasons for employee attrition

In [1]:
# import all the necessary libraries
import pandas as pd
import numpy as np

In [2]:
# load the HR attrition CSV into the working DataFrame
df = pd.read_csv(filepath_or_buffer="HR-Employee-Attrition.csv")

In [3]:
# preview the first five records
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
# dimensions of the loaded frame as (row count, column count)
df.shape

(1470, 35)

In [5]:
# column-by-column summary: non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [6]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,...,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0


In [None]:
## Which department has seen the most attrition?

In [7]:
# head-count of each Attrition value (Yes / No) within every department
(
    df
    .groupby("Department")["Attrition"]
    .value_counts()
)

Department              Attrition
Human Resources         No            51
                        Yes           12
Research & Development  No           828
                        Yes          133
Sales                   No           354
                        Yes           92
Name: Attrition, dtype: int64

In [8]:
# same breakdown, restricted to the rows where Attrition is 'Yes'
(
    df
    .query("Attrition=='Yes'")
    .groupby("Department")["Attrition"]
    .value_counts()
)

Department              Attrition
Human Resources         Yes           12
Research & Development  Yes          133
Sales                   Yes           92
Name: Attrition, dtype: int64

In [9]:
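## Research & Development has the largest number of leavers (133 of 237).
## Note this is a raw head-count: relative to department size, attrition is highest in
## Sales (92/446 ≈ 20.6%), then Human Resources (12/63 ≈ 19.0%), then R&D (133/961 ≈ 13.8%).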

In [10]:
# numeric attrition flag: 1 where Attrition is 'Yes', 0 otherwise
attrition_is_yes = df['Attrition'] == 'Yes'
df['Attrition_mapped'] = attrition_is_yes.astype("int64")

In [11]:
# Number of leavers per department.
# Only rows with Attrition_mapped == 1 are kept, so summing the flag counts the
# employees who left each department.
# The aggregation is named by string ('sum') instead of passing np.sum:
# pandas >= 2.1 emits a FutureWarning for NumPy callables handed to agg.
dept_attrition = (
    df.query("Attrition_mapped==1")
    .groupby('Department')
    .agg({'Attrition_mapped': 'sum'})
)

In [12]:
# NOTE: despite the column name, this is each department's share of ALL leavers
# (the column sums to 100), not leavers divided by the department's head-count.
dept_attrition['Attrition_rate'] = (
    dept_attrition['Attrition_mapped']
    .mul(100)
    .div(dept_attrition['Attrition_mapped'].sum())
)

In [13]:
# show leaver counts and each department's share of all leavers
dept_attrition

Unnamed: 0_level_0,Attrition_mapped,Attrition_rate
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Human Resources,12,5.063291
Research & Development,133,56.118143
Sales,92,38.818565


In [14]:
# departments ordered from the largest to the smallest share of leavers
dept_attrition.sort_values(by='Attrition_rate', ascending=False)

Unnamed: 0_level_0,Attrition_mapped,Attrition_rate
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Research & Development,133,56.118143
Sales,92,38.818565
Human Resources,12,5.063291


In [None]:
## Which Job Role has seen the most attrition?

In [15]:
# Number of leavers per job role.
# Rows are pre-filtered to Attrition_mapped == 1, so summing the flag counts leavers.
# 'sum' is passed as a string because pandas >= 2.1 warns (FutureWarning) when a
# NumPy callable such as np.sum is handed to agg.
job_attrition = (
    df.query("Attrition_mapped==1")
    .groupby('JobRole')
    .agg({'Attrition_mapped': 'sum'})
)

In [16]:
# NOTE: 'Attrition_rate' here is each job role's share of ALL leavers (sums to 100),
# not leavers divided by the number of employees in that role.
job_attrition['Attrition_rate'] = (
    job_attrition['Attrition_mapped']
    .mul(100)
    .div(job_attrition['Attrition_mapped'].sum())
)
# job roles ordered from the largest to the smallest share of leavers
job_attrition.sort_values(by='Attrition_rate', ascending=False)

Unnamed: 0_level_0,Attrition_mapped,Attrition_rate
JobRole,Unnamed: 1_level_1,Unnamed: 2_level_1
Laboratory Technician,62,26.160338
Sales Executive,57,24.050633
Research Scientist,47,19.831224
Sales Representative,33,13.924051
Human Resources,12,5.063291
Manufacturing Director,10,4.219409
Healthcare Representative,9,3.797468
Manager,5,2.109705
Research Director,2,0.843882


In [None]:
## Does income parity across departments have an effect on attrition?
# hint: compare the mean of MonthlyIncome per department

In [17]:
# Average MonthlyIncome per department across all employees (leavers and stayers).
# 'mean' is passed as a string: pandas >= 2.1 emits a FutureWarning when a NumPy
# callable such as np.mean is handed to agg.
df.groupby('Department').agg({'MonthlyIncome': 'mean'})

Unnamed: 0_level_0,MonthlyIncome
Department,Unnamed: 1_level_1
Human Resources,6654.507937
Research & Development,6281.252862
Sales,6959.172646


In [18]:
# Per department, for leavers only (Attrition_mapped == 1):
#   Attrition_mapped -> number of leavers (sum of the 1-flags)
#   MonthlyIncome    -> their average monthly income
# Both aggregations are now named by string; the original mixed 'sum' with np.mean,
# and pandas >= 2.1 warns (FutureWarning) on NumPy callables passed to agg.
inc_attrition = (
    df.query("Attrition_mapped==1")
    .groupby('Department')
    .agg({'Attrition_mapped': 'sum', 'MonthlyIncome': 'mean'})
)

In [19]:
# NOTE: 'Attrition_rate' is each department's share of ALL leavers (sums to 100),
# not leavers divided by the department's head-count.
inc_attrition['Attrition_rate'] = (
    inc_attrition['Attrition_mapped']
    .mul(100)
    .div(inc_attrition['Attrition_mapped'].sum())
)
inc_attrition

Unnamed: 0_level_0,Attrition_mapped,MonthlyIncome,Attrition_rate
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Human Resources,12,3715.75,5.063291
Research & Development,133,4108.075188,56.118143
Sales,92,5908.456522,38.818565


In [22]:
# Per department, for employees who stayed (Attrition_mapped == 0):
#   Attrition_mapped -> number of stayers ('count' is used because the flag is 0
#                       on every retained row, so a sum would always be 0)
#   MonthlyIncome    -> their average monthly income
# 'mean' is passed as a string: pandas >= 2.1 emits a FutureWarning when a NumPy
# callable such as np.mean is handed to agg.
inc_no_attrition = (
    df.query("Attrition_mapped==0")
    .groupby('Department')
    .agg({'Attrition_mapped': 'count', 'MonthlyIncome': 'mean'})
)

In [25]:
# NOTE: 'retention_rate' is each department's share of ALL retained employees
# (sums to 100), not stayers divided by the department's head-count.
# NOTE(review): the recorded output also shows an 'Attrition_rate' column that no
# visible cell adds to this frame, and the execution counts skip (19 -> 22 -> 25);
# presumably left over from an earlier run -- re-run from a fresh kernel to confirm.
inc_no_attrition['retention_rate'] = (
    inc_no_attrition['Attrition_mapped']
    .mul(100)
    .div(inc_no_attrition['Attrition_mapped'].sum())
)
inc_no_attrition

Unnamed: 0_level_0,Attrition_mapped,MonthlyIncome,Attrition_rate,retention_rate
Department,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources,51,7345.980392,4.136253,4.136253
Research & Development,828,6630.326087,67.153285,67.153285
Sales,354,7232.240113,28.710462,28.710462


In [None]:
## Gender vs. monthly income for employees who left and employees who stayed

In [26]:
# Per gender, for leavers only (Attrition_mapped == 1):
#   Attrition_mapped -> number of leavers (row count)
#   MonthlyIncome    -> their average monthly income
# 'mean' is passed as a string: pandas >= 2.1 emits a FutureWarning when a NumPy
# callable such as np.mean is handed to agg.
attrition_yes = (
    df.query("Attrition_mapped==1")
    .groupby('Gender')
    .agg({'Attrition_mapped': 'count', 'MonthlyIncome': 'mean'})
)

In [27]:
# Each gender's share of ALL leavers (sums to 100), not leavers / head-count.
# The column name keeps its existing spelling ('attriton_rate') so any later
# reference to it continues to resolve.
attrition_yes['attriton_rate'] = (
    attrition_yes['Attrition_mapped']
    .mul(100)
    .div(attrition_yes['Attrition_mapped'].sum())
)
attrition_yes

Unnamed: 0_level_0,Attrition_mapped,MonthlyIncome,attriton_rate
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,87,4769.735632,36.708861
Male,150,4797.16,63.291139


In [28]:
# Per gender, for employees who stayed (Attrition_mapped == 0):
#   Attrition_mapped -> number of stayers (row count)
#   MonthlyIncome    -> their average monthly income
# 'mean' is passed as a string: pandas >= 2.1 emits a FutureWarning when a NumPy
# callable such as np.mean is handed to agg.
attrition_no = (
    df.query("Attrition_mapped==0")
    .groupby('Gender')
    .agg({'Attrition_mapped': 'count', 'MonthlyIncome': 'mean'})
)

In [29]:
# NOTE: 'retention_rate' is each gender's share of ALL retained employees
# (sums to 100), not stayers divided by that gender's head-count.
attrition_no['retention_rate'] = (
    attrition_no['Attrition_mapped']
    .mul(100)
    .div(attrition_no['Attrition_mapped'].sum())
)
attrition_no

Unnamed: 0_level_0,Attrition_mapped,MonthlyIncome,retention_rate
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,501,7019.429142,40.632603
Male,732,6704.964481,59.367397
