In [12]:
# Import the libraries used throughout the notebook
import pandas as pd
# `plt` must be the pyplot interface: the top-level `matplotlib` package does
# not expose plotting functions such as plot(), subplots() or show().
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as st
from scipy.stats import linregress

In [13]:
# Read the CSV file (path is relative to the notebook's working directory)
# into a DataFrame. low_memory=False asks pandas to parse the file in a single
# pass instead of in chunks, which avoids mixed-type inference within a column.
complete_data = pd.read_csv('HR-Employee-Attrition.csv', low_memory=False)
# Preview the first rows to confirm the file loaded as expected
complete_data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [14]:
# Inspect the dtype pandas inferred for each column
complete_data.dtypes

Age                          int64
Attrition                   object
BusinessTravel              object
DailyRate                    int64
Department                  object
DistanceFromHome             int64
Education                    int64
EducationField              object
EmployeeCount                int64
EmployeeNumber               int64
EnvironmentSatisfaction      int64
Gender                      object
HourlyRate                   int64
JobInvolvement               int64
JobLevel                     int64
JobRole                     object
JobSatisfaction              int64
MaritalStatus               object
MonthlyIncome                int64
MonthlyRate                  int64
NumCompaniesWorked           int64
Over18                      object
OverTime                    object
PercentSalaryHike            int64
PerformanceRating            int64
RelationshipSatisfaction     int64
StandardHours                int64
StockOptionLevel             int64
TotalWorkingYears   

In [15]:
# Count the non-null values in each column; a count lower than the number of
# rows would indicate missing values in that column
complete_data.count()

Age                         1470
Attrition                   1470
BusinessTravel              1470
DailyRate                   1470
Department                  1470
DistanceFromHome            1470
Education                   1470
EducationField              1470
EmployeeCount               1470
EmployeeNumber              1470
EnvironmentSatisfaction     1470
Gender                      1470
HourlyRate                  1470
JobInvolvement              1470
JobLevel                    1470
JobRole                     1470
JobSatisfaction             1470
MaritalStatus               1470
MonthlyIncome               1470
MonthlyRate                 1470
NumCompaniesWorked          1470
Over18                      1470
OverTime                    1470
PercentSalaryHike           1470
PerformanceRating           1470
RelationshipSatisfaction    1470
StandardHours               1470
StockOptionLevel            1470
TotalWorkingYears           1470
TrainingTimesLastYear       1470
WorkLifeBa

In [16]:
# Build the working frame in one chain: remove the unwanted columns, then
# use EmployeeNumber as the row index. Each step returns a new DataFrame,
# so complete_data is left untouched.
cleaned_data = (
    complete_data
    .drop(columns=['BusinessTravel', 'EmployeeCount', 'DistanceFromHome',
                   'Over18', 'StockOptionLevel', 'StandardHours'])
    .set_index('EmployeeNumber')
)

cleaned_data.head()

Unnamed: 0_level_0,Age,Attrition,DailyRate,Department,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,41,Yes,1102,Sales,2,Life Sciences,2,Female,94,3,...,11,3,1,8,0,1,6,4,0,5
2,49,No,279,Research & Development,1,Life Sciences,3,Male,61,2,...,23,4,4,10,3,3,10,7,1,7
4,37,Yes,1373,Research & Development,2,Other,4,Male,92,2,...,15,3,2,7,3,3,0,0,0,0
5,33,No,1392,Research & Development,4,Life Sciences,4,Female,56,3,...,11,3,3,8,3,3,8,7,3,0
7,27,No,591,Research & Development,1,Medical,1,Male,40,3,...,12,3,4,6,3,3,2,2,2,2


In [17]:
# Select the numeric columns to create a summary statistics table
numeric_cols = cleaned_data.select_dtypes(include='number').columns
# One row per statistic (mean, median, variance, standard deviation,
# standard error of the mean), one column per numeric field
summary_table = cleaned_data.loc[:, numeric_cols].agg([
    'mean',
    'median',
    'var',
    'std',
    'sem',
])
summary_table

Unnamed: 0,Age,DailyRate,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,...,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
mean,36.92381,802.485714,2.912925,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931,14313.1,...,15.209524,3.153741,2.712245,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
median,36.0,802.0,3.0,3.0,66.0,3.0,2.0,3.0,4919.0,14235.5,...,14.0,3.0,3.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
var,83.455049,162819.593737,1.048914,1.194829,413.285626,0.506319,1.225316,1.21627,22164860.0,50662880.0,...,13.395144,0.130194,1.169013,60.540563,1.662219,0.499108,37.53431,13.127122,10.384057,12.731595
std,9.135373,403.5091,1.024165,1.093082,20.329428,0.711561,1.10694,1.102846,4707.957,7117.786,...,3.659938,0.360824,1.081209,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
sem,0.238269,10.524335,0.026712,0.02851,0.530233,0.018559,0.028871,0.028764,122.7931,185.6463,...,0.095459,0.009411,0.0282,0.202939,0.033627,0.018426,0.159792,0.094499,0.084048,0.093064


In [18]:
# Count how many rows fall under each distinct value of the Department column
cleaned_data['Department'].value_counts()

Department
Research & Development    961
Sales                     446
Human Resources            63
Name: count, dtype: int64

In [19]:
# Count how many rows fall under each distinct value of the MaritalStatus column
cleaned_data['MaritalStatus'].value_counts()

MaritalStatus
Married     673
Single      470
Divorced    327
Name: count, dtype: int64

In [20]:
# Count how many rows fall under each distinct value of the EducationField column
cleaned_data['EducationField'].value_counts()

EducationField
Life Sciences       606
Medical             464
Marketing           159
Technical Degree    132
Other                82
Human Resources      27
Name: count, dtype: int64