In [26]:
import pandas as pd
import numpy as np

In [27]:
# Read the HR attrition CSV (path is relative to the working directory) into a DataFrame.
df = pd.read_csv('HR-Employee-Attrition.csv')
# Show the first five rows as a quick check that the file parsed.
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


# Data Cleaning:

#### 1. Deleting redundant columns.
#### 2. Renaming the columns.
#### 3. Dropping duplicates.
#### 4. Cleaning individual columns.
#### 5. Removing NaN values from the dataset.
#### 6. Checking whether further transformations are needed.

In [28]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(1470, 35)

In [29]:
# Per-column dtype and non-null count; a count below the row total would indicate missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [30]:
# Shape displayed a second time; none of the preceding visible cells modifies df after the load.
df.shape


(1470, 35)

In [31]:
# Number of rows that are exact repeats (across all columns) of an earlier row.
df.duplicated().sum()

np.int64(0)

In [32]:
# List every column label in the frame.
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [33]:
# Columns retained for the cleaned dataset.
columns_to_keep = [
    'Age',
    'Attrition',
    'BusinessTravel',
    'DailyRate',
    'Department',
    'DistanceFromHome',
    'EducationField',
    'Gender',
    'HourlyRate',
    'JobRole',
    'MaritalStatus',
    'MonthlyIncome',
    'MonthlyRate',
    'YearsAtCompany',
]

# Every other column of the raw file (the complement of columns_to_keep).
columns_to_drop = [
    'Education',
    'EmployeeCount',
    'EmployeeNumber',
    'EnvironmentSatisfaction',
    'JobInvolvement',
    'JobLevel',
    'JobSatisfaction',
    'NumCompaniesWorked',
    'Over18',
    'OverTime',
    'PercentSalaryHike',
    'PerformanceRating',
    'RelationshipSatisfaction',
    'StandardHours',
    'StockOptionLevel',
    'TotalWorkingYears',
    'TrainingTimesLastYear',
    'WorkLifeBalance',
    'YearsInCurrentRole',
    'YearsSinceLastPromotion',
    'YearsWithCurrManager',
]

In [34]:
# Keep only the columns listed in columns_to_keep. Selecting with a list returns a frame
# that pandas flags as a possible copy of the original; .copy() makes it an independent
# frame so that later modifications of df do not raise SettingWithCopyWarning.
df = df[columns_to_keep].copy()

In [35]:
# columns_to_drop is not used by the column selection (which goes through
# columns_to_keep), so the name is removed from the namespace here.
del columns_to_drop

In [36]:
# Remove DistanceFromHome. The result is reassigned instead of using inplace=True:
# df comes from a column selection, and mutating such a frame in place can raise
# SettingWithCopyWarning. inplace=True also brings no speed benefit and blocks chaining.
df = df.drop(columns='DistanceFromHome')
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,MaritalStatus,MonthlyIncome,MonthlyRate,YearsAtCompany
0,41,Yes,Travel_Rarely,1102,Sales,Life Sciences,Female,94,Sales Executive,Single,5993,19479,6
1,49,No,Travel_Frequently,279,Research & Development,Life Sciences,Male,61,Research Scientist,Married,5130,24907,10
2,37,Yes,Travel_Rarely,1373,Research & Development,Other,Male,92,Laboratory Technician,Single,2090,2396,0
3,33,No,Travel_Frequently,1392,Research & Development,Life Sciences,Female,56,Research Scientist,Married,2909,23159,8
4,27,No,Travel_Rarely,591,Research & Development,Medical,Male,40,Laboratory Technician,Married,3468,16632,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel_Frequently,884,Research & Development,Medical,Male,41,Laboratory Technician,Married,2571,12290,5
1466,39,No,Travel_Rarely,613,Research & Development,Medical,Male,42,Healthcare Representative,Married,9991,21457,7
1467,27,No,Travel_Rarely,155,Research & Development,Life Sciences,Male,87,Manufacturing Director,Married,6142,5174,6
1468,49,No,Travel_Frequently,1023,Sales,Medical,Male,63,Sales Executive,Married,5390,13243,9


In [37]:
# Dtype of each remaining column.
df.dtypes

Age                int64
Attrition         object
BusinessTravel    object
DailyRate          int64
Department        object
EducationField    object
Gender            object
HourlyRate         int64
JobRole           object
MaritalStatus     object
MonthlyIncome      int64
MonthlyRate        int64
YearsAtCompany     int64
dtype: object

In [38]:
# Turn labels such as 'Travel_Rarely' into 'Travel Rarely'. The underscore is a plain
# literal, so regex=False states the intent and skips the regex engine; the output is
# the same as with regex=True.
df['BusinessTravel'] = df['BusinessTravel'].str.replace('_', ' ', regex=False)

In [39]:
# Display the frame to confirm the BusinessTravel labels no longer contain underscores.
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,MaritalStatus,MonthlyIncome,MonthlyRate,YearsAtCompany
0,41,Yes,Travel Rarely,1102,Sales,Life Sciences,Female,94,Sales Executive,Single,5993,19479,6
1,49,No,Travel Frequently,279,Research & Development,Life Sciences,Male,61,Research Scientist,Married,5130,24907,10
2,37,Yes,Travel Rarely,1373,Research & Development,Other,Male,92,Laboratory Technician,Single,2090,2396,0
3,33,No,Travel Frequently,1392,Research & Development,Life Sciences,Female,56,Research Scientist,Married,2909,23159,8
4,27,No,Travel Rarely,591,Research & Development,Medical,Male,40,Laboratory Technician,Married,3468,16632,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,No,Travel Frequently,884,Research & Development,Medical,Male,41,Laboratory Technician,Married,2571,12290,5
1466,39,No,Travel Rarely,613,Research & Development,Medical,Male,42,Healthcare Representative,Married,9991,21457,7
1467,27,No,Travel Rarely,155,Research & Development,Life Sciences,Male,87,Manufacturing Director,Married,6142,5174,6
1468,49,No,Travel Frequently,1023,Sales,Medical,Male,63,Sales Executive,Married,5390,13243,9


In [40]:
# Encode Attrition as 'Yes' -> 1, 'No' -> 0. The mapping also accepts 1 and 0 so that
# re-running this cell is a no-op; with only the string keys, a second run would turn
# every (already encoded) value into NaN.
df['Attrition'] = df['Attrition'].map({'Yes': 1, 'No': 0, 1: 1, 0: 0})
# Series.map returns NaN for any value missing from the dict, so fail loudly instead
# of carrying silent NaNs into the exported file.
assert df['Attrition'].notna().all(), "unexpected Attrition label"

In [41]:
# Display the frame to confirm Attrition now holds 1/0 instead of 'Yes'/'No'.
df

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,MaritalStatus,MonthlyIncome,MonthlyRate,YearsAtCompany
0,41,1,Travel Rarely,1102,Sales,Life Sciences,Female,94,Sales Executive,Single,5993,19479,6
1,49,0,Travel Frequently,279,Research & Development,Life Sciences,Male,61,Research Scientist,Married,5130,24907,10
2,37,1,Travel Rarely,1373,Research & Development,Other,Male,92,Laboratory Technician,Single,2090,2396,0
3,33,0,Travel Frequently,1392,Research & Development,Life Sciences,Female,56,Research Scientist,Married,2909,23159,8
4,27,0,Travel Rarely,591,Research & Development,Medical,Male,40,Laboratory Technician,Married,3468,16632,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,Travel Frequently,884,Research & Development,Medical,Male,41,Laboratory Technician,Married,2571,12290,5
1466,39,0,Travel Rarely,613,Research & Development,Medical,Male,42,Healthcare Representative,Married,9991,21457,7
1467,27,0,Travel Rarely,155,Research & Development,Life Sciences,Male,87,Manufacturing Director,Married,6142,5174,6
1468,49,0,Travel Frequently,1023,Sales,Medical,Male,63,Sales Executive,Married,5390,13243,9


In [42]:
# Write the cleaned frame to an Excel workbook in the working directory;
# index=False leaves the row index out of the sheet.
df.to_excel('HR Attrition _ cleaned.xlsx', index=False)

In [43]:
# Rows whose encoded Attrition value is 1.
df.loc[df['Attrition'].eq(1)]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,MaritalStatus,MonthlyIncome,MonthlyRate,YearsAtCompany
0,41,1,Travel Rarely,1102,Sales,Life Sciences,Female,94,Sales Executive,Single,5993,19479,6
2,37,1,Travel Rarely,1373,Research & Development,Other,Male,92,Laboratory Technician,Single,2090,2396,0
14,28,1,Travel Rarely,103,Research & Development,Life Sciences,Male,50,Laboratory Technician,Single,2028,12947,4
21,36,1,Travel Rarely,1218,Sales,Life Sciences,Male,82,Sales Representative,Single,3407,6986,5
24,34,1,Travel Rarely,699,Research & Development,Medical,Male,83,Research Scientist,Single,2960,17102,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1438,23,1,Travel Frequently,638,Sales,Marketing,Male,33,Sales Representative,Married,1790,26956,1
1442,29,1,Travel Rarely,1092,Research & Development,Medical,Male,36,Research Scientist,Married,4787,26124,2
1444,56,1,Travel Rarely,310,Research & Development,Technical Degree,Male,72,Laboratory Technician,Married,2339,3666,10
1452,50,1,Travel Frequently,878,Sales,Life Sciences,Male,94,Sales Executive,Divorced,6728,14255,6


In [44]:
# Rows whose encoded Attrition value is 0.
df.loc[df['Attrition'].eq(0)]

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,EducationField,Gender,HourlyRate,JobRole,MaritalStatus,MonthlyIncome,MonthlyRate,YearsAtCompany
1,49,0,Travel Frequently,279,Research & Development,Life Sciences,Male,61,Research Scientist,Married,5130,24907,10
3,33,0,Travel Frequently,1392,Research & Development,Life Sciences,Female,56,Research Scientist,Married,2909,23159,8
4,27,0,Travel Rarely,591,Research & Development,Medical,Male,40,Laboratory Technician,Married,3468,16632,2
5,32,0,Travel Frequently,1005,Research & Development,Life Sciences,Male,79,Laboratory Technician,Single,3068,11864,7
6,59,0,Travel Rarely,1324,Research & Development,Medical,Female,81,Laboratory Technician,Married,2670,9964,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,0,Travel Frequently,884,Research & Development,Medical,Male,41,Laboratory Technician,Married,2571,12290,5
1466,39,0,Travel Rarely,613,Research & Development,Medical,Male,42,Healthcare Representative,Married,9991,21457,7
1467,27,0,Travel Rarely,155,Research & Development,Life Sciences,Male,87,Manufacturing Director,Married,6142,5174,6
1468,49,0,Travel Frequently,1023,Sales,Medical,Male,63,Sales Executive,Married,5390,13243,9
