Importing libraries

In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

Importing Machine Learning Libraries

In [2]:
import sklearn
from sklearn.model_selection import train_test_split

In [3]:
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides pandas / scikit-learn warnings
# that can point at real problems; consider filtering specific categories instead.
import warnings 
warnings.filterwarnings('ignore')

In [4]:
# Load the HR dataset from a CSV file in the working directory and preview the first rows.
hr = pd.read_csv('HR_comma_sep.csv')
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [5]:
# Rename the target column 'left' to 'Attrition' so that its meaning is explicit.
hr = hr.rename(columns={"left": "Attrition"})

In [6]:
# Confirm the rename: the target column should now appear as 'Attrition'.
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,Attrition,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [7]:
# Dataset dimensions as (rows, columns).
hr.shape

(14999, 10)

In [8]:
# Column dtypes and non-null counts.
hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   Attrition              14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [9]:
# Count missing values per column.
hr.isna().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
Attrition                0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

Note: No null values in our dataset

In [10]:
# Frequency of each salary band.
hr['salary'].value_counts()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

In [11]:
# Number of distinct salary bands.
hr['salary'].nunique()

3

In [12]:
# Head count per department.
hr['Department'].value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: Department, dtype: int64

### EDA- Exploratory Data Analysis

In [13]:
# Employees who left the organization (Attrition == 1), and how many there are.
Attrition = hr.loc[hr["Attrition"] == 1]
Attrition.shape

(3571, 10)

Observation: Out of 14,999 employees, 3,571 (about 24%) left the company

In [14]:
# Employees who stayed with the organization (Attrition == 0), and how many there are.
retained = hr.loc[hr["Attrition"] == 0]
retained.shape

(11428, 10)

In [15]:
# Compare the two groups (0 = stayed, 1 = left) on the mean of every numeric feature.
# numeric_only=True keeps the text columns ('Department', 'salary') out of the
# aggregation: pandas 2.x raises a TypeError on them instead of dropping them silently.
hr.groupby('Attrition').mean(numeric_only=True)

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
Attrition,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


**Satisfaction Level**: Satisfaction level is noticeably lower for employees leaving the firm (0.44) than for the retained ones (0.67)

**Average Monthly Hours**: Average monthly hours are higher for employees leaving the firm than for those retained (207 vs 199)

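**Promotion Last 5 Years**: Employees who were promoted in the last five years are more likely to have been retained: about 2.6% of retained employees had a promotion versus about 0.5% of those who left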


### DATA Preparation

## Important Steps:
- dummy variables
- rescaling
- model logistic regression


#### Steps for Dummy variables
- pd.get_dummies(df['column name'],drop_first=True)
- concat the original dataframe and dummy dataframe (hr = pd.concat([hr,salary_dummy],axis=1))
- drop the column whose dummies were created, i.e. salary (hr = hr.drop(['salary'], axis=1))

In [16]:
# One-hot encode the salary band. With drop_first=True the first category is
# omitted; 'high' is the band with no column in the result, so it is the baseline.
salary_dummy = pd.get_dummies(hr['salary'],drop_first=True)
salary_dummy.head()

Unnamed: 0,low,medium
0,1,0
1,0,1
2,0,1
3,1,0
4,1,0


In [17]:
# Append the salary indicator columns to the right-hand side of the frame.
hr = pd.concat([hr, salary_dummy], axis="columns")
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,Attrition,promotion_last_5years,Department,salary,low,medium
0,0.38,0.53,2,157,3,0,1,0,sales,low,1,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,0,1
2,0.11,0.88,7,272,4,0,1,0,sales,medium,0,1
3,0.72,0.87,5,223,5,0,1,0,sales,low,1,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,1,0


In [18]:
# The original text column is redundant now that its indicator columns exist.
hr = hr.drop(columns=["salary"])
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,Attrition,promotion_last_5years,Department,low,medium
0,0.38,0.53,2,157,3,0,1,0,sales,1,0
1,0.8,0.86,5,262,6,0,1,0,sales,0,1
2,0.11,0.88,7,272,4,0,1,0,sales,0,1
3,0.72,0.87,5,223,5,0,1,0,sales,1,0
4,0.37,0.52,2,159,3,0,1,0,sales,1,0


In [19]:
# One-hot encode the department. With drop_first=True the first category is
# omitted; 'IT' is the department with no column in the result, so it is the baseline.
department_dummy=pd.get_dummies(hr['Department'],drop_first=True)
department_dummy.head()

Unnamed: 0,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,1,0,0


In [20]:
# Append the department indicator columns to the right-hand side of the frame.
hr = pd.concat([hr, department_dummy], axis="columns")
hr.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,Attrition,promotion_last_5years,Department,low,medium,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.38,0.53,2,157,3,0,1,0,sales,1,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,sales,0,1,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,1,0,sales,0,1,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,1,0,sales,1,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,sales,1,0,0,0,0,0,0,0,1,0,0


In [21]:
# Remove the original text column; its information now lives in the indicator columns.
hr = hr.drop(columns="Department")

In [22]:
# List the columns that remain after encoding (features plus the 'Attrition' target).
hr.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'Attrition', 'promotion_last_5years', 'low', 'medium', 'RandD',
       'accounting', 'hr', 'management', 'marketing', 'product_mng', 'sales',
       'support', 'technical'],
      dtype='object')

## We do rescaling after the train-test split, so that the scaler learns its parameters from the training data only

In [23]:
# Train-test split
# First separate the dependent variable (y, the target) from the independent variables (X, the features).
# Dependent variable: pop() returns the 'Attrition' column and removes it from hr in place,
# so this cell cannot be re-run without rebuilding hr first.
y=hr.pop('Attrition')
y.head()

0    1
1    1
2    1
3    1
4    1
Name: Attrition, dtype: int64

In [24]:
# Independent variables: every column left in hr once 'Attrition' has been popped.
# X is another name for the same DataFrame object as hr, not a copy.
X=hr
X.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,low,medium,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
0,0.38,0.53,2,157,3,0,0,1,0,0,0,0,0,0,0,1,0,0
1,0.8,0.86,5,262,6,0,0,0,1,0,0,0,0,0,0,1,0,0
2,0.11,0.88,7,272,4,0,0,0,1,0,0,0,0,0,0,1,0,0
3,0.72,0.87,5,223,5,0,0,1,0,0,0,0,0,0,0,1,0,0
4,0.37,0.52,2,159,3,0,0,1,0,0,0,0,0,0,0,1,0,0


In [25]:
# 70/30 train-test split.
# random_state pins the shuffle so that the split, the fitted model and the
# reported score are identical on every Restart & Run All; without it each run
# draws a different split and the numbers shown in the notebook cannot be reproduced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [26]:
# Training features as (rows, columns).
X_train.shape

(10499, 18)

In [27]:
# Test features as (rows, columns).
X_test.shape

(4500, 18)

In [28]:
# Preview the training features before any rescaling.
X_train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,low,medium,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
3486,0.85,0.67,4,228,4,0,0,1,0,0,0,0,0,0,0,0,1,0
3950,0.59,0.69,3,186,3,0,0,1,0,0,0,0,0,0,0,0,0,1
2221,0.78,0.81,4,232,3,0,0,0,1,0,0,0,0,0,0,1,0,0
10267,0.93,0.5,2,135,3,0,0,0,0,0,0,0,0,0,0,0,0,1
547,0.37,0.5,2,141,3,0,0,1,0,0,0,0,0,0,0,1,0,0


In [29]:
# Number of training labels; should match the row count of X_train.
y_train.shape

(10499,)

In [30]:
# hr is the same object as X, so this lists the dtypes of the full feature matrix.
hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 18 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   promotion_last_5years  14999 non-null  int64  
 7   low                    14999 non-null  uint8  
 8   medium                 14999 non-null  uint8  
 9   RandD                  14999 non-null  uint8  
 10  accounting             14999 non-null  uint8  
 11  hr                     14999 non-null  uint8  
 12  management             14999 non-null  uint8  
 13  marketing              14999 non-null  uint8  
 14  product_mng            14999 non-null  uint8  
 15  sa

### Rescaling

- Min-max scaling (MinMaxScaler rescales each selected feature to the [0, 1] range)

##### Steps:
- Import MinMaxScaler from sklearn.preprocessing
- Instantiate (create an object of) the scaler
- List the columns you wish to scale (num_vars = ['', '', ''])
- Fit on the training data and transform it with scaler.fit_transform(X_train[num_vars]); transform the test data with scaler.transform(X_test[num_vars])

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [32]:
# Create the MinMaxScaler instance used to rescale the numeric features.
scaler = MinMaxScaler()

In [33]:
# Min-max scale the continuous columns only; the binary indicator columns are left untouched.
# The scaler is fit on the training split and then re-used, unchanged, on the test split,
# so no test-set statistics leak into training.
num_vars = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'average_montly_hours',
    'time_spend_company',
]

# Work on explicit copies: the frames returned by train_test_split can be flagged by
# pandas as derived from X, and assigning columns into them then raises
# SettingWithCopyWarning (hidden in this notebook by the global warnings filter).
X_train = X_train.copy()
X_test = X_test.copy()

X_train[num_vars] = scaler.fit_transform(X_train[num_vars])
X_test[num_vars] = scaler.transform(X_test[num_vars])

In [34]:
# Inspect the training features after scaling: the columns in num_vars now lie in [0, 1].
X_train.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,low,medium,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
3486,0.835165,0.484375,0.4,0.616822,0.25,0,0,1,0,0,0,0,0,0,0,0,1,0
3950,0.549451,0.515625,0.2,0.420561,0.125,0,0,1,0,0,0,0,0,0,0,0,0,1
2221,0.758242,0.703125,0.4,0.635514,0.125,0,0,0,1,0,0,0,0,0,0,1,0,0
10267,0.923077,0.21875,0.0,0.182243,0.125,0,0,0,0,0,0,0,0,0,0,0,0,1
547,0.307692,0.21875,0.0,0.21028,0.125,0,0,1,0,0,0,0,0,0,0,1,0,0


### Model Making

In [35]:
# Logistic regression classifier from scikit-learn.
from sklearn.linear_model import LogisticRegression

# Build the estimator with its default hyper-parameters and train it on the
# training split. fit() returns the estimator itself, so `model` and `lr`
# refer to the same fitted object.
lr = LogisticRegression()
lr.fit(X_train, y_train)
model = lr

In [36]:
# Predicted class labels for the training rows.
y_train_pred = model.predict(X_train)

In [37]:
# Predicted class labels for the held-out test rows.
y_test_pred = model.predict(X_test)
y_test_pred

array([1, 1, 1, ..., 0, 0, 1], dtype=int64)

In [38]:
# First few true labels of the test split, for comparison with the predictions.
y_test.head()

14339    1
1159     1
944      1
8224     0
13134    0
Name: Attrition, dtype: int64

In [39]:
# Mean accuracy of the classifier on the test split.
# NOTE(review): 11428 of the 14999 employees stayed, so always predicting "stayed"
# would score about 0.76; judge this number against that baseline.
model.score(X_test,y_test)

0.7911111111111111