In [86]:
import numpy as np
import pandas as pd

## Reading training data

In [87]:
attrition = pd.read_csv('WA_Fn-UseC_-HR-Employee-Attrition.csv')

In [88]:
attrition.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


### Checking for null values

In [89]:
print(attrition.isnull().any().any())

False


In [90]:
print(attrition['Attrition'].head(3).values)

['Yes' 'No' 'Yes']


### Creating speacial variable to store categorical and numerical values

In [91]:
# Empty list to store columns with categorical data
categorical = []
for col, value in attrition.iteritems():
    if value.dtype == 'object':
        categorical.append(col)

# Store the numerical columns in a list numerical
numerical = attrition.columns.difference(categorical)

In [92]:
print(numerical)

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')


In [93]:
print(categorical)

['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18', 'OverTime']


In [94]:
attrition_cat = attrition[categorical].drop(['Attrition'], axis=1)
attrition_num = attrition[numerical]

In [95]:
attrition_cat.head()

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus,Over18,OverTime
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single,Y,Yes
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married,Y,No
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single,Y,Yes
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married,Y,Yes
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married,Y,No


In [113]:
attrition_num.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,1,1,2,94,3,2,...,1,80,0,8,0,1,6,4,0,5
1,49,279,8,1,1,2,3,61,2,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1373,2,2,1,4,4,92,2,1,...,2,80,0,7,3,3,0,0,0,0
3,33,1392,3,4,1,5,4,56,3,1,...,3,80,0,8,3,3,8,7,3,0
4,27,591,2,1,1,7,1,40,3,1,...,4,80,1,6,3,3,2,2,2,2


### Encoding Categorical Features

In [97]:
attrition_cat = pd.get_dummies(attrition_cat)

In [98]:
attrition_cat.head()

Unnamed: 0,BusinessTravel_Non-Travel,BusinessTravel_Travel_Frequently,BusinessTravel_Travel_Rarely,Department_Human Resources,Department_Research & Development,Department_Sales,EducationField_Human Resources,EducationField_Life Sciences,EducationField_Marketing,EducationField_Medical,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,0,0,1,0,0,1,0,1,0,0,...,0,0,1,0,0,0,1,1,0,1
1,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
2,0,0,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
3,0,1,0,0,1,0,0,1,0,0,...,0,1,0,0,0,1,0,1,0,1
4,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,1,0,1,1,0


### Final features and target

In [99]:
attrition_final = pd.concat([attrition_num, attrition_cat], axis=1)

target_map = {'Yes':1, 'No':0}
target = attrition["Attrition"].apply(lambda x: target_map[x])

In [100]:
attrition_final.head(3)

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,41,1102,1,2,1,1,2,94,3,2,...,0,0,1,0,0,0,1,1,0,1
1,49,279,8,1,1,2,3,61,2,2,...,0,1,0,0,0,1,0,1,1,0
2,37,1373,2,2,1,4,4,92,2,1,...,0,0,0,0,0,0,1,1,0,1


In [101]:
target.head(3)

0    1
1    0
2    1
Name: Attrition, dtype: int64

### Checking for ratio of target variables in training data

In [114]:
values = target.values
count_0 = count_1 = 0
for val in values:
    if(val==0):
        count_0 = count_0 + 1
    else:
        count_1 = count_1 + 1
print("Number of No:",count_0)
print("Number of Yes:",count_1)
print("Total Number:",count_0+count_1)

Number of No: 1233
Number of Yes: 237
Total Number: 1470


### Splitting dataset

In [103]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(attrition_final,target,test_size=0.2)

### Using both Over and under sampling for balancing dataset using SMOTETomek

In [104]:
from imblearn.combine import SMOTETomek
smk = SMOTETomek()
smote_train, smote_target = smk.fit_sample(X_train,y_train)

In [119]:
print("No of Train records before sampling:",X_train.shape,y_train.shape)
print("No of Train records after sampling:",smote_train.shape,smote_target.shape)

from collections import Counter
print('Original dataset shape : {}'.format(Counter(y_train)))
print('Resampled dataset shape : {}'.format(Counter(smote_target)))

No of Train records before sampling: (1176, 55) (1176,)
No of Train records after sampling: (1860, 55) (1860,)
Original dataset shape : Counter({0: 982, 1: 194})
Resampled dataset shape : Counter({0: 930, 1: 930})


### Predicting using Naive Bayes

In [149]:
from sklearn.naive_bayes import GaussianNB
rf = GaussianNB()
rf.fit(smote_train, smote_target)
print("Fitting of Random Forest finished")

Fitting of Random Forest finished


In [150]:
rf_predictions = rf.predict(X_test)
print("Predictions finished")

Predictions finished


In [156]:
from sklearn.metrics import accuracy_score
print("Accuracy score: {}".format(accuracy_score(y_test, rf_predictions)))
print("="*80)
#print(classification_report(target_val, rf_predictions))
print(rf_predictions)
print(y_test)

Accuracy score: 0.7074829931972789
[1 1 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 0 1 0 0 1 0
 0 1 1 1 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0
 0 0 0 0 0 1 1 1 1 0 1 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0
 0 1 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 1 0 0 0 0 1 0 0 1 1 0 0 0 1 1 0 0 0 0 0
 0 0 0 1 0 1 0 0 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 0 1 0
 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 1
 1 0 1 0 0 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0 0 1 1 1 0 0 0 0 1 0 0 0 1 0 1]
1325    0
673     0
1316    0
151     0
962     0
       ..
407     0
462     0
128     0
538     0
1108    0
Name: Attrition, Length: 294, dtype: int64


## Predicting using StackingClassifier

In [109]:
from sklearn.ensemble import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

estimators = [
    ('rf', RandomForestClassifier(n_estimators=10)),
    ('gnb', GaussianNB())
]
stk_classifier = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
stk_classifier.fit(X_train, y_train)
print("Stack Classifier is trained")

Stack Classifier is trained


In [110]:
stack_predictions = stk_classifier.predict(X_test)
print("Stack Classifier Predictions are finished")

Stack Classifier Predictions are finished


In [111]:
print("Accuracy score: {}".format(accuracy_score(y_test, stack_predictions)))
print("="*80)
#print(classification_report(target_val, rf_predictions))
print(stack_predictions)
print(y_test)

Accuracy score: 0.8741496598639455
[0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
1325    0
673     0
1316    0
151     0
962     0
       ..
407     0
462     0
128     0
538     0
1108    0
Name: Attrition, Length: 294, dtype: int64
