In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer 
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [3]:
df = pd.read_csv('./data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [4]:
df["Over18"].value_counts()

Y    1470
Name: Over18, dtype: int64

In [5]:
df.drop(['OverTime','Over18','EmployeeCount', 'EmployeeNumber', 'StandardHours'], axis="columns", inplace=True)
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,...,3,4,1,6,3,3,2,2,2,2


In [6]:
X = df.drop("Attrition",axis=1)
y= df["Attrition"]

In [7]:
X

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,Female,94,...,3,1,0,8,0,1,6,4,0,5
1,49,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,Male,61,...,4,4,1,10,3,3,10,7,1,7
2,37,Travel_Rarely,1373,Research & Development,2,2,Other,4,Male,92,...,3,2,0,7,3,3,0,0,0,0
3,33,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,Female,56,...,3,3,0,8,3,3,8,7,3,0
4,27,Travel_Rarely,591,Research & Development,2,1,Medical,1,Male,40,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,Travel_Frequently,884,Research & Development,23,2,Medical,3,Male,41,...,3,3,1,17,3,3,5,2,0,3
1466,39,Travel_Rarely,613,Research & Development,6,1,Medical,4,Male,42,...,3,1,1,9,5,3,7,7,1,7
1467,27,Travel_Rarely,155,Research & Development,4,3,Life Sciences,2,Male,87,...,4,2,1,6,0,3,6,2,0,3
1468,49,Travel_Frequently,1023,Sales,2,3,Medical,4,Male,63,...,3,4,0,17,3,2,9,6,0,8


In [8]:
y

0       Yes
1        No
2       Yes
3        No
4        No
       ... 
1465     No
1466     No
1467     No
1468     No
1469     No
Name: Attrition, Length: 1470, dtype: object

In [9]:
X.columns

Index(['Age', 'BusinessTravel', 'DailyRate', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EnvironmentSatisfaction', 'Gender',
       'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobRole',
       'JobSatisfaction', 'MaritalStatus', 'MonthlyIncome', 'MonthlyRate',
       'NumCompaniesWorked', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StockOptionLevel', 'TotalWorkingYears',
       'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
       'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [10]:
X.shape

(1470, 29)

In [11]:
X.select_dtypes(include="O")

Unnamed: 0,BusinessTravel,Department,EducationField,Gender,JobRole,MaritalStatus
0,Travel_Rarely,Sales,Life Sciences,Female,Sales Executive,Single
1,Travel_Frequently,Research & Development,Life Sciences,Male,Research Scientist,Married
2,Travel_Rarely,Research & Development,Other,Male,Laboratory Technician,Single
3,Travel_Frequently,Research & Development,Life Sciences,Female,Research Scientist,Married
4,Travel_Rarely,Research & Development,Medical,Male,Laboratory Technician,Married
...,...,...,...,...,...,...
1465,Travel_Frequently,Research & Development,Medical,Male,Laboratory Technician,Married
1466,Travel_Rarely,Research & Development,Medical,Male,Healthcare Representative,Married
1467,Travel_Rarely,Research & Development,Life Sciences,Male,Manufacturing Director,Married
1468,Travel_Frequently,Sales,Medical,Male,Sales Executive,Married


In [12]:
X.select_dtypes(include="O").columns

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus'],
      dtype='object')

In [13]:
categorical_features = list(X.select_dtypes(include="O").columns)
categorical_features

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus']

In [14]:
X.select_dtypes(include="number")

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1102,1,2,2,94,3,2,4,5993,...,3,1,0,8,0,1,6,4,0,5
1,49,279,8,1,3,61,2,2,2,5130,...,4,4,1,10,3,3,10,7,1,7
2,37,1373,2,2,4,92,2,1,3,2090,...,3,2,0,7,3,3,0,0,0,0
3,33,1392,3,4,4,56,3,1,3,2909,...,3,3,0,8,3,3,8,7,3,0
4,27,591,2,1,1,40,3,1,2,3468,...,3,4,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1465,36,884,23,2,3,41,4,2,4,2571,...,3,3,1,17,3,3,5,2,0,3
1466,39,613,6,1,4,42,2,3,1,9991,...,3,1,1,9,5,3,7,7,1,7
1467,27,155,4,3,2,87,4,2,2,6142,...,4,2,1,6,0,3,6,2,0,3
1468,49,1023,2,3,4,63,2,2,2,5390,...,3,4,0,17,3,2,9,6,0,8


In [15]:
X.select_dtypes(include="number").columns

Index(['Age', 'DailyRate', 'DistanceFromHome', 'Education',
       'EnvironmentSatisfaction', 'HourlyRate', 'JobInvolvement', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [17]:
numerical_features = list(X.select_dtypes(include="number").columns)
numerical_features

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [18]:
train_data, test_data, train_y, test_y = train_test_split(X, y,
                                                   random_state = 42, test_size = 0.30,
                                                   stratify = y)

In [19]:
X.shape

(1470, 29)

In [20]:
y.shape

(1470,)

In [21]:
train_data.shape

(1029, 29)

In [22]:
test_data.shape

(441, 29)

In [23]:
train_y.shape

(1029,)

In [24]:
test_y.shape

(441,)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=42,
                                                    stratify=y)  

In [26]:
X_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
853,19,Travel_Rarely,645,Research & Development,9,2,Life Sciences,3,Male,54,...,4,3,0,1,4,3,1,1,0,0
435,33,Travel_Rarely,1277,Research & Development,15,1,Medical,2,Male,56,...,3,4,0,15,2,4,7,6,7,7
587,52,Travel_Rarely,1325,Research & Development,11,4,Life Sciences,4,Female,82,...,4,2,1,9,3,3,5,2,1,4
1170,27,Travel_Frequently,591,Research & Development,2,3,Medical,4,Male,87,...,3,3,0,6,0,2,4,2,1,2
159,34,Travel_Frequently,303,Sales,2,4,Marketing,3,Female,75,...,3,4,1,6,3,3,4,3,1,2


In [27]:
ohe_encoder = OneHotEncoder()
ord_encoder = OrdinalEncoder()

In [28]:
ct = make_column_transformer( 
            (ord_encoder,["BusinessTravel"]),
            (ohe_encoder, ["Department","EducationField", "Gender","JobRole", "MaritalStatus"]),
            remainder = 'passthrough')

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ('Column_Transformations', ct),
    ('Logistic Regression', LogisticRegression()),
])

In [30]:
X_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
853,19,Travel_Rarely,645,Research & Development,9,2,Life Sciences,3,Male,54,...,4,3,0,1,4,3,1,1,0,0
435,33,Travel_Rarely,1277,Research & Development,15,1,Medical,2,Male,56,...,3,4,0,15,2,4,7,6,7,7
587,52,Travel_Rarely,1325,Research & Development,11,4,Life Sciences,4,Female,82,...,4,2,1,9,3,3,5,2,1,4
1170,27,Travel_Frequently,591,Research & Development,2,3,Medical,4,Male,87,...,3,3,0,6,0,2,4,2,1,2
159,34,Travel_Frequently,303,Sales,2,4,Marketing,3,Female,75,...,3,4,1,6,3,3,4,3,1,2


In [31]:
y.head()

0    Yes
1     No
2    Yes
3     No
4     No
Name: Attrition, dtype: object

In [32]:
pipe.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
y_pred = pipe.predict(X_test)
accuracy_score(y_test, y_pred)

0.8412698412698413

In [34]:
y_pred = pipe.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred)

0.6650171298058623

In [35]:
X_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
853,19,Travel_Rarely,645,Research & Development,9,2,Life Sciences,3,Male,54,...,4,3,0,1,4,3,1,1,0,0
435,33,Travel_Rarely,1277,Research & Development,15,1,Medical,2,Male,56,...,3,4,0,15,2,4,7,6,7,7
587,52,Travel_Rarely,1325,Research & Development,11,4,Life Sciences,4,Female,82,...,4,2,1,9,3,3,5,2,1,4
1170,27,Travel_Frequently,591,Research & Development,2,3,Medical,4,Male,87,...,3,3,0,6,0,2,4,2,1,2
159,34,Travel_Frequently,303,Sales,2,4,Marketing,3,Female,75,...,3,4,1,6,3,3,4,3,1,2


In [36]:
train_probas = pd.DataFrame(pipe.predict_proba(X_train))
train_probas

Unnamed: 0,0,1
0,0.657474,0.342526
1,0.868217,0.131783
2,0.859321,0.140679
3,0.834872,0.165128
4,0.812314,0.187686
...,...,...
1024,0.831317,0.168683
1025,0.911079,0.088921
1026,0.739918,0.260082
1027,0.840986,0.159014


In [37]:
train_probas.columns = ['no', 'yes']

In [38]:
train_probas

Unnamed: 0,no,yes
0,0.657474,0.342526
1,0.868217,0.131783
2,0.859321,0.140679
3,0.834872,0.165128
4,0.812314,0.187686
...,...,...
1024,0.831317,0.168683
1025,0.911079,0.088921
1026,0.739918,0.260082
1027,0.840986,0.159014


In [39]:
test_probas = pd.DataFrame(pipe.predict_proba(X_test))
test_probas.columns = ['no', 'yes']
test_probas

Unnamed: 0,no,yes
0,0.853423,0.146577
1,0.648445,0.351555
2,0.615813,0.384187
3,0.808127,0.191873
4,0.892689,0.107311
...,...,...
436,0.832132,0.167868
437,0.851662,0.148338
438,0.916526,0.083474
439,0.764707,0.235293


In [40]:
X_train

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
853,19,Travel_Rarely,645,Research & Development,9,2,Life Sciences,3,Male,54,...,4,3,0,1,4,3,1,1,0,0
435,33,Travel_Rarely,1277,Research & Development,15,1,Medical,2,Male,56,...,3,4,0,15,2,4,7,6,7,7
587,52,Travel_Rarely,1325,Research & Development,11,4,Life Sciences,4,Female,82,...,4,2,1,9,3,3,5,2,1,4
1170,27,Travel_Frequently,591,Research & Development,2,3,Medical,4,Male,87,...,3,3,0,6,0,2,4,2,1,2
159,34,Travel_Frequently,303,Sales,2,4,Marketing,3,Female,75,...,3,4,1,6,3,3,4,3,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,46,Non-Travel,1144,Research & Development,7,4,Medical,3,Female,30,...,3,3,0,7,2,4,1,0,0,0
963,38,Travel_Rarely,1009,Sales,2,2,Life Sciences,2,Female,31,...,3,4,1,11,3,3,7,7,1,7
734,22,Travel_Rarely,217,Research & Development,8,1,Life Sciences,2,Male,94,...,3,1,1,4,3,2,4,3,1,1
1315,36,Travel_Rarely,430,Research & Development,2,4,Other,4,Female,73,...,4,4,1,15,2,3,1,0,0,0


In [41]:
y_train

853      No
435     Yes
587      No
1170     No
159      No
       ... 
365      No
963      No
734      No
1315     No
1292     No
Name: Attrition, Length: 1029, dtype: object

In [42]:
train_data = X_train.copy()
train_data.reset_index(inplace=True, drop=True)
train_data['Attrition'] = ['no' if x == 'No' else 'yes' for x in y_train]
train_data

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,19,Travel_Rarely,645,Research & Development,9,2,Life Sciences,3,Male,54,...,3,0,1,4,3,1,1,0,0,no
1,33,Travel_Rarely,1277,Research & Development,15,1,Medical,2,Male,56,...,4,0,15,2,4,7,6,7,7,yes
2,52,Travel_Rarely,1325,Research & Development,11,4,Life Sciences,4,Female,82,...,2,1,9,3,3,5,2,1,4,no
3,27,Travel_Frequently,591,Research & Development,2,3,Medical,4,Male,87,...,3,0,6,0,2,4,2,1,2,no
4,34,Travel_Frequently,303,Sales,2,4,Marketing,3,Female,75,...,4,1,6,3,3,4,3,1,2,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1024,46,Non-Travel,1144,Research & Development,7,4,Medical,3,Female,30,...,3,0,7,2,4,1,0,0,0,no
1025,38,Travel_Rarely,1009,Sales,2,2,Life Sciences,2,Female,31,...,4,1,11,3,3,7,7,1,7,no
1026,22,Travel_Rarely,217,Research & Development,8,1,Life Sciences,2,Male,94,...,1,1,4,3,2,4,3,1,1,no
1027,36,Travel_Rarely,430,Research & Development,2,4,Other,4,Female,73,...,4,1,15,2,3,1,0,0,0,no


In [43]:
merged_train = pd.concat([train_data, train_probas], axis = 1)
merged_train.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,no,yes
0,19,Travel_Rarely,645,Research & Development,9,2,Life Sciences,3,Male,54,...,1,4,3,1,1,0,0,no,0.657474,0.342526
1,33,Travel_Rarely,1277,Research & Development,15,1,Medical,2,Male,56,...,15,2,4,7,6,7,7,yes,0.868217,0.131783
2,52,Travel_Rarely,1325,Research & Development,11,4,Life Sciences,4,Female,82,...,9,3,3,5,2,1,4,no,0.859321,0.140679
3,27,Travel_Frequently,591,Research & Development,2,3,Medical,4,Male,87,...,6,0,2,4,2,1,2,no,0.834872,0.165128
4,34,Travel_Frequently,303,Sales,2,4,Marketing,3,Female,75,...,6,3,3,4,3,1,2,no,0.812314,0.187686


In [44]:
test_data = X_test.copy()
test_data.reset_index(inplace=True, drop=True)
test_data['Attrition'] = ['no' if x == 'No' else 'yes' for x in y_test]
test_data

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
0,25,Travel_Rarely,891,Sales,4,2,Life Sciences,2,Female,99,...,2,0,5,3,3,5,4,1,3,no
1,37,Travel_Rarely,367,Research & Development,25,2,Medical,3,Female,52,...,3,2,9,2,3,6,2,1,3,no
2,35,Travel_Rarely,538,Research & Development,25,2,Other,1,Male,54,...,4,0,9,3,3,3,2,0,2,no
3,31,Travel_Rarely,688,Sales,7,3,Life Sciences,3,Male,44,...,3,1,10,3,2,5,4,0,1,no
4,34,Travel_Rarely,1480,Sales,4,3,Life Sciences,3,Male,64,...,4,3,9,3,3,5,3,1,0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,38,Travel_Frequently,1490,Research & Development,2,2,Life Sciences,4,Male,42,...,3,1,1,3,3,1,0,0,0,no
437,40,Non-Travel,1142,Research & Development,8,2,Life Sciences,4,Male,72,...,3,0,8,2,3,2,2,2,2,no
438,33,Travel_Frequently,1076,Research & Development,3,3,Life Sciences,1,Male,70,...,1,0,10,3,3,10,8,9,7,yes
439,29,Travel_Rarely,1092,Research & Development,1,4,Medical,1,Male,36,...,2,3,4,3,4,2,2,2,2,yes


In [45]:
test_data.tail()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition
436,38,Travel_Frequently,1490,Research & Development,2,2,Life Sciences,4,Male,42,...,3,1,1,3,3,1,0,0,0,no
437,40,Non-Travel,1142,Research & Development,8,2,Life Sciences,4,Male,72,...,3,0,8,2,3,2,2,2,2,no
438,33,Travel_Frequently,1076,Research & Development,3,3,Life Sciences,1,Male,70,...,1,0,10,3,3,10,8,9,7,yes
439,29,Travel_Rarely,1092,Research & Development,1,4,Medical,1,Male,36,...,2,3,4,3,4,2,2,2,2,yes
440,45,Travel_Rarely,1385,Research & Development,20,2,Medical,3,Male,79,...,2,0,21,2,3,20,7,4,10,no


In [46]:
test_probas.tail()

Unnamed: 0,no,yes
436,0.832132,0.167868
437,0.851662,0.148338
438,0.916526,0.083474
439,0.764707,0.235293
440,0.964593,0.035407


In [47]:
merged_test = pd.concat([test_data, test_probas], axis = 1)
merged_test

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,no,yes
0,25,Travel_Rarely,891,Sales,4,2,Life Sciences,2,Female,99,...,5,3,3,5,4,1,3,no,0.853423,0.146577
1,37,Travel_Rarely,367,Research & Development,25,2,Medical,3,Female,52,...,9,2,3,6,2,1,3,no,0.648445,0.351555
2,35,Travel_Rarely,538,Research & Development,25,2,Other,1,Male,54,...,9,3,3,3,2,0,2,no,0.615813,0.384187
3,31,Travel_Rarely,688,Sales,7,3,Life Sciences,3,Male,44,...,10,3,2,5,4,0,1,no,0.808127,0.191873
4,34,Travel_Rarely,1480,Sales,4,3,Life Sciences,3,Male,64,...,9,3,3,5,3,1,0,no,0.892689,0.107311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
436,38,Travel_Frequently,1490,Research & Development,2,2,Life Sciences,4,Male,42,...,1,3,3,1,0,0,0,no,0.832132,0.167868
437,40,Non-Travel,1142,Research & Development,8,2,Life Sciences,4,Male,72,...,8,2,3,2,2,2,2,no,0.851662,0.148338
438,33,Travel_Frequently,1076,Research & Development,3,3,Life Sciences,1,Male,70,...,10,3,3,10,8,9,7,yes,0.916526,0.083474
439,29,Travel_Rarely,1092,Research & Development,1,4,Medical,1,Male,36,...,4,3,4,2,2,2,2,yes,0.764707,0.235293


In [48]:
from evidently.dashboard import Dashboard
from evidently.dashboard.tabs import ClassificationPerformanceTab
from evidently.pipeline.column_mapping import ColumnMapping

In [49]:
numerical_features

['Age',
 'DailyRate',
 'DistanceFromHome',
 'Education',
 'EnvironmentSatisfaction',
 'HourlyRate',
 'JobInvolvement',
 'JobLevel',
 'JobSatisfaction',
 'MonthlyIncome',
 'MonthlyRate',
 'NumCompaniesWorked',
 'PercentSalaryHike',
 'PerformanceRating',
 'RelationshipSatisfaction',
 'StockOptionLevel',
 'TotalWorkingYears',
 'TrainingTimesLastYear',
 'WorkLifeBalance',
 'YearsAtCompany',
 'YearsInCurrentRole',
 'YearsSinceLastPromotion',
 'YearsWithCurrManager']

In [50]:
categorical_features

['BusinessTravel',
 'Department',
 'EducationField',
 'Gender',
 'JobRole',
 'MaritalStatus']

In [51]:
column_mapping = ColumnMapping()

column_mapping.target = 'Attrition'
column_mapping.prediction = ['yes','no']

column_mapping.numerical_features = numerical_features

column_mapping.categorical_features = categorical_features

In [52]:
merged_test.sample(5)

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,Gender,HourlyRate,...,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,Attrition,no,yes
137,31,Travel_Frequently,715,Sales,2,4,Other,4,Male,54,...,10,3,3,5,2,0,3,no,0.819461,0.180539
179,38,Travel_Frequently,471,Research & Development,12,3,Life Sciences,1,Male,45,...,13,3,2,4,3,1,2,no,0.82235,0.17765
431,32,Travel_Frequently,585,Research & Development,10,3,Life Sciences,1,Male,56,...,10,3,2,5,2,1,3,no,0.754041,0.245959
247,33,Travel_Rarely,832,Research & Development,5,4,Life Sciences,3,Female,63,...,2,2,2,2,2,0,2,no,0.807661,0.192339
99,37,Travel_Rarely,1192,Research & Development,5,2,Medical,4,Male,61,...,8,2,2,6,2,0,4,no,0.863389,0.136611


In [None]:
from evidently.dashboard.tabs import ProbClassificationPerformanceTab
dashboard = Dashboard(tabs=[ProbClassificationPerformanceTab()])
dashboard.calculate(merged_train, merged_test, column_mapping = column_mapping)

In [None]:
dashboard.save('pipeline_ibm_hr_attrition_baseline_performance.html')

## 2nd approach (Without using Column_Transformer and Pipeline)

In [None]:
df = pd.read_csv('./data/WA_Fn-UseC_-HR-Employee-Attrition.csv')
df.head()

In [None]:
df.drop(['Over18','EmployeeCount', 'EmployeeNumber', 'StandardHours'], axis="columns", inplace=True)
df["Attrition"] = df["Attrition"].apply(lambda x: 1 if x == "Yes" else 0)

In [None]:
df2 = df.copy(deep = True)

X = df2.drop("Attrition",axis=1)
y= df2["Attrition"]

categorical_features = list(X.select_dtypes(include="O").columns)
numerical_features = list(X.select_dtypes(include="number").columns)

target_name = 'Attrition'

X['Gender_Indicator'] = X.Gender.apply(
lambda x : 0 if x == 'Male' else 1 if x == 'Female' else -1)

business_travel_dummies = pd.get_dummies(X.BusinessTravel, prefix = 'b_travel')
X = pd.concat([X, business_travel_dummies], axis=1)

department_dummies = pd.get_dummies(X.Department, prefix = 'department')
X = pd.concat([X, department_dummies], axis=1)

edu_field_dummies = pd.get_dummies(X.Department, prefix = 'edu_field')
X = pd.concat([X, edu_field_dummies], axis=1)

job_role_dummies = pd.get_dummies(X.JobRole, prefix = 'job_role')
X = pd.concat([X, job_role_dummies], axis=1)

marital_dummies = pd.get_dummies(X.MaritalStatus, prefix = 'marital')
X = pd.concat([X, marital_dummies], axis=1) 

overtime_dummies = pd.get_dummies(X.OverTime, prefix = 'overtime')
X = pd.concat([X, overtime_dummies], axis=1)

X.drop(columns = categorical_features, inplace = True)



X_train, X_test, y_train, y_test = train_test_split(X, y,
                               random_state = 42, test_size = 0.30,
                               stratify = y)


In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
rf = RandomForestClassifier(n_estimators=500, n_jobs = -1, random_state = 42)

In [None]:
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
accuracy_Score = accuracy_score(y_test, y_pred)
print(accuracy_Score)

In [None]:
y_pred = rf.predict_proba(X_test)[:,1]
ROC_AUC_Score = roc_auc_score(y_test, y_pred)
print(ROC_AUC_Score)

### Performance Dashboad

In [None]:
train_probas = pd.DataFrame(rf.predict_proba(X_train))
train_probas.columns = ['no', 'yes']
train_probas

In [None]:
test_probas = pd.DataFrame(rf.predict_proba(X_test))
test_probas.columns = ['no', 'yes'] 
test_probas

In [None]:
X_train

In [None]:
X_train.reset_index(inplace=True, drop=True)
X_train

In [None]:
y_train

In [None]:
X_train['Attrition'] = ['no' if x == 0 else 'yes' for x in y_train]

In [None]:
train_merged = pd.concat([X_train, train_probas], axis = 1)
train_merged.head()

In [None]:
X_test.reset_index(inplace=True, drop=True)

In [None]:
X_test['Attrition'] = ['no' if x == 0 else 'yes' for x in y_test]

In [None]:
X_test.head()

In [None]:
test_merged = pd.concat([X_test, test_probas], axis = 1)
test_merged.head()

In [None]:
train_merged

In [None]:
column_mapping = ColumnMapping()

column_mapping.target = 'Attrition'
column_mapping.prediction = ['yes', 'no']

column_mapping.numerical_features = ['Age','DailyRate', 'DistanceFromHome', 'Education',
        'EnvironmentSatisfaction', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobSatisfaction', 'MonthlyIncome',
       'MonthlyRate', 'NumCompaniesWorked', 'PercentSalaryHike',
       'PerformanceRating', 'RelationshipSatisfaction', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager']

column_mapping.categorical_features = ['b_travel_Non-Travel',
       'b_travel_Travel_Frequently', 'b_travel_Travel_Rarely',
       'department_Human Resources', 'department_Research & Development',
       'department_Sales', 'edu_field_Human Resources',
       'edu_field_Research & Development', 'edu_field_Sales', 'Gender_Indicator',
       'job_role_Healthcare Representative', 'job_role_Human Resources',
       'job_role_Laboratory Technician', 'job_role_Manager',
       'job_role_Manufacturing Director', 'job_role_Research Director',
       'job_role_Research Scientist', 'job_role_Sales Executive',
       'job_role_Sales Representative', 'marital_Divorced', 'marital_Married',
       'marital_Single', 'overtime_No', 'overtime_Yes']

In [None]:
from evidently.dashboard.tabs import ProbClassificationPerformanceTab

In [None]:
dashboard = Dashboard(tabs=[ProbClassificationPerformanceTab()])
dashboard.calculate(train_merged, test_merged, column_mapping = column_mapping)

dashboard.save('ibm_hr_attrition_baseline_performance.html')

In [None]:
# dashboard.show()