In [1]:
## import required packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

# UDF
from function import metrices_calculation

In [2]:
df = pd.read_csv('Employee-Attrition.csv')
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


### columns to be dropped
    'EmployeeNumber' => this is nominal data 
    'StandardHours' => zero variance
    'EmployeeCount'=> zero variance
    'DailyRate', 'HourlyRate' & 'MonthlyRate' should be correlated (further analysis required)
    'Over18' => zero variance

In [3]:
# dropping columns
df.drop(['StandardHours','EmployeeCount','Over18'], axis = 1, inplace = True)

In [4]:
df['Attrition'] = df.Attrition.map({'Yes':1,'No':0})
df = pd.get_dummies(df) ## OHE encoding

In [5]:
df.shape

(1470, 53)

In [6]:
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,1,1102,1,2,1,2,94,3,2,...,0,0,0,1,0,0,0,1,0,1
1,49,0,279,8,1,2,3,61,2,2,...,0,0,1,0,0,0,1,0,1,0
2,37,1,1373,2,2,4,4,92,2,1,...,0,0,0,0,0,0,0,1,0,1
3,33,0,1392,3,4,5,4,56,3,1,...,0,0,1,0,0,0,1,0,0,1
4,27,0,591,2,1,7,1,40,3,1,...,0,0,0,0,0,0,1,0,1,0


In [7]:
## import sklearn functions
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, RidgeClassifierCV,RidgeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [8]:
### data splitting into features & target
X = df.drop(['EmployeeNumber','Attrition'], axis = 1)
y = df.Attrition

In [9]:
X.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,...,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,OverTime_No,OverTime_Yes
0,41,1102,1,2,2,94,3,2,4,5993,...,0,0,0,1,0,0,0,1,0,1
1,49,279,8,1,3,61,2,2,2,5130,...,0,0,1,0,0,0,1,0,1,0
2,37,1373,2,2,4,92,2,1,3,2090,...,0,0,0,0,0,0,0,1,0,1
3,33,1392,3,4,4,56,3,1,3,2909,...,0,0,1,0,0,0,1,0,0,1
4,27,591,2,1,1,40,3,1,2,3468,...,0,0,0,0,0,0,1,0,1,0


In [10]:
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Attrition, dtype: int64

### 
1. Why scaling is important?
2. Is scaling important always?
3. should it be done on features or target or both?

In [11]:
## scaling data
scaler = StandardScaler()
scaler.fit(X, y)
X_ = scaler.transform(X)

In [12]:
print(X_[:2])

[[ 0.4463504   0.74252653 -1.01090934 -0.89168825 -0.66053067  1.38313827
   0.37967213 -0.05778755  1.15325359 -0.10834951  0.72601994  2.12513592
  -1.1505541  -0.42623002 -1.58417824 -0.93201439 -0.42164246 -2.17198183
  -2.49382042 -0.16461311 -0.0632959  -0.67914568  0.24583399 -0.33709993
  -0.48185865  0.63984058 -0.21160368 -1.37405084  1.51524392 -0.13678823
   1.19404463 -0.34825488 -0.67914108 -0.24305927 -0.31409347  1.22474487
  -1.22474487 -0.3127846  -0.1914977  -0.46246387 -0.27305934 -0.33080804
  -0.23990406 -0.49787324  1.87328654 -0.24462499 -0.53487311 -0.91892141
   1.45864991 -1.59174553  1.59174553]
 [ 1.32236521 -1.2977746  -0.14714972 -1.86842575  0.25462493 -0.24067677
  -1.02616674 -0.05778755 -0.66085318 -0.29171859  1.48887614 -0.67804939
   2.12930601  2.34615106  1.19143799  0.24198831 -0.1645114   0.15570708
   0.33809616  0.48850773  0.76499762 -0.36871529  0.80654148 -0.33709993
   2.07529738 -1.5628893  -0.21160368  0.72777511 -0.65995975 -0.13678823

In [13]:
### splitting data into train and validation/test sets by 80:20 ratio
X_train, X_test, y_train, y_test = train_test_split(X_,y,test_size = 0.2, stratify = y, random_state = 42)
print(X_train.shape,y_train.shape )
print(X_test.shape,y_test.shape)

(1176, 51) (1176,)
(294, 51) (294,)


In [14]:
### K-fold cross validation
# 100, K=5
# S1 S2 S3 S4 S5
# 20 20 20 20 20

# 1st iter: S1-> validation set: [S2:S5]-> training set => Accuracy1 = 50
# 2nd iter: S2-> validation set: [Rem]-> training set => A2 = 90
# 3rd iter: S3-> validation set: [Rem]-> training set => A3 = 89
# ...
# 5th iter: S5-> validation set: [Rem]-> training set => A5 = 78

In [15]:
# #### limitations of deep layer
#      - pre-requisite of large dataset 
    
    
# 1. simplity (model complexity is less) 
#  - Linear/Logistic regression (pros:simple model, works well on less data ;cons: starts to perform well with high number of categorical/non-linear data, fails with outliers)
#  - Tree based models(CART, RandomForest/(GradientBoosting/XgBoost)) (Pros:works with non-linearity date,works with outliers ;cons: overfitting,highly instable)
#  - SVM (Pros:can work on mix/inseperable/non-linear/linear data; Drawbacks: computationally expensive)
#  - KNN 
#  - NaiveBased 
#  - DL (Pros: Gives highest accuracy, works with unstructure data; cons: pre-requisite of large dataset )       

In [16]:
# ### features: 10
# 10=> LR: 78; RF: 80
# 9=>  LR: 77; RF: 80 +- large_change   (78)      

In [17]:
# prunning 
# l3: 6+5+2 = 13 

In [18]:
### How to judge our accuracy??

In [19]:
model_estimators = [LogisticRegression, RidgeClassifier, RandomForestClassifier, GradientBoostingClassifier, SVC]
model_names = ['LogisticRegression','RidgeClassifier', 'RandomForest','GradientBoosting','SVC']

In [20]:
### 
def model(est, name, trainX, trainy):
    scores = cross_val_score(estimator = est(), X = trainX, y = trainy, scoring = 'accuracy', cv = 5, n_jobs  = -1)
    print(scores)
    print('{}: Average scores: {}, standard deviation: {}'.format(name, round(np.mean(scores),5),round(np.std(scores),5)))
    

In [21]:
for n,m in zip(model_names, model_estimators):
    model(m,n, X_train,y_train)

[0.86016949 0.88085106 0.89361702 0.88510638 0.89361702]
LogisticRegression: Average scores: 0.88267, standard deviation: 0.01229
[0.86016949 0.86382979 0.87234043 0.86382979 0.87234043]
RidgeClassifier: Average scores: 0.8665, standard deviation: 0.00495
[0.84745763 0.85957447 0.85106383 0.84680851 0.84255319]
RandomForest: Average scores: 0.84949, standard deviation: 0.00572
[0.84322034 0.88510638 0.86382979 0.88085106 0.84680851]
GradientBoosting: Average scores: 0.86396, standard deviation: 0.01707
[0.86016949 0.85957447 0.86808511 0.85957447 0.87659574]
SVC: Average scores: 0.8648, standard deviation: 0.00672


In [22]:
# High average score|least standard deviation

In [23]:
### 
def model_(est, name, trainX, trainy, testX, testy):
    model_ = est().fit(trainX,trainy) 
    ypred_train = model_.predict(trainX)
    ypred_test = model_.predict(testX)
#     precision, recall, fscore, = precision_recall_fscore_support(trainy, ypred_train)
    print('{}:'.format(name))
    print('Train: precision={} recall={}, Test: precision={} recall={}'.format(round(precision_score(trainy, ypred_train)*100,3),
                                                                                 round(recall_score(trainy, ypred_train)*100,3),
                                                                                  round(precision_score(testy, ypred_test)*100,3),
                                                                                  round(recall_score(testy, ypred_test)*100,3)))
          
    

In [24]:
for n,m in  zip(model_names, model_estimators):
    model_(m,n, X_train,y_train,X_test,y_test )

LogisticRegression:
Train: precision=80.672 recall=50.526, Test: precision=61.538 recall=34.043
RidgeClassifier:
Train: precision=88.0 recall=23.158, Test: precision=76.923 recall=21.277
RandomForest:
Train: precision=100.0 recall=92.105, Test: precision=60.0 recall=12.766
GradientBoosting:
Train: precision=100.0 recall=75.263, Test: precision=71.429 recall=21.277
SVC:
Train: precision=98.901 recall=47.368, Test: precision=90.909 recall=21.277


<!-- ![](attachment:image.png) -->
#### precision,recall
<img src='screenshot.jpg' style="width:500px;height:350px" />

In [25]:
#### GridSearchCV is used for hyper-parameters tunning

In [26]:
RandomForestClassifier()

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators='warn',
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

#### hyper-parameters of random forest model are: n_estimators,max_depth,min_samples_split,max_leaf_nodes,max_features


In [27]:
## model to used (assumed)
model_rf = RandomForestClassifier(oob_score=True, random_state=42, min_samples_split = 5,)

## parameters to be tunned
params = {'n_estimators':range(100,200,5),
         'max_depth':range(2,5,1)}
gs = GridSearchCV(estimator=model_rf, param_grid=params,scoring='accuracy', n_jobs=-1, cv = 3)
gs.fit(X_train, y_train)
tunned_results = pd.DataFrame(gs.cv_results_).sort_values(by = 'mean_test_score', ascending = False )

In [28]:
tunned_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
53,0.647679,0.007693,0.035699,0.000845,4,165,"{'max_depth': 4, 'n_estimators': 165}",0.857506,0.857143,0.846547,0.853741,0.005079,1
40,0.515715,0.002814,0.029017,0.008964,4,100,"{'max_depth': 4, 'n_estimators': 100}",0.857506,0.857143,0.846547,0.853741,0.005079,1
41,0.54819,0.008097,0.028707,0.004075,4,105,"{'max_depth': 4, 'n_estimators': 105}",0.857506,0.857143,0.846547,0.853741,0.005079,1
59,0.584877,0.048149,0.024595,0.004549,4,195,"{'max_depth': 4, 'n_estimators': 195}",0.854962,0.857143,0.846547,0.852891,0.004565,4
58,0.686763,0.023959,0.025776,0.002429,4,190,"{'max_depth': 4, 'n_estimators': 190}",0.854962,0.857143,0.846547,0.852891,0.004565,4
57,0.728501,0.006968,0.041871,0.004815,4,185,"{'max_depth': 4, 'n_estimators': 185}",0.854962,0.857143,0.846547,0.852891,0.004565,4
56,0.738983,0.009331,0.038997,0.001279,4,180,"{'max_depth': 4, 'n_estimators': 180}",0.854962,0.857143,0.846547,0.852891,0.004565,4
55,0.728248,0.005535,0.034139,0.001668,4,175,"{'max_depth': 4, 'n_estimators': 175}",0.854962,0.857143,0.846547,0.852891,0.004565,4
54,0.682058,0.011955,0.0443,0.004203,4,170,"{'max_depth': 4, 'n_estimators': 170}",0.854962,0.857143,0.846547,0.852891,0.004565,4
52,0.638329,0.004041,0.03277,0.001054,4,160,"{'max_depth': 4, 'n_estimators': 160}",0.857506,0.857143,0.84399,0.852891,0.006284,4


In [29]:
# gs.best_estimator_ # returns estimators with best parameters sets
# gs.best_params_  # displays best parameters values
# gs.best_score_ # display best score as per scoring parameter in GridSearchCV

#### All the above steps can be pipelined to excute in sequence once the model is decided by using PipeLine

In [30]:
from sklearn.pipeline import Pipeline

In [31]:
pipline = Pipeline(steps = [('scaler',StandardScaler()),
                 ('model',RandomForestClassifier())])
pipline.fit(X_train, y_train)
pred_tr = pipline.predict(X_train)
pred_te = pipline.predict(X_test)

In [32]:
## train metrics
metrices_calculation(y_train, pred_tr)

print('\n')

## test metrics
metrices_calculation(y_test, pred_te)

Accuracy:0.9787414965986394
 precision:1.0
 recall:0.868421052631579
 confusion matrix:[[986   0]
 [ 25 165]]


Accuracy:0.8401360544217688
 precision:0.5
 recall:0.1702127659574468
 confusion matrix:[[239   8]
 [ 39   8]]


In [33]:
pipline = Pipeline(steps = [('scaler',Normalizer()),
                 ('model',RandomForestClassifier())])
pipline.fit(X_train, y_train)
pred_tr = pipline.predict(X_train)
pred_te = pipline.predict(X_test)

In [34]:
## train metrics
metrices_calculation(y_train, pred_tr)

print('\n')

## test metrics
metrices_calculation(y_test, pred_te)

Accuracy:0.9812925170068028
 precision:1.0
 recall:0.8842105263157894
 confusion matrix:[[986   0]
 [ 22 168]]


Accuracy:0.8435374149659864
 precision:0.5384615384615384
 recall:0.14893617021276595
 confusion matrix:[[241   6]
 [ 40   7]]


In [35]:
pipline = Pipeline(steps = [('scaler',Normalizer()),
                 ('model',LogisticRegression())])
pipline.fit(X_train, y_train)
pred_tr = pipline.predict(X_train)
pred_te = pipline.predict(X_test)

In [36]:
## train metrics
metrices_calculation(y_train, pred_tr)

print('\n')

## test metrics
metrices_calculation(y_test, pred_te)

Accuracy:0.8826530612244898
 precision:0.8939393939393939
 recall:0.3105263157894737
 confusion matrix:[[979   7]
 [131  59]]


Accuracy:0.8707482993197279
 precision:0.7647058823529411
 recall:0.2765957446808511
 confusion matrix:[[243   4]
 [ 34  13]]


In [37]:
pipline = Pipeline(steps = [('scaler',Normalizer()),
                 ('model',RidgeClassifier())])
pipline.fit(X_train, y_train)
pred_tr = pipline.predict(X_train)
pred_te = pipline.predict(X_test)

In [38]:
## train metrics
metrices_calculation(y_train, pred_tr)

print('\n')

## test metrics
metrices_calculation(y_test, pred_te)

Accuracy:0.8690476190476191
 precision:0.9285714285714286
 recall:0.20526315789473684
 confusion matrix:[[983   3]
 [151  39]]


Accuracy:0.8707482993197279
 precision:0.9090909090909091
 recall:0.2127659574468085
 confusion matrix:[[246   1]
 [ 37  10]]
