<h2><center>IBM Employee Attrition ML Project</center></h2>
<h2><center> Feature Engineering & Model Training</center></h2>
<h4><center>Author: Akshay Pandurang Paunikar</center></h4>

In [55]:
# import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [56]:
# # set working directory
# import io
# %cd "dataset/"

In [57]:
# Import the dataset
data = pd.read_csv("dataset/IBM_HR_DATA.csv")

In [58]:
# check the first five records
# data = data.drop(["Unnamed: 0"], axis=1)
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,EducationField,EnvironmentSatisfaction,Gender,JobLevel,JobRole,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,Sales,1,Life Sciences,2,Female,2,Sales Executive,...,3,1,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,Research & Development,8,Life Sciences,3,Male,2,Research Scientist,...,4,4,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,Research & Development,2,Other,4,Male,1,Laboratory Technician,...,3,2,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,Research & Development,3,Life Sciences,4,Female,1,Research Scientist,...,3,3,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,Research & Development,2,Medical,1,Male,1,Laboratory Technician,...,3,4,1,6,3,3,2,2,2,2


In [59]:
# shape of the data
data.shape

(1470, 26)

In [60]:
# create an instance of Label Encoder, One Hot Encode, Standard Scaler
le = LabelEncoder()
one_hot = OneHotEncoder()
ss = StandardScaler()

In [61]:
# split data into independent features and target variable
X = data.drop(['Attrition'], axis=1)
y = data['Attrition']

In [62]:
# numerical features and categorical features
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

In [63]:
num_features

Index(['Age', 'DistanceFromHome', 'EnvironmentSatisfaction', 'JobLevel',
       'JobSatisfaction', 'MonthlyIncome', 'NumCompaniesWorked',
       'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'WorkLifeBalance', 'YearsAtCompany', 'YearsInCurrentRole',
       'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [64]:
cat_features

Index(['BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobRole',
       'MaritalStatus', 'OverTime'],
      dtype='object')

In [65]:
# create pipelines for categorical and numerical data
num_pipeline = Pipeline(
    steps=[
        ("Scaler", StandardScaler())
    ]
)

cat_pipeline = Pipeline(
    steps=[
        ("one hot", OneHotEncoder()),
        ("Scaler", StandardScaler(with_mean=False))
    ]
)

In [66]:
# creating preprocessor object
preprocessor = ColumnTransformer([
    ("num_pipeline", num_pipeline, num_features),
    ("cat_pipeline", cat_pipeline, cat_features)
])

In [67]:
# applying preprocesing object to features
X = preprocessor.fit_transform(X)

In [68]:
# Label encode target variable
y = le.fit_transform(y)

In [69]:
# # divide the data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=100)

print("X_train:", X_train.shape)
print("y_train:", y_train.shape)
print("X_test:", X_test.shape)
print("y_test:", y_test.shape)

X_train: (1029, 46)
y_train: (1029,)
X_test: (441, 46)
y_test: (441,)


### Model Training

In [70]:
# Import required libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [71]:
# Create an Evaluate Function to give all metrics after model Training
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    confusionmatrix = confusion_matrix(true, predicted)    
    classificationreport = classification_report(true, predicted)
    return accuracy, confusionmatrix, classificationreport

In [72]:
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'Gradient Boosting Classifier': GradientBoostingClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Support Vector Classifier': SVC(),
    'Gaussian Naive Bayes': GaussianNB(),
    'K-Neighbors Classifier': KNeighborsClassifier(),
    'CatBoost Classifier': CatBoostClassifier(verbose=False),
    'XGBoost Classifier': XGBClassifier()
}

model_list = []
accuracy_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    train_accuracy, train_confusionmatrix, train_classificationreport = evaluate_model(y_train,y_train_pred)

    test_accuracy, test_confusionmatrix, test_classificationreport = evaluate_model(y_test, y_test_pred)
        
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("**Accuracy Score:", train_accuracy)
    print("**Confusion Matrix: \n", train_confusionmatrix)
    print("**Classification Report: \n", train_classificationreport)

    print('-'*35)
    
    print('Model performance for Test set')
    print("**Accuracy Score:", test_accuracy)
    print("**Confusion Matrix: \n", test_confusionmatrix)
    print("**Classification Report: \n", test_classificationreport)
    
    accuracy_list.append(test_accuracy)
    
    print('='*35)
    print('\n')

Logistic Regression
Model performance for Training set
**Accuracy Score: 0.8969873663751214
**Confusion Matrix: 
 [[842  20]
 [ 86  81]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.91      0.98      0.94       862
           1       0.80      0.49      0.60       167

    accuracy                           0.90      1029
   macro avg       0.85      0.73      0.77      1029
weighted avg       0.89      0.90      0.89      1029

-----------------------------------
Model performance for Test set
**Accuracy Score: 0.8662131519274376
**Confusion Matrix: 
 [[355  16]
 [ 43  27]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.96      0.92       371
           1       0.63      0.39      0.48        70

    accuracy                           0.87       441
   macro avg       0.76      0.67      0.70       441
weighted avg       0.85      0.87      0.85       441



Deci

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


CatBoost Classifier
Model performance for Training set
**Accuracy Score: 0.9902818270165209
**Confusion Matrix: 
 [[862   0]
 [ 10 157]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       862
           1       1.00      0.94      0.97       167

    accuracy                           0.99      1029
   macro avg       0.99      0.97      0.98      1029
weighted avg       0.99      0.99      0.99      1029

-----------------------------------
Model performance for Test set
**Accuracy Score: 0.8616780045351474
**Confusion Matrix: 
 [[364   7]
 [ 54  16]]
**Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.98      0.92       371
           1       0.70      0.23      0.34        70

    accuracy                           0.86       441
   macro avg       0.78      0.60      0.63       441
weighted avg       0.84      0.86      0.83       441



XGBo

In [73]:
# Results
pd.DataFrame(list(zip(model_list, accuracy_list)), columns=['Model Name', 'Accuracy Score']).sort_values(by=["Accuracy Score"],
                                                                                                         ascending=False)

Unnamed: 0,Model Name,Accuracy Score
4,AdaBoost Classifier,0.879819
0,Logistic Regression,0.866213
8,CatBoost Classifier,0.861678
2,Random Forest Classifier,0.85941
3,Gradient Boosting Classifier,0.85941
9,XGBoost Classifier,0.85941
7,K-Neighbors Classifier,0.857143
5,Support Vector Classifier,0.854875
1,Decision Tree Classifier,0.78458
6,Gaussian Naive Bayes,0.637188


In [74]:
# AdaBoost Classifier
model_adaboost = AdaBoostClassifier()

In [75]:
# fit training data
model_adaboost.fit(X_train, y_train)
model_adaboost.score(X_train, y_train)

0.9086491739552964

In [76]:
# make predictions on test data
predictions = model_adaboost.predict(X_test)

In [77]:
# performance metrics
print("Accuracy Score:", accuracy_score(y_test, predictions).round(4)*100)
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

Accuracy Score: 87.98
Confusion Matrix:
 [[359  12]
 [ 41  29]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93       371
           1       0.71      0.41      0.52        70

    accuracy                           0.88       441
   macro avg       0.80      0.69      0.73       441
weighted avg       0.87      0.88      0.87       441



In [78]:
# Difference between Actual and Predicted Values
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':predictions})
pred_df

Unnamed: 0,Actual Value,Predicted Value
0,0,0
1,0,0
2,0,0
3,0,0
4,0,1
...,...,...
436,0,0
437,0,0
438,0,0
439,1,0
