# Predicting employee attrition rate in organizations

## Using PyCaret

### Step 1: Importing the data 

Import required libraries along with PyCaret.

In [1]:
import numpy as np
import pandas as pd
from pycaret.regression import *

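Download the dataset from the HackerEarth website using the link provided in the README file. Create a folder named 'dataset' in the root folder of this project and copy the train and test CSV files into it, so that they are available as `../dataset/Train.csv` and `../dataset/Test.csv` relative to this notebook.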

In [2]:
# Locations of the competition files, relative to the notebook's working directory.
train_csv, test_csv = '../dataset/Train.csv', '../dataset/Test.csv'
# Read both CSV files into DataFrames in one pass.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

### Step 2: Setting up the configuration

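Set up the training configuration and press 'Enter' when prompted to confirm the inferred data types. PyCaret handles preprocessing such as imputing missing values for you; normalization is also available, but it is applied only when `normalize=True` is passed to `setup`.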

In [3]:
# Specify the target column and exclude the identifier column from the features.
# session_id pins the random seed PyCaret uses for the experiment, so a fresh
# "Restart & Run All" reproduces the same results; 7053 is the seed reported in
# the setup summary recorded below (it was drawn at random when no seed was given).
reg = setup(
    train_data,
    target='Attrition_rate',
    ignore_features=['Employee_ID'],
    session_id=7053,
)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,7053
1,Transform Target,False
2,Transform Target Method,
3,Original Data,"(7000, 24)"
4,Missing Values,True
5,Numeric Features,5
6,Categorical Features,18
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


### Step 3: Tuning the models

Train multiple models at once using the function below. As the target is a continuous variable, PyCaret automatically selects the available regression models and trains them using cross-validation. It also reports several metrics for each model and sorts the models from best to worst (the table below is ordered by R2).

In [4]:
# Train the candidate regression models and display their metrics side by side
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Lasso Regression,0.1257,0.0343,0.1849,-0.0009,0.1384,5.1507
1,Elastic Net,0.1257,0.0343,0.1849,-0.0009,0.1384,5.1507
2,Lasso Least Angle Regression,0.1257,0.0343,0.1849,-0.0009,0.1384,5.1507
3,Bayesian Ridge,0.1259,0.0343,0.185,-0.0016,0.1384,5.1296
4,Orthogonal Matching Pursuit,0.1264,0.0345,0.1854,-0.0069,0.1389,5.1499
5,TheilSen Regressor,0.1266,0.0348,0.1861,-0.0147,0.1394,5.0954
6,Ridge Regression,0.1276,0.0348,0.1862,-0.0153,0.1396,5.1737
7,Least Angle Regression,0.1276,0.0348,0.1862,-0.0153,0.1396,5.1746
8,Linear Regression,0.1276,0.0348,0.1862,-0.0154,0.1396,5.1748
9,Random Sample Consensus,0.1238,0.0353,0.1875,-0.0293,0.1399,4.6726


### Step 4: Selecting a model

Select the best algorithm based on the results from the above step and create a final model.

In [5]:
# Fit a Bayesian Ridge regressor; 'br' is the estimator ID passed to PyCaret
model = create_model(estimator='br')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.14,0.0429,0.207,-0.0074,0.1523,3.9775
1,0.1152,0.0275,0.1658,-0.0057,0.1266,3.9697
2,0.1291,0.0361,0.19,0.0008,0.1412,3.6543
3,0.1316,0.0386,0.1963,0.0007,0.1455,10.0658
4,0.1216,0.0323,0.1798,0.0002,0.1345,7.7441
5,0.1237,0.033,0.1818,-0.0009,0.1366,4.1306
6,0.1229,0.0324,0.1801,0.0027,0.136,3.6313
7,0.1281,0.0366,0.1913,0.0002,0.1418,3.5683
8,0.1232,0.0337,0.1837,-0.0005,0.1369,7.0616
9,0.1238,0.0303,0.174,-0.0064,0.133,3.4927


In [6]:
# Print the selected estimator to show the hyperparameters it was created with
print(model)

BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)


### Step 5: Predicting on test data

Generate predictions for the test data using the final model selected in the step above.

In [7]:
# Score the test set with the fitted model; the returned frame carries the
# predicted values in a 'Label' column.
predictions = predict_model(
    model,
    data=test_data,
)

In [8]:
# Preview the test rows together with the predicted values (the 'Label' column)
predictions

Unnamed: 0,Employee_ID,Gender,Age,Education_Level,Relationship_Status,Hometown,Unit,Decision_skill_possess,Time_of_service,Time_since_promotion,...,Compensation_and_Benefits,Work_Life_balance,VAR1,VAR2,VAR3,VAR4,VAR5,VAR6,VAR7,Label
0,EID_22713,F,32.0,5,Single,Springfield,R&D,Conceptual,7.0,4,...,type2,1.0,3,-0.9612,-0.4537,2.0,1,8,4,0.1879
1,EID_9658,M,65.0,2,Single,Lebanon,IT,Directive,41.0,2,...,type2,1.0,4,-0.9612,0.7075,1.0,2,8,2,0.1882
2,EID_22203,M,52.0,3,Married,Springfield,Sales,Directive,21.0,3,...,type3,1.0,4,-0.1048,0.7075,2.0,1,9,3,0.1794
3,EID_7652,M,50.0,5,Single,Washington,Marketing,Analytical,11.0,4,...,type0,4.0,3,-0.1048,0.7075,2.0,2,8,3,0.1819
4,EID_6516,F,44.0,3,Married,Franklin,R&D,Conceptual,12.0,4,...,type2,4.0,4,1.6081,0.7075,2.0,2,7,4,0.1883
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,EID_22547,F,32.0,3,Single,Franklin,Sales,Directive,3.0,3,...,type2,2.0,1,-0.1048,0.7075,1.0,4,7,4,0.1889
2996,EID_10066,F,,2,Single,Franklin,Marketing,Conceptual,10.0,2,...,type2,1.0,3,0.7516,0.7075,3.0,4,8,3,0.1894
2997,EID_7126,F,60.0,4,Single,Lebanon,Logistics,Analytical,35.0,1,...,type3,3.0,3,-1.8176,-0.4537,,4,8,4,0.1877
2998,EID_4929,F,51.0,1,Married,Springfield,IT,Behavioral,23.0,1,...,type3,1.0,5,0.7516,-0.4537,2.0,3,9,2,0.1804


In [9]:
# Rename the 'Label' column to the target name used in the saved predictions file.
# The result is rebound instead of using inplace=True, so the frame shown by the
# earlier display cell is not mutated behind the reader's back.
predictions = predictions.rename(columns={"Label": "Attrition_rate"})

Save the predictions.

In [10]:
# Write only the employee ID and the predicted attrition rate, without the index.
predictions.to_csv(
    '../predictions.csv',
    columns=['Employee_ID', 'Attrition_rate'],
    index=False,
)