# ML models
In this notebook I'll explore some ML algorithms to predict if an employee will stay or leave the company.

- **[Logistic Regression (Logit)](#logit)**
- **XGBoost** 
- **Light GBM**
- **CAT Boost**
- **Support Vectorial Machines**
- **Neural Networks** 

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.preprocessing import  StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import lightgbm as lgbm
import catboost as cat
from sklearn import svm
from sklearn import metrics
import tensorflow as tf
# from tensorflow.keras.layers import Input, Dense
# from tensorflow.keras.models import Model
# from tensorflow.keras.optimizers import SGD, Adam

In [93]:
# import datasets
train = pd.read_csv('data.example.csv')
production = pd.read_csv('current_employees.example.csv')

In [94]:
# # adjust data
train = train.drop(columns=['employee_id', 'termination_reason'], axis=1)

# split train set
x = train.drop(columns=['stayed'], axis=1)
y = train['stayed']

# Split dataset considering train and test must have default class
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=101, stratify=y)

## Logit <a name="logit"></a>

In [96]:
# Create instance for Logit model
logit = LogisticRegression()

# Fit model in my training set
logit.fit(x_train, y_train)

# Predict y_test
logit_predict = logit.predict(x_test)

In [97]:
# Compare results
logit_matrix = metrics.confusion_matrix(y_test, logit_predict)
print('Confusion Matrix','\n',logit_matrix, '\n')

print(metrics.classification_report(y_test, logit_predict))

print('AUC: ',round(metrics.roc_auc_score(y_test, logit_predict),3))

Confusion Matrix 
 [[3 4]
 [3 5]] 

              precision    recall  f1-score   support

           0       0.50      0.43      0.46         7
           1       0.56      0.62      0.59         8

    accuracy                           0.53        15
   macro avg       0.53      0.53      0.52        15
weighted avg       0.53      0.53      0.53        15

AUC:  0.527


In [98]:
# Extract classification report
metrics_report = metrics.classification_report(y_test, logit_predict, output_dict=True)
metrics_report = pd.DataFrame(metrics_report).round(2).transpose()
metrics_report['Model'] = 'logit'
metrics_report

Unnamed: 0,precision,recall,f1-score,support,Model
0,0.5,0.43,0.46,7.0,logit
1,0.56,0.62,0.59,8.0,logit
accuracy,0.53,0.53,0.53,0.53,logit
macro avg,0.53,0.53,0.52,15.0,logit
weighted avg,0.53,0.53,0.53,15.0,logit


In [99]:
# extract AUC metric
logit_auc = round(metrics.roc_auc_score(y_test, logit_predict),3)
auc_report = pd.DataFrame({'Model': ['Logistic Regression'], 'AUC': [logit_auc]})
auc_report

Unnamed: 0,Model,AUC
0,Logistic Regression,0.527


## XGBoost

In [100]:
# Create XGBoost instance
XGB = xgb.XGBClassifier()

# Fit he model
XGB.fit(x_train, y_train)

# Pedict
xgb_predict = XGB.predict(x_test)

In [101]:
# Analyze performance
print('Confusion matrix', '\n',metrics.confusion_matrix(y_test, xgb_predict), '\n')

print(metrics.classification_report(y_test, xgb_predict))

print('AUC: ',round(metrics.roc_auc_score(y_test, xgb_predict),3))


Confusion matrix 
 [[2 5]
 [3 5]] 

              precision    recall  f1-score   support

           0       0.40      0.29      0.33         7
           1       0.50      0.62      0.56         8

    accuracy                           0.47        15
   macro avg       0.45      0.46      0.44        15
weighted avg       0.45      0.47      0.45        15

AUC:  0.455


In [102]:
# Create temporary class report 
temp_metrics_report = metrics.classification_report(y_test, xgb_predict, output_dict=True)
temp_metrics_report = pd.DataFrame(temp_metrics_report).round(2).transpose()
temp_metrics_report['Model'] = 'xgb IP'

# Concat with main df
metrics_report = pd.concat([metrics_report, temp_metrics_report], axis=0)

In [103]:
# extract AUC metric
xgb_auc = round(metrics.roc_auc_score(y_test, xgb_predict),3)
temp_auc_report = pd.DataFrame({'Model': ['XGBoost'], 'AUC': [xgb_auc]})

# Concat with main df
auc_report = pd.concat([auc_report, temp_auc_report], axis=0)
auc_report

Unnamed: 0,Model,AUC
0,Logistic Regression,0.527
0,XGBoost,0.455


## Light GBM

In [104]:
# Create instance
lgb = lgbm.LGBMClassifier()

# Fit in train set
lgb.fit(x_train, y_train)

# Predict
lgb_predict = lgb.predict(x_test)

[LightGBM] [Info] Number of positive: 18, number of negative: 16
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 0
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529412 -> initscore=0.117783
[LightGBM] [Info] Start training from score 0.117783


In [105]:
# Confusion matrix
print('Confusion Matrix:', '\n',metrics.confusion_matrix(y_test,lgb_predict), '\n')

# Classification report
print(metrics.classification_report(y_test, lgb_predict))

print('AUC: ',round(metrics.roc_auc_score(y_test, lgb_predict),3))


Confusion Matrix: 
 [[0 7]
 [0 8]] 

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         7
           1       0.53      1.00      0.70         8

    accuracy                           0.53        15
   macro avg       0.27      0.50      0.35        15
weighted avg       0.28      0.53      0.37        15

AUC:  0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [106]:
# Create temporary class report 
temp_metrics_report = metrics.classification_report(y_test, lgb_predict, output_dict=True)
temp_metrics_report = pd.DataFrame(temp_metrics_report).round(2).transpose()
temp_metrics_report['Model'] = 'Light GBM'

# Concat with main df
metrics_report = pd.concat([metrics_report, temp_metrics_report], axis=0)

# extract AUC metric
lgb_auc = round(metrics.roc_auc_score(y_test, lgb_predict),3)
temp_auc_report = pd.DataFrame({'Model': ['Light GBM'], 'AUC': [lgb_auc]})

# Concat with main df
auc_report = pd.concat([auc_report, temp_auc_report], axis=0)
auc_report

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Model,AUC
0,Logistic Regression,0.527
0,XGBoost,0.455
0,Light GBM,0.5


## CAT

In [107]:
# Create instance
cat = cat.CatBoostClassifier()

# Fit in train set
cat.fit(x_train, y_train)

# Predict
cat_predict = cat.predict(x_test)

Learning rate set to 0.002432
0:	learn: 0.6906692	total: 5.52ms	remaining: 5.52s
1:	learn: 0.6889549	total: 10.6ms	remaining: 5.31s
2:	learn: 0.6871822	total: 15.6ms	remaining: 5.19s
3:	learn: 0.6849942	total: 21.8ms	remaining: 5.44s
4:	learn: 0.6830751	total: 27.3ms	remaining: 5.43s
5:	learn: 0.6812971	total: 30.8ms	remaining: 5.11s
6:	learn: 0.6796870	total: 34.2ms	remaining: 4.86s
7:	learn: 0.6778329	total: 37.7ms	remaining: 4.67s
8:	learn: 0.6758775	total: 40.2ms	remaining: 4.42s
9:	learn: 0.6737074	total: 42.3ms	remaining: 4.18s
10:	learn: 0.6711565	total: 44.3ms	remaining: 3.98s
11:	learn: 0.6695534	total: 46.3ms	remaining: 3.81s
12:	learn: 0.6673747	total: 48.3ms	remaining: 3.67s
13:	learn: 0.6653380	total: 50.4ms	remaining: 3.55s
14:	learn: 0.6627248	total: 52.1ms	remaining: 3.42s
15:	learn: 0.6611137	total: 53.8ms	remaining: 3.31s
16:	learn: 0.6594210	total: 55.2ms	remaining: 3.19s
17:	learn: 0.6575363	total: 56.5ms	remaining: 3.08s
18:	learn: 0.6554801	total: 57.8ms	remaining

In [108]:
# Confusion matrix
print('Confusion Matrix:', '\n',metrics.confusion_matrix(y_test,cat_predict), '\n')

# Classification report
print(metrics.classification_report(y_test, cat_predict))

print('AUC: ',round(metrics.roc_auc_score(y_test, cat_predict),3))

Confusion Matrix: 
 [[3 4]
 [3 5]] 

              precision    recall  f1-score   support

           0       0.50      0.43      0.46         7
           1       0.56      0.62      0.59         8

    accuracy                           0.53        15
   macro avg       0.53      0.53      0.52        15
weighted avg       0.53      0.53      0.53        15

AUC:  0.527


In [109]:
# Create temporary class report 
temp_metrics_report = metrics.classification_report(y_test, cat_predict, output_dict=True)
temp_metrics_report = pd.DataFrame(temp_metrics_report).round(2).transpose()
temp_metrics_report['Model'] = 'CAT'

# Concat with main df
metrics_report = pd.concat([metrics_report, temp_metrics_report], axis=0)

# extract AUC metric
cat_auc = round(metrics.roc_auc_score(y_test, cat_predict),3)
temp_auc_report = pd.DataFrame({'Model': ['CAT Boost'], 'AUC': [cat_auc]})

# Concat with main df
auc_report = pd.concat([auc_report, temp_auc_report], axis=0)
auc_report

Unnamed: 0,Model,AUC
0,Logistic Regression,0.527
0,XGBoost,0.455
0,Light GBM,0.5
0,CAT Boost,0.527


## SVM

In [110]:
# Instance model
svc = svm.SVC()

# Fit
svc.fit(x_train, y_train)

# Predict
svc_predict = svc.predict(x_test)

In [111]:
# See results
print('Confusion matrix', '\n', metrics.confusion_matrix(y_test, svc_predict))

# Classification report
print(metrics.classification_report(y_test, svc_predict))

print('AUC: ',round(metrics.roc_auc_score(y_test, svc_predict),3))


Confusion matrix 
 [[2 5]
 [2 6]]
              precision    recall  f1-score   support

           0       0.50      0.29      0.36         7
           1       0.55      0.75      0.63         8

    accuracy                           0.53        15
   macro avg       0.52      0.52      0.50        15
weighted avg       0.52      0.53      0.51        15

AUC:  0.518


In [112]:
# Create temporary class report 
temp_metrics_report = metrics.classification_report(y_test, svc_predict, output_dict=True)
temp_metrics_report = pd.DataFrame(temp_metrics_report).round(2).transpose()
temp_metrics_report['Model'] = 'SVM'

# Concat with main df
metrics_report = pd.concat([metrics_report, temp_metrics_report], axis=0)

# extract AUC metric
svm_auc = round(metrics.roc_auc_score(y_test, svc_predict),3)
temp_auc_report = pd.DataFrame({'Model': ['SVM'], 'AUC': [svm_auc]})

# Concat with main df
auc_report = pd.concat([auc_report, temp_auc_report], axis=0)
auc_report

Unnamed: 0,Model,AUC
0,Logistic Regression,0.527
0,XGBoost,0.455
0,Light GBM,0.5
0,CAT Boost,0.527
0,SVM,0.518


# Results

In [114]:
auc_report

Unnamed: 0,Model,AUC
0,Logistic Regression,0.527
0,XGBoost,0.455
0,Light GBM,0.5
0,CAT Boost,0.527
0,SVM,0.518


In [126]:
metrics_report[metrics_report['Model'].isin(['CAT', 'logit'])]

Unnamed: 0,precision,recall,f1-score,support,Model
0,0.5,0.43,0.46,7.0,logit
1,0.56,0.62,0.59,8.0,logit
accuracy,0.53,0.53,0.53,0.53,logit
macro avg,0.53,0.53,0.52,15.0,logit
weighted avg,0.53,0.53,0.53,15.0,logit
0,0.5,0.43,0.46,7.0,CAT
1,0.56,0.62,0.59,8.0,CAT
accuracy,0.53,0.53,0.53,0.53,CAT
macro avg,0.53,0.53,0.52,15.0,CAT
weighted avg,0.53,0.53,0.53,15.0,CAT
