# Employee Promotion Prediction Model Training

In [51]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.combine import SMOTEENN

import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [52]:
# Load the merged (train + test) employee dataset from the project's data directory.
merged_csv_path = 'data/train_test_merged_data_cleaned.csv'
df_emp = pd.read_csv(merged_csv_path)

In [53]:
# Preview the first rows of the freshly loaded frame to sanity-check the columns.
df_emp.head()

Unnamed: 0.1,Unnamed: 0,department,education,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,gender_m
0,0,7,2,1,35,5.0,8,0,49,0.0,0
1,1,4,0,1,30,5.0,4,0,60,0.0,1
2,2,7,0,1,34,3.0,7,0,50,0.0,1
3,3,7,0,2,39,1.0,10,0,50,0.0,1
4,4,8,0,1,45,3.0,2,0,73,0.0,1


In [54]:
# Remove the leftover 'Unnamed: 0' column (presumably an index column written
# out by an earlier to_csv round-trip -- confirm against the cleaning notebook).
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df_emp = df_emp.drop(columns='Unnamed: 0', errors='ignore')

In [55]:
# Re-inspect the frame after the column drop.
df_emp.head()

Unnamed: 0,department,education,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,gender_m
0,7,2,1,35,5.0,8,0,49,0.0,0
1,4,0,1,30,5.0,4,0,60,0.0,1
2,7,0,1,34,3.0,7,0,50,0.0,1
3,7,0,2,39,1.0,10,0,50,0.0,1
4,8,0,1,45,3.0,2,0,73,0.0,1


### Split the merged dataset into the original Train and Test data

In [56]:
# Row position at which the merged frame switches from the original train rows
# to the original test rows (presumably the row count of the original train
# file -- verify if the source data changes).
n_train_rows = 54808

# Leading rows: the original train dataset
train_data_final = df_emp.iloc[:n_train_rows]

# Remaining rows: the original test dataset
test_data_final = df_emp.iloc[n_train_rows:]


In [57]:
# Dimensions of the training slice as (rows, columns).
train_data_final.shape

(54808, 10)

In [58]:
# Preview the first rows of the training slice.
train_data_final.head()

Unnamed: 0,department,education,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,gender_m
0,7,2,1,35,5.0,8,0,49,0.0,0
1,4,0,1,30,5.0,4,0,60,0.0,1
2,7,0,1,34,3.0,7,0,50,0.0,1
3,7,0,2,39,1.0,10,0,50,0.0,1
4,8,0,1,45,3.0,2,0,73,0.0,1


In [59]:
# Dimensions of the test slice as (rows, columns).
test_data_final.shape

(23490, 10)

In [60]:
# Preview the first rows of the test slice.
test_data_final.head()

Unnamed: 0,department,education,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,is_promoted,gender_m
54808,8,0,1,24,3.0,1,0,77,,1
54809,2,0,1,31,3.0,5,0,51,,0
54810,7,0,1,31,1.0,4,0,47,,1
54811,5,0,3,31,2.0,9,0,65,,0
54812,1,0,1,30,4.0,7,0,61,,1


### Creating Independent and Dependent variables

In [61]:
# Independent variables: every column of the training slice except the target.
x = train_data_final.drop(columns=['is_promoted'])
x

Unnamed: 0,department,education,no_of_trainings,age,previous_year_rating,length_of_service,awards_won?,avg_training_score,gender_m
0,7,2,1,35,5.0,8,0,49,0
1,4,0,1,30,5.0,4,0,60,1
2,7,0,1,34,3.0,7,0,50,1
3,7,0,2,39,1.0,10,0,50,1
4,8,0,1,45,3.0,2,0,73,1
...,...,...,...,...,...,...,...,...,...
54803,8,0,1,48,3.0,15,0,78,1
54804,4,2,1,37,2.0,6,0,56,0
54805,0,0,1,27,5.0,3,0,79,1
54806,7,0,1,29,1.0,2,0,45,1


In [62]:
# Dependent variable: the promotion label column of the training slice.
y = train_data_final.loc[:, 'is_promoted']
y

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
54803    0.0
54804    0.0
54805    0.0
54806    0.0
54807    0.0
Name: is_promoted, Length: 54808, dtype: float64

### SMOTEENN

In [63]:
sm = SMOTEENN()
x_resampled, y_resampled = sm.fit_resample(x, y)

### Train-test split

In [64]:
xr_train, xr_test, yr_train, yr_test = train_test_split(x, y, test_size=0.3, random_state=42)

### Standardization of Data

In [65]:
minmax_scaler=MinMaxScaler()

In [66]:
minmax_scaler_fit=minmax_scaler.fit(xr_train)
xr_train=minmax_scaler_fit.transform(xr_train)

In [67]:
xr_train, xr_test, yr_train, yr_test = train_test_split(x, y, test_size=0.3, random_state=42)

### Model Training

#### Logistic Regression

In [68]:
# L2-regularised logistic regression baseline.
model_lor = LogisticRegression(C=1,penalty='l2', solver='newton-cg')

# Fit on the training split, then predict the held-out split
model_lor_fit=model_lor.fit(xr_train,yr_train)
model_lor_pred=model_lor_fit.predict(xr_test)

# Per-class precision / recall / F1 and the confusion matrix on the held-out split
print(metrics.classification_report(yr_test, model_lor_pred))
print(metrics.confusion_matrix(yr_test, model_lor_pred))

# The summary figure is the support-weighted F1 score, not accuracy, so it is
# labelled as such (the previous label called it "Accuracy").
F1_Score=metrics.f1_score(yr_test, model_lor_pred, average='weighted')
print('Weighted F1-score of the model on Testing Sample Data:', round(F1_Score,2))


              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     15110
         1.0       0.72      0.08      0.14      1333

    accuracy                           0.92     16443
   macro avg       0.82      0.54      0.55     16443
weighted avg       0.91      0.92      0.89     16443

[[15072    38]
 [ 1233   100]]
Accuracy of the model on Testing Sample Data: 0.89


#### Decision Tree Classifier

In [69]:
# Shallow, seeded decision tree (depth and leaf size capped to limit overfitting).
model_dt = DecisionTreeClassifier(criterion = "gini",random_state = 100,max_depth=6, min_samples_leaf=8)

# Fit on the training split, then predict the held-out split
model_dt_fit=model_dt.fit(xr_train,yr_train)
model_dt_pred=model_dt_fit.predict(xr_test)

# Per-class precision / recall / F1 and the confusion matrix on the held-out split
print(metrics.classification_report(yr_test, model_dt_pred))
print(metrics.confusion_matrix(yr_test, model_dt_pred))

# The summary figure is the support-weighted F1 score, not accuracy, so it is
# labelled as such (the previous label called it "Accuracy").
F1_Score=metrics.f1_score(yr_test, model_dt_pred, average='weighted')
print('Weighted F1-score of the model on Testing Sample Data:', round(F1_Score,2))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96     15110
         1.0       0.89      0.14      0.23      1333

    accuracy                           0.93     16443
   macro avg       0.91      0.57      0.60     16443
weighted avg       0.93      0.93      0.90     16443

[[15088    22]
 [ 1153   180]]
Accuracy of the model on Testing Sample Data: 0.9


#### Random Forest Classifier

In [70]:
# Random forest of 100 entropy-split trees. random_state is fixed so the
# bootstrap samples and feature sub-sampling (and therefore the reported
# metrics) are reproducible across runs.
model_rf = RandomForestClassifier(max_depth=10, n_estimators=100,criterion='entropy', random_state=42)

# Fit on the training split, then predict the held-out split
model_rf_fit=model_rf.fit(xr_train,yr_train)
model_rf_pred=model_rf_fit.predict(xr_test)

# Per-class precision / recall / F1 and the confusion matrix on the held-out split
print(metrics.classification_report(yr_test, model_rf_pred))
print(metrics.confusion_matrix(yr_test, model_rf_pred))

# The summary figure is the support-weighted F1 score, not accuracy, so it is
# labelled as such (the previous label called it "Accuracy").
F1_Score=metrics.f1_score(yr_test, model_rf_pred, average='weighted')
print('Weighted F1-score of the model on Testing Sample Data:', round(F1_Score,2))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96     15110
         1.0       0.91      0.17      0.29      1333

    accuracy                           0.93     16443
   macro avg       0.92      0.59      0.63     16443
weighted avg       0.93      0.93      0.91     16443

[[15087    23]
 [ 1100   233]]
Accuracy of the model on Testing Sample Data: 0.91


#### AdaBoost Classifier

In [71]:
# AdaBoost over depth-4 decision trees. Both the base tree and the ensemble are
# seeded so repeated runs produce the same model and metrics.
dt=DecisionTreeClassifier(max_depth=4, random_state=42)
model_ada = AdaBoostClassifier(n_estimators=200, estimator=dt ,learning_rate=0.01, random_state=42)

# Fit on the training split, then predict the held-out split
model_ada_fit=model_ada.fit(xr_train,yr_train)
model_ada_pred=model_ada_fit.predict(xr_test)

# Per-class precision / recall / F1 and the confusion matrix on the held-out split
print(metrics.classification_report(yr_test, model_ada_pred))
print(metrics.confusion_matrix(yr_test, model_ada_pred))

# The summary figure is the support-weighted F1 score, not accuracy, so it is
# labelled as such (the previous label called it "Accuracy").
F1_Score=metrics.f1_score(yr_test, model_ada_pred, average='weighted')
print('Weighted F1-score of the model on Testing Sample Data:', round(F1_Score,2))

              precision    recall  f1-score   support

         0.0       0.93      1.00      0.97     15110
         1.0       0.93      0.20      0.33      1333

    accuracy                           0.93     16443
   macro avg       0.93      0.60      0.65     16443
weighted avg       0.93      0.93      0.91     16443

[[15091    19]
 [ 1066   267]]
Accuracy of the model on Testing Sample Data: 0.91


#### XGB Classifier

In [72]:
# Gradient-boosted trees with a binary logistic objective.
model_xgb = XGBClassifier(max_depth=10, learning_rate=0.01, n_estimators=200, objective='binary:logistic', booster='gbtree')

# Fit on the training split, then predict the held-out split
model_xgb_fit=model_xgb.fit(xr_train,yr_train)
model_xgb_pred=model_xgb_fit.predict(xr_test)

# Per-class precision / recall / F1 and the confusion matrix on the held-out split
print(metrics.classification_report(yr_test, model_xgb_pred))
print(metrics.confusion_matrix(yr_test, model_xgb_pred))

# The summary figure is the support-weighted F1 score, not accuracy, so it is
# labelled as such (the previous label called it "Accuracy").
F1_Score=metrics.f1_score(yr_test, model_xgb_pred, average='weighted')
print('Weighted F1-score of the model on Testing Sample Data:', round(F1_Score,2))

              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97     15110
         1.0       0.92      0.31      0.47      1333

    accuracy                           0.94     16443
   macro avg       0.93      0.66      0.72     16443
weighted avg       0.94      0.94      0.93     16443

[[15075    35]
 [  914   419]]
Accuracy of the model on Testing Sample Data: 0.93


In [80]:
# Describe the employee by feature NAME rather than by position. The training
# matrix has gender_m as its LAST column, so the earlier positional array
# (gender in third place) shifted every value from the third onward onto the
# wrong feature.
sample_employee = pd.DataFrame([{
    'department': 7,
    'education': 1,
    'no_of_trainings': 1,
    'age': 24,
    'previous_year_rating': 3.0,
    'length_of_service': 1,
    'awards_won?': 0,
    'avg_training_score': 77,
    'gender_m': 0,
}])

# Reorder to exactly the column order the models were trained on; a missing or
# misspelled feature name raises KeyError here instead of silently misaligning.
sample_employee = sample_employee[list(x.columns)]

prediction = model_xgb.predict(sample_employee)
print("Whether the Employee should get a Promotion : 1-> Promotion, and 0-> No Promotion :", prediction)

Whether the Employee should get a Promotion : 1-> Promotion, and 0-> No Promotion : [1]
