In [1]:
# import the required packages
import numpy as np
import pandas as pd

# for modelling
import statsmodels.api as sm
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, cross_validate
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.encoding import OrdinalEncoder
from sklearn.pipeline import Pipeline
import feature_engine.imputation as mdi

# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

In [14]:
# Load the HR dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv('hr_data_v2.csv') # Import dataset

In [52]:
# Normalise the column labels: lower-case them and swap spaces for underscores.
df.columns = [label.lower().replace(' ', '_') for label in df.columns]

In [67]:
# Display the column labels of the frame.
df.columns

Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'kpis_met_>80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')

In [69]:
# Columns summarised with describe() in the exploration cell.
numerical = [
    'no_of_trainings',
    'age',
    'previous_year_rating',
    'length_of_service',
    'avg_training_score',
]

In [70]:
# Columns summarised with value_counts() in the exploration cell.
# The list also contains the target column 'is_promoted'.
categorical = [
    'department',
    'region',
    'education',
    'gender',
    'recruitment_channel',
    'kpis_met_>80%',
    'awards_won?',
    'is_promoted',
]

In [71]:
# Numeric columns: print summary statistics, each followed by a blank line.
for num_col in numerical:
    print(df[num_col].describe(), end='\n\n')

# Categorical columns: print the frequency of each distinct value,
# each followed by a blank line.
for cat_col in categorical:
    print(df[cat_col].value_counts(), end='\n\n')

count    9336.000000
mean        1.235433
std         0.572586
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         8.000000
Name: no_of_trainings, dtype: float64

count    9336.000000
mean       34.652421
std         7.384614
min        20.000000
25%        29.000000
50%        33.000000
75%        38.000000
max        60.000000
Name: age, dtype: float64

count    8657.000000
mean        3.639367
std         1.209850
min         1.000000
25%         3.000000
50%         4.000000
75%         5.000000
max         5.000000
Name: previous_year_rating, dtype: float64

count    9336.000000
mean        5.818873
std         4.126219
min         1.000000
25%         3.000000
50%         5.000000
75%         7.000000
max        34.000000
Name: length_of_service, dtype: float64

count    9336.000000
mean       66.974400
std        14.559774
min        40.000000
25%        54.000000
50%        64.000000
75%        80.000000
max        99.000000
Name: avg

In [54]:
# Target vector: the is_promoted column.
dfy = df['is_promoted']
# Feature matrix: every column except the target and the employee identifier.
dfX = df.drop(columns=['is_promoted', 'employee_id'])

In [55]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    dfX,
    dfy,
    test_size=0.3,
    random_state=2,
)

In [56]:
# Columns handed to the ordinal encoder in the modelling pipeline.
text_columns = [
    'department',
    'education',
    'gender',
    'recruitment_channel',
]

In [57]:
# Three-stage pipeline: impute -> encode -> classify.
xgb_pipeline = Pipeline([
    # Stage 1: fill missing previous_year_rating values with the constant 3.
    (
        "imputer",
        ArbitraryNumberImputer(
            variables=['previous_year_rating'],
            arbitrary_number=3,
        ),
    ),
    # Stage 2: replace the categories in text_columns with integers using the
    # 'ordered' method (per the feature_engine docs, ordered by target mean).
    (
        "ordinal_enc",
        OrdinalEncoder(
            variables=text_columns,
            encoding_method='ordered',
        ),
    ),
    # Stage 3: XGBoost classifier with fixed hyper-parameters.
    (
        "model",
        XGBClassifier(
            # tree construction
            tree_method='hist',
            n_estimators=200,
            learning_rate=0.1,
            max_depth=4,
            min_child_weight=1,
            gamma=0.1,
            # row / column sampling
            subsample=1.0,
            colsample_bytree=0.6,
            # reproducibility and parallelism
            random_state=2,
            n_jobs=-1,
        ),
    ),
])

In [58]:
# Fit the entire pipeline on the training split: the imputer and encoder steps
# are fitted and applied in order, then the final classifier is trained.
xgb_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('imputer', ...), ('ordinal_enc', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,arbitrary_number,3
,variables,['previous_year_rating']
,imputer_dict,

0,1,2
,encoding_method,'ordered'
,variables,"['department', 'education', ...]"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6
,device,
,early_stopping_rounds,
,enable_categorical,False


In [59]:
# Serialise the fitted pipeline (preprocessing steps and model together) to disk.
with open('xgb_model.bin', mode='wb') as model_file:
    pickle.dump(xgb_pipeline, model_file)

In [60]:
# Read the pickled pipeline back from disk and rebind xgb_pipeline to it, so the
# prediction cell further down runs against the deserialised object.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open('xgb_model.bin', mode='rb') as model_file:
    xgb_pipeline = pickle.load(model_file)

In [73]:
# Take the held-out row at position 10 of X_test as a plain dict of
# column name -> value. The label for the same position is read separately
# with y_test.iloc[10], so the two positions must be kept in sync.
payload = X_test.iloc[[10]].to_dict(orient='records')[0]
payload

{'department': 'Sales & Marketing',
 'region': 7,
 'education': "Bachelor's",
 'gender': 'Male',
 'recruitment_channel': 'Other',
 'no_of_trainings': 1,
 'age': 31,
 'previous_year_rating': 3.0,
 'length_of_service': 7,
 'kpis_met_>80%': 0,
 'awards_won?': 0,
 'avg_training_score': 54}

In [74]:
# Wrap the single record in a one-row DataFrame; this is the object passed to
# xgb_pipeline.predict_proba.
data_point = pd.DataFrame([payload])

In [75]:
# Actual is_promoted label at position 10 of y_test, the same position that
# `payload` was taken from in X_test.
y_test.iloc[10]

np.int64(0)

In [76]:
# Score the single-row frame. predict_proba returns one column per class, and
# [0, 1] picks the first row's second column, i.e. the probability of
# is_promoted == 1 for a 0/1 target.
proba = xgb_pipeline.predict_proba(data_point)[0, 1]
# The model predicts promotion (target is is_promoted), so the value is
# reported as a promotion probability rather than a churn probability.
print('prob of promotion =', proba)

# Label the employee as promoted when the probability reaches the 0.5 cut-off.
if proba >= 0.5:
    print('promoted')
else:
    print('not promoted')

prob of churning = 0.07820247
not promoted
