In [1]:
# import the required packages
import numpy as np
import pandas as pd

# for modelling
import statsmodels.api as sm
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score, cross_validate
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.encoding import OrdinalEncoder
from sklearn.pipeline import Pipeline
import feature_engine.imputation as mdi

# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

In [14]:
# Load the HR dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='hr_data_v2.csv')

In [15]:
# Target: the is_promoted column.
dfy = df['is_promoted']
# Predictors: every remaining column except employee_id
# (presumably a row identifier with no predictive value -- TODO confirm).
dfX = df.drop(columns=['is_promoted', 'employee_id'])

In [16]:
# Hold out 30% of the rows for testing and train on the other 70%.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dfX,
    dfy,
    test_size=0.3,
    random_state=2,
)

In [17]:
# Columns handed to the ordinal encoder in the pipeline
# (presumably the string-valued features -- verify against the dataset).
text_columns = [
    'department',
    'education',
    'gender',
    'recruitment_channel',
]

In [18]:
# Preprocessing and classifier chained into a single estimator, so the same
# transformations are applied when fitting and when predicting.
xgb_pipeline = Pipeline(
    steps=[
        # Step 1: replace missing previous_year_rating values with the constant 3.
        (
            "imputer",
            ArbitraryNumberImputer(
                variables=['previous_year_rating'],
                arbitrary_number=3,
            ),
        ),
        # Step 2: turn the columns in text_columns into integer codes.
        # With 'ordered', codes follow the mean of the target per category,
        # which is why the encoder needs y during fit.
        (
            "ordinal_enc",
            OrdinalEncoder(
                variables=text_columns,
                encoding_method='ordered',
            ),
        ),
        # Step 3: gradient-boosted tree classifier with fixed hyperparameters.
        (
            "model",
            XGBClassifier(
                tree_method='hist',
                n_estimators=200,
                learning_rate=0.1,
                max_depth=4,
                min_child_weight=1,
                gamma=0.1,
                subsample=1.0,
                colsample_bytree=0.6,
                random_state=2,  # fixed seed for repeatable training
                n_jobs=-1,
            ),
        ),
    ]
)

In [19]:
# Train every step in sequence on the training split: the imputer and encoder
# are fitted first, then the classifier is fitted on their transformed output.
xgb_pipeline.fit(X_train, y=y_train)

0,1,2
,steps,"[('imputer', ...), ('ordinal_enc', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,arbitrary_number,3
,variables,['previous_year_rating']
,imputer_dict,

0,1,2
,encoding_method,'ordered'
,variables,"['department', 'education', ...]"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.6
,device,
,early_stopping_rounds,
,enable_categorical,False


In [20]:
# Serialise the fitted pipeline (preprocessing + model) to disk in one file.
with open('xgb_model.bin', 'wb') as model_file:
    pickle.dump(xgb_pipeline, model_file)

In [21]:
# Read the pipeline back from disk, rebinding xgb_pipeline to the loaded object.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('xgb_model.bin', 'rb') as model_file:
    xgb_pipeline = pickle.load(model_file)

In [46]:
# Extract the test row at position 5 as a plain {column: value} dict.
payload = (
    X_test
    .iloc[[5]]                   # list indexer keeps a one-row DataFrame
    .to_dict(orient='records')   # list holding a single record dict
)[0]

In [47]:
# Wrap the single record in a list to get a one-row DataFrame for the pipeline.
data_point = pd.DataFrame(data=[payload])

In [48]:
# True label of the same test row (position 5), for comparison with the prediction.
y_test.iat[5]

np.int64(1)

In [49]:
# Predicted probability for the single-row frame: row 0, column 1 of
# predict_proba (the positive class, i.e. is_promoted == 1).
proba = xgb_pipeline.predict_proba(data_point)[0, 1]
# The target is is_promoted, so the value is a promotion probability
# (the previous label said "churning", which this model does not predict).
print('prob of promotion =', proba)

# Turn the probability into a label with a 0.5 cut-off.
print('promoted' if proba >= 0.5 else 'not promoted')

prob of churning = 0.776901
promoted
