In [85]:
import sys
from time import gmtime, strftime
import os

import boto3
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split
import pickle as pkl

from app.inference.preprocess_utils import encode_categorical_col
from app import config




In [69]:
df = pd.read_csv('data/employee-attrition.csv')

In [35]:
preprocess_conf = {
    'encoders': {},
}

In [36]:

features_to_encode= [
    'BusinessTravel',
    'Department',
    'EducationField',
    'Gender',
    'JobRole',
    'MaritalStatus',
    'Over18',
    'OverTime',
]

for feature in features_to_encode:
    df, encoder = encode_categorical_col(df, feature)
    preprocess_conf['encoders'][feature] = encoder

df['Attrition'] = df['Attrition'] == 'Yes'

In [37]:
df.head()

Unnamed: 0,Age,Attrition,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,41,True,1102,1,2,1,1,2,94,3,...,False,False,True,False,False,False,True,True,False,True
1,49,False,279,8,1,1,2,3,61,2,...,False,True,False,False,False,True,False,True,True,False
2,37,True,1373,2,2,1,4,4,92,2,...,False,False,False,False,False,False,True,True,False,True
3,33,False,1392,3,4,1,5,4,56,3,...,False,True,False,False,False,True,False,True,False,True
4,27,False,591,2,1,1,7,1,40,3,...,False,False,False,False,False,True,False,True,True,False


In [38]:
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['Attrition']),
    df['Attrition'],
    test_size=0.15,
    random_state=42
)

In [40]:
import xgboost as xgb
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

params = {
    'objective': 'binary:logistic',
    'n_estimators': 120,
    'scale_pos_weight': 5,
}

model = xgb.XGBClassifier(**params)
model = model.fit(X_train, y_train)

y_pred_binary = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
cm = confusion_matrix(y_test, y_pred_binary)

print(f"Accuracy: {accuracy:.2f}%")
print(f"F1: {f1:.2f}")


Accuracy: 0.87%
F1: 0.48


In [41]:
def format_confusion_matrix(cm):
    tn, fp, fn, tp = cm.ravel()
    
    table = f"{'':<10}{'Predicted 0':<12}{'Predicted 1':<12}\n"
    table += f"{'Actual 0':<10}{tn:<12}{fp:<12}\n"
    table += f"{'Actual 1':<10}{fn:<12}{tp:<12}"
    
    return table

print(format_confusion_matrix(cm))

          Predicted 0 Predicted 1 
Actual 0  180         10          
Actual 1  18          13          


In [65]:
model_root = 'model'
model_file_name = config.MODEL_FILENAME
preprocess_conf_file_name = config.PREPROCESS_FILENAME
model_archive = strftime("xgb-clf-%Y-%m-%d_%H-%M.tar.gz", gmtime())

with open(os.path.join(model_root, preprocess_conf_file_name), 'wb') as file:
    pkl.dump(preprocess_conf, file)

with open(os.path.join(model_root, model_file_name), 'wb') as file:
    pkl.dump(model, file)

In [66]:
!cd {model_root} && tar czvf {model_archive} *.pkl && rm *.pkl

model.pkl
preprocess_conf.pkl


In [67]:
region = boto3.Session().region_name
bucket = 'crayon-task'
prefix = 'models'
bucket_path = 'https://s3-{}.amazonaws.com/{}'.format(region,bucket)

In [68]:
file_obj = open(os.path.join(model_root, model_archive), 'rb')
key= os.path.join(prefix, model_archive)
boto3.Session().resource('s3').Bucket(bucket).Object(key).upload_fileobj(file_obj)