# Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import os
import time

# Basics
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder

#Pipelines and transformers
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

# Models
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Useful

In [3]:
def check_shape_head(df : pd.DataFrame):
    assert isinstance(df, pd.DataFrame), f"{df} sould be a pandas dataframe"
    print(df.shape)
    return df.head(2)

# Retrieve data

In [4]:
!pwd

/home/thierry/code/thcarole1/hr-data-analytics-thierry/hr-data-analytics/notebooks


In [5]:
# take environment variables from .env.
load_dotenv()

data_path = os.getenv("DATA_HR")
data_path = '../' + data_path

#Retrieve dataset from local directory
dataset = pd.read_csv(data_path)

pd.set_option('display.max_columns', None)
check_shape_head(dataset)

(14999, 62)


Unnamed: 0,ID,Name,Department,GEO,Role,Rising_Star,Will_Relocate,Critical,Trending Perf,Talent_Level,Validated_Talent_Level,Percent_Remote,EMP_Sat_OnPrem_1,EMP_Sat_OnPrem_2,EMP_Sat_OnPrem_3,EMP_Sat_OnPrem_4,EMP_Sat_OnPrem_5,EMP_Sat_Remote_1,EMP_Sat_Remote_2,EMP_Sat_Remote_3,EMP_Sat_Remote_4,EMP_Sat_Remote_5,EMP_Engagement_1,EMP_Engagement_2,EMP_Engagement_3,EMP_Engagement_4,EMP_Engagement_5,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left_Company,CSR Factor,promotion_last_5years,sales,salary,Gender,LinkedIn_Hits,Emp_Work_Status2,Emp_Work_Status_3,Emp_Work_Status_4,Emp_Work_Status_5,Emp_Identity,Emp_Role,Emp_Position,Emp_Title,Women_Leave,Men_Leave,Emp_Competitive_1,Emp_Competitive_2,Emp_Competitive_3,Emp_Competitive_4,Emp_Competitive_5,Emp_Collaborative_1,Emp_Collaborative_2,Emp_Collaborative_3,Emp_Collaborative_4,Emp_Collaborative_5,Sensor_StepCount,Sensor_Heartbeat(Average/Min),Sensor_Proximity(1-highest/10-lowest)
0,1,BRADDY,Operations,US,VP,,0,1.0,3,6,6,0.4,3.0,3.0,4.0,3.0,5.0,2,3,2,3,2,5,5,4,4,3,0.36,3,168,3,1,0,,0,sales,low,M,5,4,3,1,1,1,1,2,1,,1.0,2,4,2,2,2,2,2,2,1,5,1841,61,9
1,2,BORST,Sales,UK,Senior Director,,0,,3,6,6,0.4,3.0,3.0,4.0,3.0,5.0,2,3,2,3,2,5,5,4,4,3,0.36,5,159,2,0,0,,0,accounting,low,F,58,3,3,4,3,2,1,2,1,0.0,,2,2,1,4,1,3,2,5,1,5,1990,90,8


# Data preparation

In [6]:
# Remove unwanted columns
dropped_columns = ['ID','Name','Rising_Star', 'Trending Perf', 'Talent_Level',
                     'Validated_Talent_Level', 'EMP_Sat_OnPrem_1', 'EMP_Sat_OnPrem_2',
                     'EMP_Sat_OnPrem_3','EMP_Sat_Remote_3', 'EMP_Sat_Remote_4','EMP_Sat_Remote_5',
                     'EMP_Engagement_2','EMP_Engagement_3','EMP_Engagement_4',
                     'EMP_Engagement_5','CSR Factor','sales']

dataset.drop(columns = dropped_columns, inplace = True)

In [7]:
# Rename columns
dataset = dataset.rename(columns={
                                    'Sensor_Heartbeat(Average/Min)': 'Sensor_Heartbeat',
                                    'Sensor_Proximity(1-highest/10-lowest)': 'Sensor_Proximity'
                                })

In [8]:
# Convert column names to lower snake case
dataset.columns = dataset.columns.str.lower().str.replace(' ', '_').str.replace('-', '_').str.replace('.', '_')

In [9]:
# combine all the men_leave and the women_leave column
dataset['leave'] = dataset['women_leave'].fillna(dataset['men_leave'])
dataset['leave'] = dataset['leave'].fillna(0)
dataset.drop(columns = ['women_leave', 'men_leave'], inplace = True)

In [10]:
# remove highly correlated features >= 0.7 or <= -0.7
dataset.drop(columns=['emp_sat_onprem_4','percent_remote',
                'emp_sat_remote_2','emp_sat_remote_1',
                'emp_engagement_1'], inplace = True)
dataset.shape

(14999, 38)

# Specific preparation for applicants

In [11]:
job_posting = ['department','geo','role', 'critical','average_montly_hours','salary']
applicant = ['will_relocate', 'gender']

In [12]:
dataset = dataset[job_posting + applicant + ['left_company']]

# Define X and y

In [13]:
#Features
X = dataset.drop(columns=['left_company'])

#Target
y = dataset['left_company']

In [14]:
check_shape_head(X)

(14999, 8)


Unnamed: 0,department,geo,role,critical,average_montly_hours,salary,will_relocate,gender
0,Operations,US,VP,1.0,168,low,0,M
1,Sales,UK,Senior Director,,159,low,0,F


# Train test split

In [15]:
# Create a train set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=10)

In [16]:
#Check shapes
print(f"X_train shape is : {X_train.shape}")
print(f"y_train shape is : {y_train.shape}\n")
print(f"X_test shape is : {X_test.shape}")
print(f"y_test shape is : {y_test.shape}")

X_train shape is : (10499, 8)
y_train shape is : (10499,)

X_test shape is : (4500, 8)
y_test shape is : (4500,)


# Pipeline

In [17]:
#Functions used in basic imputations
def imputer_critical(x):
    x = x.apply(lambda y: y.map({1 : 1, np.nan: 0}))
    return x

def imputer_gender(x):
    x = x.apply(lambda y: y.map({'F': 1, 'M': 0}))
    return x

# Preprocessor
ohe_scale_cols = ['department', 'geo', 'role']
minmax_scale_cols = ['average_montly_hours']
ordinal_scale_cols = ['salary']


preproc = make_column_transformer(
    # Basic imputations
    (FunctionTransformer(imputer_gender,feature_names_out ='one-to-one'), ['gender']),
    (FunctionTransformer(imputer_critical,feature_names_out ='one-to-one'), ['critical']),
    
    #Numerical preproc
    (MinMaxScaler(), minmax_scale_cols),
    
    #Categorical preproc
    (OneHotEncoder(sparse_output = False), ohe_scale_cols),
    (OrdinalEncoder(), ordinal_scale_cols),
    
    #Remaining columns pass
    remainder='passthrough',
    force_int_remainder_cols=False
)

In [18]:
# Output X_train_encoded
X_train_encoded = pd.DataFrame(preproc.fit_transform(X_train), columns=preproc.get_feature_names_out())

# Output X_test_encoded
X_test_encoded = pd.DataFrame(preproc.fit_transform(X_test), columns=preproc.get_feature_names_out())

# Models

In [19]:
#Evaluate model
def evaluate_model(model):
    start_time = time.time()

    model.fit(X_train_encoded, y_train)
    
    #Evaluation
    y_pred = model.predict(X_test_encoded)
    accuracy = accuracy_score(y_test, y_pred)
#     print(classification_report(y_test, y_pred))

    elapsed_time = time.time() - start_time

    return round(accuracy,3), elapsed_time

In [20]:
eval_dict = {}
mdl = []
acc = []
comp_time = []

#Models
log_reg = LogisticRegression(max_iter=500)
svc = SVC()
SGD_Classifier = SGDClassifier()
KN_Classifier = KNeighborsClassifier()
GB_Classifier = GradientBoostingClassifier()
XGB_classifier = XGBClassifier()
RF_Classifier = RandomForestClassifier()

# List of models (easier to read)
model_list = [log_reg, 
              svc, 
              SGD_Classifier, 
              KN_Classifier,
              GB_Classifier,
              XGB_classifier,
              RF_Classifier]

#Accuracy computation
for model in model_list:
    mdl.append(f"{model}")
    acc.append(evaluate_model(model)[0])
    comp_time.append(evaluate_model(model)[1])
eval_dict['Model'] = mdl
eval_dict['Accuracy'] = acc
eval_dict['Computation time'] = comp_time

result = pd.DataFrame(eval_dict).sort_values('Accuracy', ascending=False)
result

Unnamed: 0,Model,Accuracy,Computation time
4,GradientBoostingClassifier(),0.805,0.560703
5,"XGBClassifier(base_score=None, booster=None, c...",0.789,0.073885
0,LogisticRegression(max_iter=500),0.766,0.025697
1,SVC(),0.766,3.141744
2,SGDClassifier(),0.766,0.063265
6,RandomForestClassifier(),0.765,0.645278
3,KNeighborsClassifier(),0.734,0.140393


# Fit chosen model on X_train (not encoded)

In [21]:
X_train.head(2)

Unnamed: 0,department,geo,role,critical,average_montly_hours,salary,will_relocate,gender
7380,Operations,China,Level 1,,163,medium,1,F
8703,Human Resources,Australia,Level 2-4,,232,low,1,M


In [22]:
pipe = make_pipeline(preproc,GB_Classifier)
pipe.fit(X_train, y_train)

# Extract applicants from X_test based on a specific posting

In [23]:
job_posting

['department', 'geo', 'role', 'critical', 'average_montly_hours', 'salary']

In [24]:
# Check the applicants for a specifi job
query = "@X_test['department'] == 'Operations'      and\
         @X_test['geo']=='UK'                    and\
         @X_test['role']== 'Level 2-4'           and\
         @X_test['average_montly_hours'] > 200   and\
         @X_test['salary']=='low'"

X_app = X_test.query(query)
X_app

Unnamed: 0,department,geo,role,critical,average_montly_hours,salary,will_relocate,gender
10182,Operations,UK,Level 2-4,,269,low,1,F
11460,Operations,UK,Level 2-4,,289,low,0,M
5574,Operations,UK,Level 2-4,,215,low,1,M
10476,Operations,UK,Level 2-4,,261,low,0,F
14472,Operations,UK,Level 2-4,,221,low,1,F
10218,Operations,UK,Level 2-4,,306,low,0,F
12378,Operations,UK,Level 2-4,,268,low,1,F
11466,Operations,UK,Level 2-4,,250,low,1,M
14454,Operations,UK,Level 2-4,,259,low,0,M
6810,Operations,UK,Level 2-4,,225,low,1,F


In [25]:
y_app = y_test[X_app.index]
y_app;

# Prediction

In [26]:
X_app_index = X_app.index

In [27]:
# Predict class for X.
y_app_pred = pipe.predict(X_app)
y_app_pred

array([0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0])

In [28]:
# Predict class probabilities for X
y_app_pred_proba = pipe.predict_proba(X_app)
y_app_pred_proba;

In [29]:
prediction = pd.DataFrame(y_app_pred_proba, columns = ['prob_stay', 'prob_leave'], index=X_app_index) 
prediction;

In [30]:
X_app_final = pd.merge(X_app, prediction, left_index=True, right_index=True)
X_app_final.drop(columns=['prob_leave'], inplace=True)
X_app_final.sort_values('prob_stay', ascending=False, inplace=True)

In [31]:
X_app_final['prob_stay'].iloc[0]

0.9642091345847756

In [32]:
X_app_final.iloc[0]

department              Operations
geo                             UK
role                     Level 2-4
critical                       NaN
average_montly_hours           215
salary                         low
will_relocate                    1
gender                           M
prob_stay                 0.964209
Name: 5574, dtype: object

In [33]:
cols = ['department','geo', 'average_montly_hours', 'salary','will_relocate', 'gender']
X_app_final = X_app_final[cols]

In [34]:
X_app_final

Unnamed: 0,department,geo,average_montly_hours,salary,will_relocate,gender
5574,Operations,UK,215,low,1,M
14472,Operations,UK,221,low,1,F
7446,Operations,UK,237,low,0,M
6810,Operations,UK,225,low,1,F
11466,Operations,UK,250,low,1,M
14454,Operations,UK,259,low,0,M
10182,Operations,UK,269,low,1,F
12378,Operations,UK,268,low,1,F
10476,Operations,UK,261,low,0,F
11460,Operations,UK,289,low,0,M


In [35]:
# file_path = '../raw_data/dummy_hiring.json'

# # X_app_final.iloc[0].to_json(file_path)
# X_app_final.to_json(file_path)