# Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import os
import time

# Basics
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder

#Pipelines and transformers
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

# Models
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Useful

In [3]:
def check_shape_head(df : pd.DataFrame):
    assert isinstance(df, pd.DataFrame), f"{df} sould be a pandas dataframe"
    print(df.shape)
    return df.head(2)

# Retrieve data

In [4]:
!pwd

/home/thierry/code/Emroullier/hr-data-analytics/notebooks


In [5]:
# take environment variables from .env.
load_dotenv()

data_path = os.getenv("DATA_HR")
data_path = "../" + data_path

#Retrieve dataset from local directory
dataset = pd.read_csv(data_path)

pd.set_option('display.max_columns', 6)
check_shape_head(dataset)

(14999, 62)


Unnamed: 0,ID,Name,Department,...,Sensor_StepCount,Sensor_Heartbeat(Average/Min),Sensor_Proximity(1-highest/10-lowest)
0,1,BRADDY,Operations,...,1841,61,9
1,2,BORST,Sales,...,1990,90,8


# Data preparation

In [6]:
# Remove unwanted columns
dropped_columns = ['ID','Name','Rising_Star', 'Trending Perf', 'Talent_Level',
                     'Validated_Talent_Level', 'EMP_Sat_OnPrem_1', 'EMP_Sat_OnPrem_2',
                     'EMP_Sat_OnPrem_3','EMP_Sat_Remote_3', 'EMP_Sat_Remote_4','EMP_Sat_Remote_5',
                     'EMP_Engagement_2','EMP_Engagement_3','EMP_Engagement_4',
                     'EMP_Engagement_5','CSR Factor','sales']

dataset.drop(columns = dropped_columns, inplace = True)

In [7]:
# Rename columns
dataset = dataset.rename(columns={
                                    'Sensor_Heartbeat(Average/Min)': 'Sensor_Heartbeat',
                                    'Sensor_Proximity(1-highest/10-lowest)': 'Sensor_Proximity'
                                })

In [8]:
# Convert column names to lower snake case
dataset.columns = dataset.columns.str.lower().str.replace(' ', '_').str.replace('-', '_').str.replace('.', '_')

In [9]:
# combine all the men_leave and the women_leave column
dataset['leave'] = dataset['women_leave'].fillna(dataset['men_leave'])
dataset['leave'] = dataset['leave'].fillna(0)
dataset.drop(columns = ['women_leave', 'men_leave'], inplace = True)

In [10]:
# remove highly correlated features >= 0.7 or <= -0.7
dataset.drop(columns=['emp_sat_onprem_4','percent_remote',
                'emp_sat_remote_2','emp_sat_remote_1',
                'emp_engagement_1'], inplace = True)
dataset.shape

(14999, 38)

In [11]:
dataset['left_company'].iloc[470:485].value_counts()

left_company
1    8
0    7
Name: count, dtype: int64

# Define X and y

In [12]:
#Features
X = dataset.drop(columns=['left_company'])

#Target
y = dataset['left_company']

In [13]:
check_shape_head(X)

(14999, 37)


Unnamed: 0,department,geo,role,...,sensor_heartbeat,sensor_proximity,leave
0,Operations,US,VP,...,61,9,1.0
1,Sales,UK,Senior Director,...,90,8,0.0


# Train test split

In [14]:
# Create a train set and a test set
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3, 
                                                    random_state=10)

In [15]:
#Check shapes
print(f"X_train shape is : {X_train.shape}")
print(f"y_train shape is : {y_train.shape}\n")
print(f"X_test shape is : {X_test.shape}")
print(f"y_test shape is : {y_test.shape}")

X_train shape is : (10499, 37)
y_train shape is : (10499,)

X_test shape is : (4500, 37)
y_test shape is : (4500,)


# Pipeline

In [16]:
#Functions used in basic imputations
def imputer_critical(x):
    x = x.apply(lambda y: y.map({1 : 1, np.nan: 0}))
    return x

def imputer_gender(x):
    x = x.apply(lambda y: y.map({'F': 1, 'M': 0}))
    return x

# Preprocessor
simp_impute_scale_cols = ['emp_sat_onprem_5']
robust_scale_cols = ['time_spend_company']
ohe_scale_cols = ['department', 'geo', 'role']
ordinal_scale_cols = ['salary']
minmax_scale_cols = ['last_evaluation','number_project','average_montly_hours',
                     'linkedin_hits','sensor_stepcount','sensor_heartbeat']

preproc = make_column_transformer(
    # Basic imputations
    (FunctionTransformer(imputer_gender,feature_names_out ='one-to-one'), ['gender']),
    (FunctionTransformer(imputer_critical,feature_names_out ='one-to-one'), ['critical']),
    (SimpleImputer(strategy='most_frequent'), simp_impute_scale_cols),
    
    #Numerical preproc
    (MinMaxScaler(), minmax_scale_cols),
    (RobustScaler(), robust_scale_cols),
    
    #Categorical preproc
    (OneHotEncoder(sparse_output = False), ohe_scale_cols),
    (OrdinalEncoder(), ordinal_scale_cols),
    
    #Remaining columns pass
    remainder='passthrough',
    force_int_remainder_cols=False
)

preproc

In [17]:
# Output X_train_encoded
X_train_encoded = pd.DataFrame(preproc.fit_transform(X_train), columns=preproc.get_feature_names_out())

# Output X_test_encoded
X_test_encoded = pd.DataFrame(preproc.fit_transform(X_test), columns=preproc.get_feature_names_out())

# Models

In [18]:
#Evaluate model
def evaluate_model(model):
    start_time = time.time()

    model.fit(X_train_encoded, y_train)
    
    #Evaluation
    y_pred = model.predict(X_test_encoded)
    accuracy = accuracy_score(y_test, y_pred)
#     print(classification_report(y_test, y_pred))

    elapsed_time = time.time() - start_time

    return round(accuracy,3), elapsed_time

In [19]:
eval_dict = {}
mdl = []
acc = []
comp_time = []

#Models
log_reg = LogisticRegression(max_iter=500)
svc = SVC()
SGD_Classifier = SGDClassifier()
KN_Classifier = KNeighborsClassifier()
GB_Classifier = GradientBoostingClassifier()
XGB_classifier = XGBClassifier()
RF_Classifier = RandomForestClassifier()

# List of models (easier to read)
model_list = [log_reg, 
              svc, 
              SGD_Classifier, 
              KN_Classifier,
              GB_Classifier,
              XGB_classifier,
              RF_Classifier]

#Accuracy computation
for model in model_list:
    mdl.append(f"{model}")
    acc.append(evaluate_model(model)[0])
    comp_time.append(evaluate_model(model)[1])
eval_dict['Model'] = mdl
eval_dict['Accuracy'] = acc
eval_dict['Computation time'] = comp_time

result = pd.DataFrame(eval_dict).sort_values('Accuracy', ascending=False)
result

Unnamed: 0,Model,Accuracy,Computation time
5,"XGBClassifier(base_score=None, booster=None, c...",0.995,0.158171
4,GradientBoostingClassifier(),0.994,1.920837
6,RandomForestClassifier(),0.994,0.527653
1,SVC(),0.908,1.69121
0,LogisticRegression(max_iter=500),0.874,0.278047
3,KNeighborsClassifier(),0.853,0.163054
2,SGDClassifier(),0.852,0.115208


# Fit chosen model on X_train (not encoded)

In [20]:
X_train.head(2)

Unnamed: 0,department,geo,role,...,sensor_heartbeat,sensor_proximity,leave
7380,Operations,China,Level 1,...,73,9,1.0
8703,Human Resources,Australia,Level 2-4,...,80,5,0.0


In [21]:
pipe = make_pipeline(preproc,XGB_classifier)
pipe.fit(X_train, y_train)

# Prediction

In [31]:
X_test = dataset.iloc[470:485]
X_test

Unnamed: 0,department,geo,role,...,sensor_heartbeat,sensor_proximity,leave
470,Finance,China,Level 2-4,...,73,9,1.0
471,Human Resources,Turkey,Level 2-4,...,67,9,1.0
472,IT,France,Level 2-4,...,74,10,0.0
473,Warehouse,France,Level 2-4,...,86,9,1.0
474,Operations,France,Level 2-4,...,66,6,0.0
475,Sales,Korea,Level 2-4,...,65,7,0.0
476,Finance,Australia,Level 2-4,...,63,7,1.0
477,Human Resources,Japan,Level 2-4,...,60,7,0.0
478,IT,Colombia,Level 2-4,...,80,6,1.0
479,Warehouse,US,Level 2-4,...,77,7,1.0


In [23]:
X_test_index = X_test.index

In [24]:
# Predict class for X.
y_pred = pipe.predict(X_test)
y_pred;

In [25]:
# Predict class probabilities for X
y_pred_proba = pipe.predict_proba(X_test)
y_pred_proba;

In [26]:
prediction = pd.DataFrame(y_pred_proba, columns = ['prob_stay', 'prob_leave'], index=X_test_index) 
prediction;

In [27]:
X_test_final = pd.merge(X_test, prediction, left_index=True, right_index=True)
X_test_final.drop(columns=['prob_stay'], inplace=True)
X_test_final.sort_values('prob_leave', ascending=False, inplace=True)

In [28]:
X_test_final

Unnamed: 0,department,geo,role,...,sensor_proximity,leave,prob_leave
472,IT,France,Level 2-4,...,10,0.0,0.999249
475,Sales,Korea,Level 2-4,...,7,0.0,0.999148
477,Human Resources,Japan,Level 2-4,...,7,0.0,0.998958
471,Human Resources,Turkey,Level 2-4,...,9,1.0,0.998715
476,Finance,Australia,Level 2-4,...,7,1.0,0.997558
470,Finance,China,Level 2-4,...,9,1.0,0.997269
474,Operations,France,Level 2-4,...,6,0.0,0.996855
473,Warehouse,France,Level 2-4,...,9,1.0,0.888853
484,IT,China,Level 2-4,...,7,1.0,0.470062
483,Human Resources,Korea,Level 2-4,...,8,1.0,0.000616
