# Import libraries

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import os
import time

# Basics
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder

#Pipelines and transformers
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

# Models
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, classification_report,precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Utility functions

In [3]:
def check_shape_head(df : pd.DataFrame):
    """Print the shape of a dataframe and return its first two rows.

    Args:
        df: The dataframe to inspect.

    Returns:
        The first two rows of ``df`` (``df.head(2)``).

    Raises:
        AssertionError: If ``df`` is not a pandas DataFrame.
    """
    # Report only the offending type: interpolating the object itself could
    # dump an arbitrarily large repr into the error message.
    assert isinstance(df, pd.DataFrame), (
        f"expected a pandas DataFrame, got {type(df).__name__}"
    )
    print(df.shape)
    return df.head(2)

# Retrieve data

In [4]:
!pwd

/home/thierry/code/Emroullier/hr-data-analytics/notebooks


In [5]:
# Take environment variables from .env.
load_dotenv()

# DATA_HR is expected to hold the dataset path; fail with an explicit message
# when it is missing instead of a cryptic TypeError on the concatenation below.
data_path = os.getenv("DATA_HR")
if data_path is None:
    raise RuntimeError("Environment variable DATA_HR is not set; define it in .env")
# The path is resolved from the parent of the current working directory.
data_path = '../' + data_path

#Retrieve dataset from local directory
dataset = pd.read_csv(data_path)

# Show every column when displaying dataframes
pd.set_option('display.max_columns', None)
check_shape_head(dataset)

(14999, 62)


Unnamed: 0,ID,Name,Department,GEO,Role,Rising_Star,Will_Relocate,Critical,Trending Perf,Talent_Level,Validated_Talent_Level,Percent_Remote,EMP_Sat_OnPrem_1,EMP_Sat_OnPrem_2,EMP_Sat_OnPrem_3,EMP_Sat_OnPrem_4,EMP_Sat_OnPrem_5,EMP_Sat_Remote_1,EMP_Sat_Remote_2,EMP_Sat_Remote_3,EMP_Sat_Remote_4,EMP_Sat_Remote_5,EMP_Engagement_1,EMP_Engagement_2,EMP_Engagement_3,EMP_Engagement_4,EMP_Engagement_5,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left_Company,CSR Factor,promotion_last_5years,sales,salary,Gender,LinkedIn_Hits,Emp_Work_Status2,Emp_Work_Status_3,Emp_Work_Status_4,Emp_Work_Status_5,Emp_Identity,Emp_Role,Emp_Position,Emp_Title,Women_Leave,Men_Leave,Emp_Competitive_1,Emp_Competitive_2,Emp_Competitive_3,Emp_Competitive_4,Emp_Competitive_5,Emp_Collaborative_1,Emp_Collaborative_2,Emp_Collaborative_3,Emp_Collaborative_4,Emp_Collaborative_5,Sensor_StepCount,Sensor_Heartbeat(Average/Min),Sensor_Proximity(1-highest/10-lowest)
0,1,BRADDY,Operations,US,VP,,0,1.0,3,6,6,0.4,3.0,3.0,4.0,3.0,5.0,2,3,2,3,2,5,5,4,4,3,0.36,3,168,3,1,0,,0,sales,low,M,5,4,3,1,1,1,1,2,1,,1.0,2,4,2,2,2,2,2,2,1,5,1841,61,9
1,2,BORST,Sales,UK,Senior Director,,0,,3,6,6,0.4,3.0,3.0,4.0,3.0,5.0,2,3,2,3,2,5,5,4,4,3,0.36,5,159,2,0,0,,0,accounting,low,F,58,3,3,4,3,2,1,2,1,0.0,,2,2,1,4,1,3,2,5,1,5,1990,90,8


# Data preparation

In [6]:
# Columns excluded from the analysis
dropped_columns = [
    'ID', 'Name', 'Rising_Star', 'Trending Perf', 'Talent_Level',
    'Validated_Talent_Level',
    'EMP_Sat_OnPrem_1', 'EMP_Sat_OnPrem_2', 'EMP_Sat_OnPrem_3',
    'EMP_Sat_Remote_3', 'EMP_Sat_Remote_4', 'EMP_Sat_Remote_5',
    'EMP_Engagement_2', 'EMP_Engagement_3', 'EMP_Engagement_4', 'EMP_Engagement_5',
    'CSR Factor', 'sales',
]

# Remove them from the dataframe in place
dataset.drop(labels=dropped_columns, axis='columns', inplace=True)

In [7]:
# Shorten the sensor column names by removing the parenthesised suffixes
sensor_renames = {
    'Sensor_Heartbeat(Average/Min)': 'Sensor_Heartbeat',
    'Sensor_Proximity(1-highest/10-lowest)': 'Sensor_Proximity',
}
dataset = dataset.rename(columns=sensor_renames)

In [8]:
# Convert column names to lower snake case.
# regex=False is passed explicitly: the default of `regex` differs between
# pandas versions, and as a regular expression '.' would match every character.
dataset.columns = (
    dataset.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

In [9]:
# Merge women_leave and men_leave into a single 'leave' column:
# take women_leave, fall back to men_leave, and use 0 when both are missing
leave_combined = dataset['women_leave'].fillna(dataset['men_leave'])
dataset['leave'] = leave_combined.fillna(0)

# The two source columns are no longer needed
dataset.drop(labels=['women_leave', 'men_leave'], axis='columns', inplace=True)

In [10]:
# Drop the features flagged as highly correlated (correlation >= 0.7 or <= -0.7)
correlated_columns = [
    'emp_sat_onprem_4',
    'percent_remote',
    'emp_sat_remote_2',
    'emp_sat_remote_1',
    'emp_engagement_1',
]
dataset.drop(labels=correlated_columns, axis='columns', inplace=True)
dataset.shape

(14999, 38)

In [11]:
# List the columns that remain after the clean-up steps above
dataset.columns


Index(['department', 'geo', 'role', 'will_relocate', 'critical',
       'emp_sat_onprem_5', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident',
       'left_company', 'promotion_last_5years', 'salary', 'gender',
       'linkedin_hits', 'emp_work_status2', 'emp_work_status_3',
       'emp_work_status_4', 'emp_work_status_5', 'emp_identity', 'emp_role',
       'emp_position', 'emp_title', 'emp_competitive_1', 'emp_competitive_2',
       'emp_competitive_3', 'emp_competitive_4', 'emp_competitive_5',
       'emp_collaborative_1', 'emp_collaborative_2', 'emp_collaborative_3',
       'emp_collaborative_4', 'emp_collaborative_5', 'sensor_stepcount',
       'sensor_heartbeat', 'sensor_proximity', 'leave'],
      dtype='object')

In [12]:
# Columns kept from the job-posting side
job_posting = [
    'department',
    'geo',
    'role',
    'average_montly_hours',
    'salary',
]

# Columns kept from the applicant side
applicant = ['will_relocate', 'gender']

# Keep only the selected features plus the 'left_company' column
selected_columns = [*job_posting, *applicant, 'left_company']
dataset = dataset[selected_columns]

# Define X and y

In [13]:
# Target: the 'left_company' column
y = dataset['left_company']

# Features: every other column
X = dataset.drop('left_company', axis=1)

In [14]:
# Sanity check on the feature matrix: prints its shape and shows the first two rows
check_shape_head(X)

(14999, 7)


Unnamed: 0,department,geo,role,average_montly_hours,salary,will_relocate,gender
0,Operations,US,VP,168,low,0,M
1,Sales,UK,Senior Director,159,low,0,F


# Pipeline

In [15]:
#Functions used in basic imputations
def imputer_critical(x):
    """Recode every column of ``x`` with the mapping ``{1: 1, NaN: 0}``.

    Args:
        x: A dataframe whose columns are mapped one by one.

    Returns:
        A dataframe of the same layout where 1 stays 1 and missing values
        become 0; values absent from the mapping come out as NaN
        (``Series.map`` behaviour).
    """
    recode = {1 : 1, np.nan: 0}

    def _recode_column(column):
        return column.map(recode)

    return x.apply(_recode_column)

def imputer_gender(x):
    """Encode every column of ``x`` with the mapping ``{'F': 1, 'M': 0}``.

    Args:
        x: A dataframe whose columns are mapped one by one.

    Returns:
        A dataframe of the same layout with 'F' as 1 and 'M' as 0; any other
        value comes out as NaN (``Series.map`` behaviour).
    """
    codes = {'F': 1, 'M': 0}

    def _encode_column(column):
        return column.map(codes)

    return x.apply(_encode_column)

# Preprocessor

# Column groups, one per preprocessing strategy
ohe_scale_cols = ['department', 'geo', 'role']
ordinal_scale_cols = ['salary']
minmax_scale_cols = ['average_montly_hours']

# NOTE(review): OrdinalEncoder() with its default categories='auto' orders the
# levels by their sorted unique values, which for strings is alphabetical and
# not necessarily the salary rank. Pass an explicit `categories=[[...]]` list
# if the rank should be meaningful - TODO confirm the salary levels.
preproc = make_column_transformer(
    # Encode gender as 0/1
    (FunctionTransformer(imputer_gender, feature_names_out='one-to-one'), ['gender']),

    # Numerical preproc
    (MinMaxScaler(), minmax_scale_cols),

    # Categorical preproc
    # handle_unknown='ignore': a category that appears only at predict time is
    # encoded as all zeros instead of raising an error.
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ohe_scale_cols),
    (OrdinalEncoder(), ordinal_scale_cols),

    # Remaining columns pass
    remainder='passthrough',
    force_int_remainder_cols=False
)

# Train test split

In [16]:
# Hold out 30% of the rows as a test set; the seed is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=10
)

In [17]:
# Report the shapes of the four splits (blank line between train and test)
print(
    f"X_train shape is : {X_train.shape}",
    f"y_train shape is : {y_train.shape}\n",
    f"X_test shape is : {X_test.shape}",
    f"y_test shape is : {y_test.shape}",
    sep="\n",
)

X_train shape is : (10499, 7)
y_train shape is : (10499,)

X_test shape is : (4500, 7)
y_test shape is : (4500,)


# Models

In [18]:
# #Evaluate model
# def evaluate_model(model):
#     start_time = time.time()
    
#     # Pipeline
#     pipeline = make_pipeline(preproc, model)
    
#     # Train Pipeline
#     pipeline.fit(X_train,y_train)
    
#     #Evaluation
#     y_pred = pipeline.predict(X_test)
#     accuracy = accuracy_score(y_test, y_pred)
# #     print(classification_report(y_test, y_pred))

#     elapsed_time = time.time() - start_time

#     return round(accuracy,3), elapsed_time

## Accuracy

In [19]:
#Evaluate model
def evaluate_model(model):
    """Fit ``preproc`` + ``model`` on the train set and score accuracy on the test set.

    Relies on the notebook-level ``preproc``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Args:
        model: The estimator placed after ``preproc`` in the pipeline.

    Returns:
        tuple: (accuracy rounded to 3 decimals, elapsed seconds covering
        fitting, prediction and scoring).
    """
    t0 = time.time()

    # Build and train the full pipeline in one go
    fitted = make_pipeline(preproc, model).fit(X_train, y_train)

    # Score the predictions made on the held-out set
    predictions = fitted.predict(X_test)
    score = accuracy_score(y_test, predictions)

    duration = time.time() - t0
    return round(score, 3), duration

In [20]:
#Accuracy
eval_dict = {}
mdl = []
acc = []
comp_time = []

#Models
log_reg = LogisticRegression(max_iter=500)
svc = SVC()
SGD_Classifier = SGDClassifier()
KN_Classifier = KNeighborsClassifier()
GB_Classifier = GradientBoostingClassifier()
XGB_classifier = XGBClassifier()

# List of models (easier to read)
model_list = [log_reg,
              svc,
              SGD_Classifier,
              KN_Classifier,
              GB_Classifier,
              XGB_classifier]

#Accuracy computation
# Evaluate each model exactly once so that the score and the timing describe
# the same fit. Calling evaluate_model twice per model trains everything twice
# and pairs a score with the timing of a different run.
for model in model_list:
    score, elapsed = evaluate_model(model)
    mdl.append(f"{model}")
    acc.append(score)
    comp_time.append(elapsed)
eval_dict['Model'] = mdl
eval_dict['Accuracy'] = acc
eval_dict['Computation time'] = comp_time

# Best accuracy first
result = pd.DataFrame(eval_dict).sort_values('Accuracy', ascending=False)
result

Unnamed: 0,Model,Accuracy,Computation time
4,GradientBoostingClassifier(),0.801,0.557195
5,"XGBClassifier(base_score=None, booster=None, c...",0.784,0.081634
0,LogisticRegression(max_iter=500),0.766,0.100613
1,SVC(),0.766,3.005934
2,SGDClassifier(),0.766,0.074126
3,KNeighborsClassifier(),0.735,0.080625


## Precision

In [21]:
#Evaluate model
def evaluate_model(model):
    """Fit ``preproc`` + ``model`` on the train set and score precision on the test set.

    Relies on the notebook-level ``preproc``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``. ``precision_score`` is called with
    ``zero_division=np.nan``.

    Args:
        model: The estimator placed after ``preproc`` in the pipeline.

    Returns:
        tuple: (precision rounded to 3 decimals, elapsed seconds covering
        fitting, prediction and scoring).
    """
    t0 = time.time()

    # Build and train the full pipeline in one go
    fitted = make_pipeline(preproc, model).fit(X_train, y_train)

    # Score the predictions made on the held-out set
    predictions = fitted.predict(X_test)
    score = precision_score(y_test, predictions, zero_division=np.nan)

    duration = time.time() - t0
    return round(score, 3), duration

In [22]:
# Precision
eval_dict = {}
mdl = []
acc = []  # holds the precision scores (name kept from the accuracy cell)
comp_time = []

#Models
log_reg = LogisticRegression(max_iter=500)
svc = SVC()
SGD_Classifier = SGDClassifier()
KN_Classifier = KNeighborsClassifier()
GB_Classifier = GradientBoostingClassifier()
XGB_classifier = XGBClassifier()

# List of models (easier to read)
model_list = [log_reg,
              svc,
              SGD_Classifier,
              KN_Classifier,
              GB_Classifier,
              XGB_classifier]

#Precision computation
# Evaluate each model exactly once so that the score and the timing describe
# the same fit. Calling evaluate_model twice per model trains everything twice
# and pairs a score with the timing of a different run.
for model in model_list:
    score, elapsed = evaluate_model(model)
    mdl.append(f"{model}")
    acc.append(score)
    comp_time.append(elapsed)
eval_dict['Model'] = mdl
eval_dict['Precision'] = acc
eval_dict['Computation time'] = comp_time

# Best precision first
result = pd.DataFrame(eval_dict).sort_values('Precision', ascending=False)
result

Unnamed: 0,Model,Precision,Computation time
4,GradientBoostingClassifier(),0.725,0.567879
5,"XGBClassifier(base_score=None, booster=None, c...",0.565,0.074569
3,KNeighborsClassifier(),0.339,0.076962
0,LogisticRegression(max_iter=500),,0.050209
1,SVC(),,2.997084
2,SGDClassifier(),,0.090186


## F1 score

In [23]:
#Evaluate model
def evaluate_model(model):
    """Fit ``preproc`` + ``model`` on the train set and compute the F1 score on the test set.

    Relies on the notebook-level ``preproc``, ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``.

    Args:
        model: The estimator placed after ``preproc`` in the pipeline.

    Returns:
        tuple: (F1 score rounded to 3 decimals, elapsed seconds covering
        fitting, prediction and scoring).
    """
    t0 = time.time()

    # Build and train the full pipeline in one go
    fitted = make_pipeline(preproc, model).fit(X_train, y_train)

    # Score the predictions made on the held-out set
    predictions = fitted.predict(X_test)
    score = f1_score(y_test, predictions)

    duration = time.time() - t0
    return round(score, 3), duration

In [24]:
#F1 score
eval_dict = {}
mdl = []
acc = []  # holds the F1 scores (name kept from the accuracy cell)
comp_time = []

#Models
log_reg = LogisticRegression(max_iter=500)
svc = SVC()
SGD_Classifier = SGDClassifier()
KN_Classifier = KNeighborsClassifier()
GB_Classifier = GradientBoostingClassifier()
XGB_classifier = XGBClassifier()

# List of models (easier to read)
model_list = [log_reg,
              svc,
              SGD_Classifier,
              KN_Classifier,
              GB_Classifier,
              XGB_classifier]

#F1 score computation
# Evaluate each model exactly once so that the score and the timing describe
# the same fit. Calling evaluate_model twice per model trains everything twice
# and pairs a score with the timing of a different run.
for model in model_list:
    score, elapsed = evaluate_model(model)
    mdl.append(f"{model}")
    acc.append(score)
    comp_time.append(elapsed)
eval_dict['Model'] = mdl
eval_dict['F1_score'] = acc
eval_dict['Computation time'] = comp_time

# Best F1 score first
result = pd.DataFrame(eval_dict).sort_values('F1_score', ascending=False)
result

Unnamed: 0,Model,F1_score,Computation time
5,"XGBClassifier(base_score=None, booster=None, c...",0.419,0.078545
4,GradientBoostingClassifier(),0.356,0.59442
3,KNeighborsClassifier(),0.202,0.079092
0,LogisticRegression(max_iter=500),0.0,0.033538
1,SVC(),0.0,3.0929
2,SGDClassifier(),0.0,0.100971
