# Import libraries

In [1]:
# Load IPython's autoreload extension; mode 2 re-imports modified modules
# before each cell is executed, so edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from dotenv import load_dotenv
import os

# Basics
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder,StandardScaler

# Transformers
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import FunctionTransformer

# Retrieve data

In [3]:
# Load variables from the local .env file into the process environment.
load_dotenv()

# DATA_HR holds the dataset location; it is prefixed with '../' below.
data_path = os.getenv("DATA_HR")
if not data_path:
    # Fail early with an actionable message instead of the opaque
    # TypeError that `'../' + None` would raise when the variable is unset.
    raise RuntimeError(
        "Environment variable DATA_HR is not set (or is empty); "
        "define it in .env or in the environment before running this notebook."
    )
data_path = '../' + data_path

# Retrieve dataset from local directory
dataset = pd.read_csv(data_path)

# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)

# Data preparation

In [4]:
# Columns to discard before any further processing
dropped_columns = [
    'ID', 'Name', 'Rising_Star', 'Trending Perf',
    'Talent_Level', 'Validated_Talent_Level',
    'EMP_Sat_OnPrem_1', 'EMP_Sat_OnPrem_2', 'EMP_Sat_OnPrem_3',
    'EMP_Sat_Remote_3', 'EMP_Sat_Remote_4', 'EMP_Sat_Remote_5',
    'EMP_Engagement_2', 'EMP_Engagement_3', 'EMP_Engagement_4', 'EMP_Engagement_5',
    'CSR Factor', 'sales',
]

# Rebind `dataset` to the frame without those columns
dataset = dataset.drop(columns=dropped_columns)

In [5]:
# Rename the two sensor columns, removing their parenthesised suffixes
sensor_column_names = {
    'Sensor_Heartbeat(Average/Min)': 'Sensor_Heartbeat',
    'Sensor_Proximity(1-highest/10-lowest)': 'Sensor_Proximity',
}
dataset = dataset.rename(columns=sensor_column_names)

In [6]:
# Convert column names to lower snake case.
# regex=False makes each replacement a literal substitution, so the result
# does not depend on the version-specific default of `regex` ('.' is a
# wildcard when the pattern is interpreted as a regular expression).
dataset.columns = (
    dataset.columns
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
    .str.replace('.', '_', regex=False)
)

In [7]:
# Merge the women_leave and men_leave columns into a single `leave` column:
# take women_leave, fall back to men_leave where it is missing, and use 0
# where both are missing. The two source columns are then removed.
dataset['leave'] = (
    dataset['women_leave']
    .fillna(dataset['men_leave'])
    .fillna(0)
)
dataset = dataset.drop(columns=['women_leave', 'men_leave'])

In [8]:
# Drop the features flagged as highly correlated (correlation >= 0.7 or <= -0.7)
correlated_columns = [
    'emp_sat_onprem_4',
    'percent_remote',
    'emp_sat_remote_2',
    'emp_sat_remote_1',
    'emp_engagement_1',
]
dataset = dataset.drop(columns=correlated_columns)

# Define X and y

In [9]:
# Target: the left_company column
y = dataset['left_company']

# Features: every column except the target
X = dataset.drop(columns='left_company')

# Preprocessing

In [10]:
#Functions used in basic imputations
def imputer_critical(x):
    """Encode every column of the DataFrame `x` as a 0/1 flag.

    The value 1 is kept as 1 and missing values (NaN) become 0. Any other
    value is absent from the lookup and therefore comes back as NaN.
    """
    flag_lookup = {1 : 1, np.nan: 0}
    return x.apply(lambda column: column.map(flag_lookup))

def imputer_gender(x):
    """Encode every column of the DataFrame `x` as a binary flag.

    'F' is mapped to 1 and 'M' to 0. Any other value is absent from the
    lookup and therefore comes back as NaN.
    """
    gender_lookup = {'F': 1, 'M': 0}
    return x.apply(lambda column: column.map(gender_lookup))

# Preprocessor: column groups, one per transformation
simp_impute_scale_cols = ['emp_sat_onprem_5']
robust_scale_cols = ['time_spend_company']
ohe_scale_cols = ['department', 'geo', 'role']
ordinal_scale_cols = ['salary']
minmax_scale_cols = [
    'last_evaluation', 'number_project', 'average_montly_hours',
    'linkedin_hits', 'sensor_stepcount', 'sensor_heartbeat',
]

# (transformer, columns) pairs, applied in this order. The order also
# determines the order of the output columns and the generated step names.
column_steps = [
    # Basic imputations / binary encodings
    (FunctionTransformer(imputer_gender, feature_names_out='one-to-one'), ['gender']),
    (FunctionTransformer(imputer_critical, feature_names_out='one-to-one'), ['critical']),
    (SimpleImputer(strategy='most_frequent'), simp_impute_scale_cols),

    # Numerical scaling
    (MinMaxScaler(), minmax_scale_cols),
    (RobustScaler(), robust_scale_cols),

    # Categorical encoding
    (OneHotEncoder(sparse_output=False), ohe_scale_cols),
    # NOTE(review): with the default categories='auto', OrdinalEncoder assigns
    # codes following the sorted order of the observed values - confirm that
    # this matches the intended ranking of the salary levels.
    (OrdinalEncoder(), ordinal_scale_cols),
]

# Columns not listed in any group are passed through untouched
preproc = make_column_transformer(
    *column_steps,
    remainder='passthrough',
    force_int_remainder_cols=False,
)

# Display the assembled transformer
preproc

In [11]:
# Output X_encoded: fit the preprocessor on X and wrap the transformed array
# in a DataFrame labelled with the generated feature names.
# X's index is carried over so the encoded rows stay aligned with y even
# when X does not have a default 0..n-1 index.
X_encoded = pd.DataFrame(
    preproc.fit_transform(X),
    columns=preproc.get_feature_names_out(),
    index=X.index,
)