In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.decomposition import PCA

In [2]:
# sl: satisfaction_level --- False:MinMaxScaler; True:StandardScaler
# le: last_evaluation --- False:MinMaxScaler; True:StandardScaler
# npr: number_project --- False:MinMaxScaler; True:StandardScaler
# amh: average_monthly_hours --- False:MinMaxScaler; True:StandardScaler
# tsc: time_spend_company --- False:MinMaxScaler; True:StandardScaler
# wa: Work_accident --- False:MinMaxScaler; True:StandardScaler
# pl5: promotion_last_5years --- False:MinMaxScaler; True:StandardScaler
# dp: department --- False:LabelEncoder; True:OneHotEncoder
# slr: salary --- False:LabelEncoder; True:OneHotEncoder
def hr_preprocessing(sl=False,
                     le=False,
                     npr=False,
                     amh=False,
                     tsc=False,
                     wa=False,
                     pl5=False,
                     dp=False,
                     slr=False,
                     lower_d=False,
                     ld_n=1):
    """Load ./data/HR.csv, clean it, and scale/encode every feature column.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler choice for satisfaction_level, last_evaluation, number_project,
        average_monthly_hours, time_spend_company, Work_accident and
        promotion_last_5years respectively.
        Falsy -> MinMaxScaler, truthy -> StandardScaler.
    dp, slr : bool
        Encoding choice for department and salary respectively.
        Falsy -> integer codes (LabelEncoder for department, map_salary for
        salary) followed by MinMaxScaler; truthy -> pd.get_dummies columns.
    lower_d : bool
        When truthy, reduce the processed features with PCA.
    ld_n : int
        Number of PCA components; only used when lower_d is truthy.

    Returns
    -------
    (DataFrame, Series)
        Processed features and the 'left' label column, when lower_d is falsy.
    ndarray
        PCA-transformed feature matrix only, when lower_d is truthy.
    """
    df = pd.read_csv('./data/HR.csv')

    ## 1. clean the data
    #  drop rows with missing values in the two columns below, then filter rows
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Build ONE mask on the current frame. Chaining df[mask_a][mask_b] would
    # apply a mask computed on the unfiltered frame to the filtered one, which
    # makes pandas reindex the mask and emit a warning.
    keep_rows = (df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')
    df = df[keep_rows]

    ## 2. get the label
    label = df['left']
    df = df.drop('left', axis=1)

    ## 3. feature selection
    #  due to few features, we keep all the features

    ## 4. feature preprocessing
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project',
                       'average_monthly_hours', 'time_spend_company',
                       'Work_accident', 'promotion_last_5years']
    numeric_flags = [sl, le, npr, amh, tsc, wa, pl5]
    for column, use_standard in zip(numeric_columns, numeric_flags):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # scalers expect a 2-D (n, 1) input; flatten the result back to 1-D
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    for column, one_hot in zip(['department', 'salary'], [dp, slr]):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # map_salary gives the explicit low/medium/high integer codes
            df[column] = [map_salary(s) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        # bring the integer codes into the [0, 1] range
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # NOTE(review): this branch returns only the reduced features, not the
        # label; kept as-is so existing callers are not broken.
        return PCA(n_components=ld_n).fit_transform(df.values)
    return df, label

In [3]:
# Integer code for each known salary level.
d = {'low': 0, 'medium': 1, 'high': 2}


def map_salary(s):
    """Return the integer code for salary level `s`; unknown levels map to 0."""
    if s in d:
        return d[s]
    return 0

In [4]:
def main():
    """Run hr_preprocessing with a sample configuration and print what it returns."""
    result = hr_preprocessing(
        sl=True,
        le=True,
        dp=True,
        lower_d=False,
        ld_n=3,
    )
    print(result)

In [5]:
# Call main() only when this code runs with the module name '__main__'.
if __name__ == '__main__':
    main()

[[ 1.46585949 -0.10918614  0.75463537]
 [-1.14419756  0.06419447  0.77153387]
 [ 0.65458118  2.19319331  0.7815528 ]
 ...
 [ 1.4824322  -0.06961459 -0.33758008]
 [ 0.31051382  2.51475274 -0.29018928]
 [ 1.52116434 -0.10415884 -0.33820685]]


