In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.decomposition import PCA

In [2]:
# sl: satisfaction_level --- False:MinMaxScaler; True:StandardScaler
# le: last_evaluation --- False:MinMaxScaler; True:StandardScaler
# npr: number_project --- False:MinMaxScaler; True:StandardScaler
# amh: average_monthly_hours --- False:MinMaxScaler; True:StandardScaler
# tsc: time_spend_company --- False:MinMaxScaler; True:StandardScaler
# wa: Work_accident --- False:MinMaxScaler; True:StandardScaler
# pl5: promotion_last_5years --- False:MinMaxScaler; True:StandardScaler
# dp: department --- False:LabelEncoder; True:OneHotEncoder
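# slr: salary --- False:LabelEncoder; True:OneHotEncoder
# lower_d: dimensionality reduction --- False: return (features, label); True: return only the PCA-reduced feature array
# ld_n: number of PCA components kept when lower_d is True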
def hr_preprocessing(sl=False,
                     le=False,
                     npr=False,
                     amh=False,
                     tsc=False,
                     wa=False,
                     pl5=False,
                     dp=False,
                     slr=False,
                     lower_d=False,
                     ld_n=1,
                     data_path='./data/HR.csv'):
    """Load the HR data set, clean it and scale/encode every feature column.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler choice for satisfaction_level, last_evaluation, number_project,
        average_monthly_hours, time_spend_company, Work_accident and
        promotion_last_5years respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding choice for department and salary.
        False -> integer codes (LabelEncoder for department, ``map_salary``
        for salary) followed by MinMaxScaler; True -> one-hot columns via
        ``pd.get_dummies``.
    lower_d : bool
        When True, project the processed features with PCA.
    ld_n : int
        Number of PCA components kept when ``lower_d`` is True.
    data_path : str
        Location of the CSV file to read.

    Returns
    -------
    (DataFrame, Series)
        The processed feature frame and the ``left`` label column, when
        ``lower_d`` is False.
    ndarray
        Only the PCA-transformed feature matrix when ``lower_d`` is True.
        NOTE: the label is not returned on this path, so callers that unpack
        two values must keep ``lower_d=False``.
    """
    df = pd.read_csv(data_path)

    ## 1. clean the data
    #  drop rows with missing scores, then remove the outliers
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Build one mask on the same index instead of chaining df[a][b]: chaining
    # applies a mask computed on the unfiltered frame to the filtered one and
    # makes pandas reindex the key (with a UserWarning).
    keep = (df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')
    df = df.loc[keep]

    ## 2. get the label
    label = df['left']
    df = df.drop('left', axis=1)

    ## 3. feature selection
    #  due to few features, we keep all the features

    ## 4. feature preprocessing
    numeric_choices = [
        ('satisfaction_level', sl),
        ('last_evaluation', le),
        ('number_project', npr),
        ('average_monthly_hours', amh),
        ('time_spend_company', tsc),
        ('Work_accident', wa),
        ('promotion_last_5years', pl5),
    ]
    for column, standardize in numeric_choices:
        scaler = StandardScaler() if standardize else MinMaxScaler()
        # scalers expect a 2-D (n_samples, 1) input; flatten the result back
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    for column, one_hot in [('department', dp), ('salary', slr)]:
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # salary is ordinal, so use the explicit low/medium/high mapping
            df[column] = [map_salary(s) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # PCA is used here; LinearDiscriminantAnalysis would additionally need the label.
        return PCA(n_components=ld_n).fit_transform(df.values)
    return df, label

In [3]:
# Ordinal code for each salary band, ordered from lowest to highest.
d = {'low': 0, 'medium': 1, 'high': 2}


def map_salary(s):
    """Return the ordinal code of salary band ``s``; unknown bands map to 0."""
    if s in d:
        return d[s]
    return 0

In [4]:
def hr_modeling(features, label, random_state=None):
    """Fit several baseline classifiers and print their scores per data split.

    The data is split 60/20/20: 20% is held out as the validation part, and
    the remaining 80% is split 75/25 into the train and test parts. Each model
    is fitted on the train part and scored on all three parts.

    Parameters
    ----------
    features : DataFrame or array-like
        Feature matrix. Objects exposing ``.values`` (DataFrame) are unwrapped,
        anything else is used as given, so a plain ndarray is accepted too.
    label : Series or array-like
        Binary target aligned with ``features``.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls. The default ``None``
        keeps the original unseeded behaviour; pass an int for reproducible
        metrics.

    Prints, for every model and split, the split index (0 = train,
    1 = validation, 2 = test) followed by accuracy, recall and F1.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, recall_score, f1_score
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB, BernoulliNB

    f_v = getattr(features, 'values', features)
    l_v = getattr(label, 'values', label)

    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)

    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=3)),
        ('GaussianNB', GaussianNB()),
        ('BernoulliNB', BernoulliNB()),
    ]

    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)

        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_pred = clf.predict(X_part)

            print (i)
            print (clf_name, '-ACC:',accuracy_score(Y_part, Y_pred))
            print (clf_name, '-REC:',recall_score(Y_part, Y_pred))
            print (clf_name, '-F1:',f1_score(Y_part, Y_pred))

    # To persist a fitted model, use the standalone joblib package
    # (`sklearn.externals.joblib` is not available in current scikit-learn):
    #     import joblib
    #     joblib.dump(clf, 'knn_clf')     # save model
    #     clf = joblib.load('knn_clf')    # use model
    

In [5]:
def main():
    """Run the pipeline end to end: preprocess with default options, then model."""
    prepared_features, target = hr_preprocessing()
    hr_modeling(prepared_features, target)

In [6]:
# Entry point: run the full preprocessing + modeling pipeline when this code is
# executed directly (the recorded output below comes from this call).
if __name__ == '__main__':
    main()



0
KNN -ACC: 0.975330592288032
KNN -REC: 0.9626477541371158
KNN -F1: 0.9482999534233814
1
KNN -ACC: 0.9483333333333334
KNN -REC: 0.9230769230769231
KNN -F1: 0.8913805185704275
2
KNN -ACC: 0.956
KNN -REC: 0.9361147327249022
KNN -F1: 0.9158163265306122
0
GaussianNB -ACC: 0.8045338370930103
GaussianNB -REC: 0.7252955082742317
GaussianNB -F1: 0.6355914646778538
1
GaussianNB -ACC: 0.8033333333333333
GaussianNB -REC: 0.7343976777939042
GaussianNB -F1: 0.6317103620474406
2
GaussianNB -ACC: 0.7926666666666666
GaussianNB -REC: 0.7131681877444589
GaussianNB -F1: 0.6375291375291374
0
BernoulliNB -ACC: 0.845427269696633
BernoulliNB -REC: 0.4723404255319149
BernoulliNB -F1: 0.5895544408380053
1
BernoulliNB -ACC: 0.844
BernoulliNB -REC: 0.46879535558780844
BernoulliNB -F1: 0.5798922800718134
2
BernoulliNB -ACC: 0.8276666666666667
BernoulliNB -REC: 0.46153846153846156
BernoulliNB -F1: 0.5779591836734694
