In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.decomposition import PCA

In [2]:
# sl: satisfaction_level --- False:MinMaxScaler; True:StandardScaler
# le: last_evaluation --- False:MinMaxScaler; True:StandardScaler
# npr: number_project --- False:MinMaxScaler; True:StandardScaler
# amh: average_monthly_hours --- False:MinMaxScaler; True:StandardScaler
# tsc: time_spend_company --- False:MinMaxScaler; True:StandardScaler
# wa: Work_accident --- False:MinMaxScaler; True:StandardScaler
# pl5: promotion_last_5years --- False:MinMaxScaler; True:StandardScaler
# dp: department --- False:LabelEncoder; True:OneHotEncoder
# slr: salary --- False:LabelEncoder; True:OneHotEncoder
def hr_preprocessing(sl=False,
                     le=False,
                     npr=False,
                     amh=False,
                     tsc=False,
                     wa=False,
                     pl5=False,
                     dp=False,
                     slr=False,
                     lower_d=False,
                     ld_n=1,
                     data_path='./data/HR.csv'):
    """Load the HR data set, clean it, and scale/encode every feature column.

    Parameters
    ----------
    sl, le, npr, amh, tsc, wa, pl5 : bool
        Scaler choice for 'satisfaction_level', 'last_evaluation',
        'number_project', 'average_monthly_hours', 'time_spend_company',
        'Work_accident' and 'promotion_last_5years' respectively.
        False -> MinMaxScaler, True -> StandardScaler.
    dp, slr : bool
        Encoding choice for 'department' and 'salary'.
        False -> integer codes rescaled with MinMaxScaler
        ('salary' uses ``map_salary``, 'department' uses LabelEncoder);
        True -> one-hot columns via ``pd.get_dummies``.
    lower_d : bool
        When True, the features are reduced with PCA before being returned.
    ld_n : int
        Number of PCA components kept when ``lower_d`` is True.
    data_path : str
        Location of the CSV file to read.

    Returns
    -------
    (features, label)
        ``label`` is the 'left' column of the cleaned data.  ``features`` is
        the processed DataFrame, or a NumPy array of PCA components when
        ``lower_d`` is True.
    """
    df = pd.read_csv(data_path)

    ## 1. clean the data
    #  drop rows with missing scores, out-of-range satisfaction or the bogus
    #  'nme' salary value
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Build ONE mask on the same frame: chaining two boolean indexers makes
    # pandas reindex the second mask and emit a warning.
    keep = (df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')
    df = df[keep].copy()

    ## 2. get the label
    label = df['left']
    df = df.drop('left', axis=1)

    ## 3. feature selection
    #  due to few features, we keep all the features

    ## 4. feature preprocessing
    numeric_choices = [('satisfaction_level', sl),
                       ('last_evaluation', le),
                       ('number_project', npr),
                       ('average_monthly_hours', amh),
                       ('time_spend_company', tsc),
                       ('Work_accident', wa),
                       ('promotion_last_5years', pl5)]
    for column, use_standard in numeric_choices:
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # The scalers expect a 2-D (n_samples, 1) array; flatten the result back.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    for column, one_hot in [('department', dp), ('salary', slr)]:
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # salary is ordinal, so it gets a hand-written ordering
            df[column] = [map_salary(s) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        # Return the label alongside the reduced features so that callers can
        # always unpack two values, exactly as in the non-reduced case.
        return PCA(n_components=ld_n).fit_transform(df.values), label
    return df, label

In [3]:
# Ordinal code assigned to each known salary band.
d = {'low': 0, 'medium': 1, 'high': 2}


def map_salary(s):
    """Return the ordinal code of salary band ``s``; unknown bands map to 0."""
    if s in d:
        return d[s]
    return 0

In [4]:
def hr_modeling(features, label, random_state=None):
    """Train and evaluate KNN classifiers, then save and reload the k=3 model.

    The data is split 60/20/20: 20% is held out first (printed under
    "Validation"), and a quarter of the remainder is held out next (printed
    under "X_test").  Two classifiers (k=3 and k=5) are fitted on the
    remaining 60%.  Accuracy, recall and F1 are printed for the k=3 model on
    all three subsets, and for the k=5 model on the "Validation" subset only.

    Parameters
    ----------
    features : DataFrame or 2-D array-like
        Feature matrix, one row per sample.
    label : Series or 1-D array-like
        Binary target aligned with ``features``.
    random_state : int or None
        Seed forwarded to both ``train_test_split`` calls.  The default
        (None) keeps the splits random, as before.

    Returns
    -------
    KNeighborsClassifier
        The k=3 model as reloaded from the file 'knn_clf'.
    """
    from sklearn.metrics import accuracy_score, f1_score, recall_score
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    try:
        import joblib
    except ImportError:
        # Old scikit-learn releases shipped joblib under sklearn.externals.
        from sklearn.externals import joblib

    def print_scores(y_true, y_pred):
        # One place for the three metrics that are reported for every subset.
        print ('ACC:', accuracy_score(y_true, y_pred))
        print ('REC:', recall_score(y_true, y_pred))
        print ('F-score:', f1_score(y_true, y_pred))

    # np.asarray accepts both pandas objects and plain arrays.
    f_v = np.asarray(features)
    l_v = np.asarray(label)
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    # 0.25 of the remaining 80% == 20% of the full data set.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)

    ## KNN
    knn_clf = KNeighborsClassifier(n_neighbors=3)
    knn_clf_n5 = KNeighborsClassifier(n_neighbors=5)
    knn_clf.fit(X_train, Y_train)
    knn_clf_n5.fit(X_train, Y_train)

    print ("Train:")
    print_scores(Y_train, knn_clf.predict(X_train))

    print ('\nValidation:')
    print_scores(Y_validation, knn_clf.predict(X_validation))
    # Same subset, k=5 model (printed without a header of its own).
    print_scores(Y_validation, knn_clf_n5.predict(X_validation))

    print ('\nX_test:')
    print_scores(Y_test, knn_clf.predict(X_test))

    ## save model
    joblib.dump(knn_clf, 'knn_clf')
    ## use model
    # NOTE: joblib.load unpickles the file; only load model files you trust.
    knn_clf = joblib.load('knn_clf')
    return knn_clf
    

In [5]:
def main():
    """Run the pipeline: preprocess with the default options, then fit and score."""
    hr_modeling(*hr_preprocessing())

In [6]:
# Entry point: run the pipeline only when this code is executed directly
# (as a script or as a notebook cell), not when it is imported as a module.
if __name__ == '__main__':
    main()



Train:
ACC: 0.9755528392043561
REC: 0.9673413063477461
F-score: 0.9502937189335742

Validation:
ACC: 0.9506666666666667
REC: 0.921090387374462
F-score: 0.8966480446927375
ACC: 0.9513333333333334
REC: 0.9053084648493543
F-score: 0.8963068181818182

X_test:
ACC: 0.9546666666666667
REC: 0.9385714285714286
F-score: 0.9062068965517243
