In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.decomposition import PCA

import os
# Append a hard-coded graphviz install directory to PATH before importing pydotplus.
# NOTE(review): this looks like a Homebrew Cellar path pinned to graphviz 2.40.1 and is
# presumably needed so the graphviz binaries can be located when rendering trees;
# it is machine- and version-specific — TODO confirm and make configurable.
os.environ['PATH'] += os.pathsep + '/usr/local/Cellar/graphviz/2.40.1/bin'
import pydotplus

In [2]:
def hr_preprocessing(sl=False,
                     le=False,
                     npr=False,
                     amh=False,
                     tsc=False,
                     wa=False,
                     pl5=False,
                     dp=False,
                     slr=False,
                     lower_d=False,
                     ld_n=1,
                     data_path='./data/HR.csv'):
    """Load the HR data set, clean it, and scale/encode every feature column.

    Numeric columns (flag False -> MinMaxScaler, True -> StandardScaler):
        sl:  satisfaction_level
        le:  last_evaluation
        npr: number_project
        amh: average_monthly_hours
        tsc: time_spend_company
        wa:  Work_accident
        pl5: promotion_last_5years

    Categorical columns (flag False -> integer codes rescaled with MinMaxScaler,
    True -> one-hot columns via ``pd.get_dummies``):
        dp:  department (integer codes come from LabelEncoder)
        slr: salary (integer codes come from ``map_salary``)

    Other parameters:
        lower_d:   if True, project the processed features with PCA.
        ld_n:      number of PCA components used when ``lower_d`` is True.
        data_path: CSV file to read; defaults to the original './data/HR.csv'.

    Returns:
        ``(features, label)`` where ``features`` is the processed DataFrame and
        ``label`` is the ``left`` column, when ``lower_d`` is False.
        NOTE(review): when ``lower_d`` is True only the PCA-transformed array is
        returned, without the label. This is kept as-is so existing callers see
        the same return shape, but it cannot be unpacked as ``features, label``.
    """
    df = pd.read_csv(data_path)

    ## 1. clean the data
    #  drop rows with missing values, then remove the outliers
    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])
    # Build both conditions on the same frame and combine them in one mask.
    # Chaining df[mask_a][mask_b] applies a mask computed on the unfiltered
    # frame to the filtered one, which pandas must reindex (and warns about).
    keep_rows = (df['satisfaction_level'] <= 1) & (df['salary'] != 'nme')
    df = df[keep_rows]

    ## 2. get the label
    label = df['left']
    df = df.drop('left', axis=1)

    ## 3. feature selection
    #  due to few features, we keep all the features

    ## 4. feature preprocessing
    numeric_flags = [sl, le, npr, amh, tsc, wa, pl5]
    numeric_columns = ['satisfaction_level', 'last_evaluation', 'number_project',
                       'average_monthly_hours', 'time_spend_company',
                       'Work_accident', 'promotion_last_5years']
    for use_standard, column in zip(numeric_flags, numeric_columns):
        scaler = StandardScaler() if use_standard else MinMaxScaler()
        # The scalers expect a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D before writing it into the column.
        df[column] = scaler.fit_transform(df[column].values.reshape(-1, 1)).ravel()

    categorical_flags = [dp, slr]
    categorical_columns = ['department', 'salary']
    for one_hot, column in zip(categorical_flags, categorical_columns):
        if one_hot:
            df = pd.get_dummies(df, columns=[column])
            continue
        if column == 'salary':
            # salary is mapped through map_salary rather than LabelEncoder
            df[column] = [map_salary(s) for s in df['salary'].values]
        else:
            df[column] = LabelEncoder().fit_transform(df[column])
        df[column] = MinMaxScaler().fit_transform(df[column].values.reshape(-1, 1)).ravel()

    if lower_d:
        return PCA(n_components=ld_n).fit_transform(df.values)
    return df, label

In [3]:
# Integer code assigned to each known salary band.
d = {'low': 0, 'medium': 1, 'high': 2}


def map_salary(s):
    """Return the integer code for salary band ``s``; unknown bands map to 0."""
    if s in d:
        return d[s]
    return 0

In [4]:
def hr_modeling(features, label, random_state=None):
    """Fit a fixed set of classifiers and print their scores on three data splits.

    The data is split 80/20 into a working set and a validation set, and the
    working set is split again 75/25 into train and test (60/20/20 overall).
    Every model is fitted on the train split and then scored on the splits in
    this order: 0 = train, 1 = validation, 2 = test. For each split the index
    is printed, followed by accuracy, recall and F1.

    Parameters:
        features: DataFrame of processed features (its ``.values`` are used).
        label: Series of binary labels (its ``.values`` are used).
        random_state: optional seed forwarded to both splits and to the
            tree-based / boosting models. The default ``None`` keeps the
            original unseeded behaviour; pass an int for reproducible runs.

    Returns:
        None; results are written to stdout.

    Notes:
        ``sklearn.externals.six`` and ``sklearn.externals.joblib`` no longer
        exist in current scikit-learn releases, so they are not imported here.
        To persist a fitted model use the standalone ``joblib`` package
        (``joblib.dump`` / ``joblib.load``). To draw a fitted decision tree,
        pass it to ``sklearn.tree.export_graphviz`` and render the result with
        ``pydotplus.graph_from_dot_data``.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, recall_score, f1_score
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import GaussianNB, BernoulliNB
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier

    f_v = features.values
    l_v = label.values
    X_tt, X_validation, Y_tt, Y_validation = train_test_split(
        f_v, l_v, test_size=0.2, random_state=random_state)
    X_train, X_test, Y_train, Y_test = train_test_split(
        X_tt, Y_tt, test_size=0.25, random_state=random_state)

    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=3)),
        ('GaussianNB', GaussianNB()),
        ('BernoulliNB', BernoulliNB()),
        ('DecesionTreeGini', DecisionTreeClassifier(random_state=random_state)),
        ('DecesionTreeEntropy', DecisionTreeClassifier(criterion='entropy',
                                                       random_state=random_state)),
        ('SVM Classifier', SVC(C=100)),
        ('RandomForest', RandomForestClassifier(max_features=None, bootstrap=False,
                                                random_state=random_state)),
        ('Adaboost', AdaBoostClassifier(n_estimators=100, random_state=random_state)),
    ]

    # The evaluation splits do not depend on the model, so build them once.
    xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]

    for clf_name, clf in models:
        clf.fit(X_train, Y_train)

        for i, (X_part, Y_part) in enumerate(xy_lst):
            Y_pred = clf.predict(X_part)

            print(i)
            print(clf_name, '-ACC:', accuracy_score(Y_part, Y_pred))
            print(clf_name, '-REC:', recall_score(Y_part, Y_pred))
            print(clf_name, '-F1:', f1_score(Y_part, Y_pred))
    

In [5]:
def main():
    """Run the pipeline end to end with the default preprocessing options."""
    # Preprocess with every flag at its default, then fit and score all models.
    X, y = hr_preprocessing()
    hr_modeling(X, y)

In [6]:
# Entry point: run the whole pipeline when this code is executed as the
# top-level module (__name__ == '__main__').
if __name__ == '__main__':
    main()



0
KNN -ACC: 0.9761084564951661
KNN -REC: 0.9632075471698113
KNN -F1: 0.9499883693882297
1
KNN -ACC: 0.9563333333333334
KNN -REC: 0.923728813559322
KNN -F1: 0.9089645587213343
2
KNN -ACC: 0.9566666666666667
KNN -REC: 0.9098250336473755
KNN -F1: 0.912280701754386
0
GaussianNB -ACC: 0.7877541949105457
GaussianNB -REC: 0.7410377358490566
GaussianNB -F1: 0.6219319081551862
1
GaussianNB -ACC: 0.7913333333333333
GaussianNB -REC: 0.7641242937853108
GaussianNB -F1: 0.6334894613583139
2
GaussianNB -ACC: 0.7903333333333333
GaussianNB -REC: 0.7617765814266487
GaussianNB -F1: 0.6428165814877911
0
BernoulliNB -ACC: 0.8413157017446383
BernoulliNB -REC: 0.46084905660377357
BernoulliNB -F1: 0.5777646363098758
1
BernoulliNB -ACC: 0.835
BernoulliNB -REC: 0.4519774011299435
BernoulliNB -F1: 0.5638766519823789
2
BernoulliNB -ACC: 0.8493333333333334
BernoulliNB -REC: 0.506056527590848
BernoulliNB -F1: 0.6245847176079734
0
DecesionTreeGini -ACC: 1.0
DecesionTreeGini -REC: 1.0
DecesionTreeGini -F1: 1.0
1
Dece



0
SVM Classifier -ACC: 0.9528836537393044
SVM Classifier -REC: 0.9070754716981132
SVM Classifier -F1: 0.9007025761124122
1
SVM Classifier -ACC: 0.955
SVM Classifier -REC: 0.903954802259887
SVM Classifier -F1: 0.9045936395759718
2
SVM Classifier -ACC: 0.9563333333333334
SVM Classifier -REC: 0.8909825033647375
SVM Classifier -F1: 0.9099656357388315




0
RandomForest -ACC: 1.0
RandomForest -REC: 1.0
RandomForest -F1: 1.0
1
RandomForest -ACC: 0.9763333333333334
RandomForest -REC: 0.9491525423728814
RandomForest -F1: 0.9498233215547702
2
RandomForest -ACC: 0.9786666666666667
RandomForest -REC: 0.9488559892328399
RandomForest -F1: 0.9565807327001358
0
Adaboost -ACC: 0.9654406045116124
Adaboost -REC: 0.9231132075471699
Adaboost -F1: 0.9263905325443788
1
Adaboost -ACC: 0.9596666666666667
Adaboost -REC: 0.9180790960451978
Adaboost -F1: 0.9148486980999296
2
Adaboost -ACC: 0.9626666666666667
Adaboost -REC: 0.8977119784656796
Adaboost -F1: 0.9225449515905946
