# 一、資料匯入及預處理

### 資料來源：https://www.kaggle.com/ludobenistant/hr-analytics

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25) #設定pandas最多顯示出25個欄位資訊
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5") #15000筆
df.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
7173,1.0,0.61,5,264,3,0,0,0,RandD,medium
7913,0.84,0.98,4,134,5,1,0,0,support,medium
13860,0.91,0.77,3,195,7,0,0,0,sales,medium
3491,0.83,0.99,4,226,2,0,0,0,technical,low
9837,0.86,0.9,4,162,3,0,0,0,product_mng,low


### One-hot Encoding

In [2]:
#針對工作職稱進行標籤化
df_job = pd.get_dummies(df['dept'])
df_job.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
6468,0,0,0,0,0,0,0,0,0,1
12575,0,0,0,0,0,0,0,0,0,1
9392,0,0,0,0,1,0,0,0,0,0
11085,1,0,0,0,0,0,0,0,0,0
10768,0,0,0,0,0,1,0,0,0,0


In [3]:
#將標籤化的dataframe與原先的資料進行結合
#使用index對準的結合方式
df_ml = pd.merge(df, df_job,left_index=True,right_index=True)
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
7510,0.96,0.72,4,228,2,0,0,0,accounting,high,0,0,1,0,0,0,0,0,0,0
3148,0.99,0.61,3,167,3,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0
2449,0.91,0.7,3,132,4,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0
9848,0.71,0.57,3,207,3,0,0,0,marketing,medium,0,0,0,0,0,1,0,0,0,0
10626,0.16,0.54,5,206,5,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0


### LabelEncoding

In [4]:
#將薪資改以整數形式代表
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
127,0.7,0.89,3,183,5,0,1,0,support,low,0,0,0,0,0,0,0,0,1,0,1
4409,0.86,0.66,4,191,2,0,0,0,technical,low,0,0,0,0,0,0,0,0,0,1,1
12620,0.8,0.75,3,268,2,0,1,0,sales,medium,0,0,0,0,0,0,0,1,0,0,2
14519,0.41,0.48,2,145,3,0,1,0,accounting,medium,0,0,1,0,0,0,0,0,0,0,2
3512,0.52,0.92,3,214,3,0,0,0,accounting,low,0,0,1,0,0,0,0,0,0,0,1


In [5]:
df_ml = df_ml.dropna().reset_index(drop=True)

In [7]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
X_train.head()


Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
6842,0.76,0.84,5,249,3,0,0,0,0,0,0,0,0,0,0,1,0,1
4193,0.81,0.9,4,165,3,0,0,0,1,0,0,0,0,0,0,0,0,2
13183,0.89,0.74,5,260,6,0,0,0,0,0,0,0,0,0,0,0,1,2
1398,0.39,0.55,2,156,3,0,0,0,0,0,0,0,0,0,1,0,0,3
762,0.41,0.47,2,131,3,0,0,0,0,0,0,0,1,0,0,0,0,2


In [8]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print(X_train_std[:5])

[[ 0.59802365  0.7225669   0.96152491  0.95307949 -0.34556696 -0.41207404
  -0.14424632 -0.29699294 -0.23982098 -0.23285685 -0.23194624 -0.20889751
  -0.24814784 -0.25329773 -0.61231897  2.39376862 -0.46960633 -0.9299731 ]
 [ 0.79840568  1.07400903  0.15333901 -0.72622934 -0.34556696 -0.41207404
  -0.14424632 -0.29699294  4.16977701 -0.23285685 -0.23194624 -0.20889751
  -0.24814784 -0.25329773 -0.61231897 -0.41775132 -0.46960633  0.64103053]
 [ 1.11901693  0.13683001  0.96152491  1.17298898  1.71702972 -0.41207404
  -0.14424632 -0.29699294 -0.23982098 -0.23285685 -0.23194624 -0.20889751
  -0.24814784 -0.25329773 -0.61231897 -0.41775132  2.12944317  0.64103053]
 [-0.88480338 -0.97607008 -1.46303278 -0.90615528 -0.34556696 -0.41207404
  -0.14424632 -0.29699294 -0.23982098 -0.23285685 -0.23194624 -0.20889751
  -0.24814784 -0.25329773  1.63313575 -0.41775132 -0.46960633  2.21203417]
 [-0.80465056 -1.44465959 -1.46303278 -1.40594958 -0.34556696 -0.41207404
  -0.14424632 -0.29699294 -0.23982

# 二、單一分類器

### 決策分類樹

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3450
           1       0.96      0.93      0.95      1050

    accuracy                           0.98      4500
   macro avg       0.97      0.96      0.97      4500
weighted avg       0.98      0.98      0.98      4500



### KNN

In [10]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      3450
           1       0.91      0.92      0.92      1050

    accuracy                           0.96      4500
   macro avg       0.94      0.95      0.95      4500
weighted avg       0.96      0.96      0.96      4500



### SVC

In [11]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.97      0.96      0.97      3450
           1       0.88      0.90      0.89      1050

    accuracy                           0.95      4500
   macro avg       0.93      0.93      0.93      4500
weighted avg       0.95      0.95      0.95      4500



# VotingClassifier

In [12]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True) #probability要設成True(同時計算每個分類的機率)，classification_report才能work

eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),('svc', clf3)], voting='soft', weights=[3, 1, 1])
eclf.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3450
           1       0.96      0.93      0.95      1050

    accuracy                           0.98      4500
   macro avg       0.97      0.96      0.97      4500
weighted avg       0.98      0.98      0.98      4500

