# 一、資料匯入及預處理

### 資料來源：https://www.kaggle.com/ludobenistant/hr-analytics

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25) #設定pandas最多顯示出25個欄位資訊
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5") #15000筆
df.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
8721,0.62,0.65,4,212,3,1,0,0,sales,medium
4679,0.69,0.98,3,214,2,0,0,0,sales,low
5632,0.3,0.48,2,104,2,0,0,0,technical,high
349,0.4,0.46,2,149,3,0,1,0,technical,medium
12464,0.82,0.98,5,234,5,0,1,0,marketing,medium


### One-hot Encoding

In [2]:
#針對工作職稱進行標籤化
df_job = pd.get_dummies(df['dept'])
df_job.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
8709,0,0,0,0,0,1,0,0,0,0
1896,1,0,0,0,0,0,0,0,0,0
3051,0,1,0,0,0,0,0,0,0,0
9287,0,0,0,0,0,0,0,0,1,0
8439,0,0,0,0,0,0,0,0,0,1


In [3]:
#將標籤化的dataframe與原先的資料進行結合
#使用index對準的結合方式
df_ml = pd.merge(df, df_job,left_index=True,right_index=True)
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
14349,0.74,1.0,4,249,5,0,1,0,IT,low,1,0,0,0,0,0,0,0,0,0
2574,0.26,0.67,2,242,6,0,0,0,technical,low,0,0,0,0,0,0,0,0,0,1
2372,0.71,0.74,3,163,3,1,0,0,marketing,medium,0,0,0,0,0,1,0,0,0,0
14787,0.48,0.78,2,198,2,0,1,0,technical,medium,0,0,0,0,0,0,0,0,0,1
3459,0.63,0.62,5,212,6,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0


### LabelEncoding

In [4]:
#將薪資改以整數形式代表
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
5427,0.68,0.71,5,135,4,1,0,0,marketing,medium,0,0,0,0,0,1,0,0,0,0,2
5361,0.56,0.4,2,255,3,0,0,0,sales,medium,0,0,0,0,0,0,0,1,0,0,2
1568,0.41,0.47,2,145,3,0,1,0,technical,low,0,0,0,0,0,0,0,0,0,1,1
4493,0.68,0.52,5,203,3,0,0,0,IT,low,1,0,0,0,0,0,0,0,0,0,1
6486,0.51,0.83,3,136,3,0,0,0,marketing,high,0,0,0,0,0,1,0,0,0,0,3


In [5]:
df_ml = df_ml.dropna().reset_index(drop=True)

In [6]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
print(X_train[:5])
print(y_train[:5])


       satisfaction_level  last_evaluation  number_project  \
14390                0.11             0.85               6   
6959                 0.79             0.84               4   
14885                0.45             0.53               2   
4635                 0.91             0.81               3   
1408                 0.39             0.53               2   

       average_montly_hours  time_spend_company  Work_accident  \
14390                   308                   5              0   
6959                    171                   3              0   
14885                   159                   3              0   
4635                    220                   3              1   
1408                    136                   3              0   

       promotion_last_5years  IT  RandD  accounting  hr  management  \
14390                      0   0      0           0   0           0   
6959                       0   0      0           0   0           0   
14885            

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print(X_train_std[:5])

[[-2.0031442   0.79405041  1.78351488  2.13525254  1.03004625 -0.41507437
  -0.14458998 -0.29317855 -0.23353795 -0.23308406 -0.22688591 -0.20864908
  -0.24728156 -0.2552085   1.60878059 -0.41586244 -0.47172004  0.63488578]
 [ 0.71378773  0.73543718  0.15694063 -0.60142902 -0.34156331 -0.41507437
  -0.14458998 -0.29317855 -0.23353795 -0.23308406 -0.22688591 -0.20864908
  -0.24728156 -0.2552085   1.60878059 -0.41586244 -0.47172004 -0.93239986]
 [-0.64467823 -1.08157288 -1.46963361 -0.84113836 -0.34156331 -0.41507437
  -0.14458998 -0.29317855 -0.23353795 -0.23308406 -0.22688591 -0.20864908
  -0.24728156 -0.2552085   1.60878059 -0.41586244 -0.47172004 -0.93239986]
 [ 1.19324631  0.5595975  -0.65634649  0.37738409 -0.34156331  2.4092068
  -0.14458998 -0.29317855 -0.23353795 -0.23308406 -0.22688591 -0.20864908
  -0.24728156 -0.2552085  -0.6215888   2.40464129 -0.47172004 -0.93239986]
 [-0.88440752 -1.08157288 -1.46963361 -1.30058125 -0.34156331 -0.41507437
  -0.14458998 -0.29317855 -0.233537

# 二、單一分類器

### 決策分類樹

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3423
           1       0.97      0.93      0.95      1077

    accuracy                           0.98      4500
   macro avg       0.97      0.96      0.97      4500
weighted avg       0.98      0.98      0.98      4500



### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      3423
           1       0.93      0.92      0.93      1077

    accuracy                           0.96      4500
   macro avg       0.95      0.95      0.95      4500
weighted avg       0.96      0.96      0.96      4500



### SVC

In [10]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3423
           1       0.90      0.90      0.90      1077

    accuracy                           0.95      4500
   macro avg       0.93      0.94      0.93      4500
weighted avg       0.95      0.95      0.95      4500



# Bagging

### 註: 基於決策樹的學習演算法可以不用標準化沒關係，而且訓練速度通常較快

#### OOB

In [11]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100, oob_score=True)
bagc.fit(X,y)
print("oob_score(accuary):",bagc.oob_score_)

oob_score(accuary): 0.9904660310687379


In [12]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100)
bagc.fit(X_train, y_train)
print(metrics.classification_report(y_test, bagc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3423
           1       0.99      0.97      0.98      1077

    accuracy                           0.99      4500
   macro avg       0.99      0.98      0.99      4500
weighted avg       0.99      0.99      0.99      4500

