# 一、資料匯入及預處理

### 資料來源：https://www.kaggle.com/ludobenistant/hr-analytics

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25) #設定pandas最多顯示出25個欄位資訊
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5") #15000筆
df.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
1585,0.73,0.93,5,229,5,0,1,0,technical,medium
11314,0.66,0.59,4,179,3,0,0,0,technical,low
12073,0.09,0.87,7,295,4,0,1,0,product_mng,low
7604,0.73,0.52,2,113,5,1,0,0,support,medium
11365,0.75,0.59,5,149,4,0,0,0,sales,medium


### One-hot Encoding

In [2]:
#針對工作職稱進行標籤化
df_job = pd.get_dummies(df['dept'])
df_job.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
9031,0,0,0,0,0,0,0,1,0,0
7141,0,0,0,0,0,0,0,0,0,1
6427,0,0,0,0,0,0,0,1,0,0
3679,0,0,0,0,0,0,0,1,0,0
3944,0,0,0,0,0,0,0,0,1,0


In [3]:
#將標籤化的dataframe與原先的資料進行結合
#使用index對準的結合方式
df_ml = pd.merge(df, df_job,left_index=True,right_index=True)
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
2100,0.56,0.94,4,215,2,0,0,0,technical,high,0,0,0,0,0,0,0,0,0,1
3484,0.8,0.97,5,271,4,0,0,0,support,low,0,0,0,0,0,0,0,0,1,0
9706,0.51,0.63,3,234,2,0,0,0,sales,high,0,0,0,0,0,0,0,1,0,0
5230,0.81,0.62,3,187,3,0,0,0,technical,low,0,0,0,0,0,0,0,0,0,1
6981,0.79,0.89,3,252,2,0,0,0,hr,medium,0,0,0,1,0,0,0,0,0,0


### LabelEncoding

In [4]:
#將薪資改以整數形式代表
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
13282,0.97,0.5,3,216,3,0,0,0,IT,low,1,0,0,0,0,0,0,0,0,0,1
12040,0.87,0.88,5,269,5,0,1,0,technical,low,0,0,0,0,0,0,0,0,0,1,1
5782,0.85,0.77,5,263,3,0,0,0,support,medium,0,0,0,0,0,0,0,0,1,0,2
12067,0.9,0.98,4,264,6,0,1,0,product_mng,medium,0,0,0,0,0,0,1,0,0,0,2
2332,0.96,0.68,4,162,2,0,0,0,technical,medium,0,0,0,0,0,0,0,0,0,1,2


In [5]:
df_ml = df_ml.dropna().reset_index(drop=True)

In [6]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
print(X_train[:5])
print(y_train[:5])


       satisfaction_level  last_evaluation  number_project  \
7023                 0.52             0.61               5   
10657                0.81             0.86               4   
5157                 0.19             0.92               5   
13756                0.58             0.79               3   
7570                 0.79             0.94               4   

       average_montly_hours  time_spend_company  Work_accident  \
7023                    162                   3              0   
10657                   213                   3              0   
5157                    193                   6              0   
13756                   243                   3              1   
7570                    216                   4              0   

       promotion_last_5years  IT  RandD  accounting  hr  management  \
7023                       0   0      0           0   0           0   
10657                      0   0      0           0   0           0   
5157             

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print(X_train_std[:5])

[[-0.37584744 -0.61708917  0.97040937 -0.77257448 -0.33822883 -0.40636376
  -0.14697495 -0.30002076 -0.23647143 -0.23126141 -0.22595611 -0.20840041
   4.11667445 -0.25478482 -0.61952758 -0.41475897 -0.47277592  2.19579899]
 [ 0.79332046  0.84347279  0.16078381  0.25322773 -0.33822883 -0.40636376
  -0.14697495 -0.30002076 -0.23647143 -0.23126141 -0.22595611 -0.20840041
  -0.24291452 -0.25478482 -0.61952758  2.41103886 -0.47277592 -0.93670914]
 [-1.70627989  1.19400766  0.97040937 -0.14904764  1.70967458 -0.40636376
  -0.14697495 -0.30002076 -0.23647143 -0.23126141 -0.22595611 -0.20840041
  -0.24291452 -0.25478482 -0.61952758 -0.41475897  2.11516694  0.62954493]
 [-0.13395064  0.43451544 -0.64884174  0.8566408  -0.33822883  2.46084936
  -0.14697495 -0.30002076 -0.23647143 -0.23126141 -0.22595611 -0.20840041
  -0.24291452 -0.25478482  1.61413315 -0.41475897 -0.47277592  0.62954493]
 [ 0.71268819  1.31085262  0.16078381  0.31356904  0.34440564 -0.40636376
  -0.14697495 -0.30002076 -0.23647

# 二、單一分類器

### 決策分類樹

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3415
           1       0.96      0.93      0.94      1085

    accuracy                           0.97      4500
   macro avg       0.97      0.96      0.96      4500
weighted avg       0.97      0.97      0.97      4500



### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      3415
           1       0.93      0.90      0.91      1085

    accuracy                           0.96      4500
   macro avg       0.95      0.94      0.94      4500
weighted avg       0.96      0.96      0.96      4500



### SVC

In [10]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3415
           1       0.90      0.90      0.90      1085

    accuracy                           0.95      4500
   macro avg       0.94      0.94      0.94      4500
weighted avg       0.95      0.95      0.95      4500



# 隨機森林(Random Forest)

#### OOB

In [11]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X,y)
print("oob_score(accuary):",rfc.oob_score_)

oob_score(accuary): 0.9924661644109607


In [12]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X,y)
print(metrics.classification_report(y_test, rfc.predict(X_test)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3415
           1       1.00      1.00      1.00      1085

    accuracy                           1.00      4500
   macro avg       1.00      1.00      1.00      4500
weighted avg       1.00      1.00      1.00      4500

