# 一、資料匯入及預處理

### 資料來源：https://www.kaggle.com/ludobenistant/hr-analytics

In [1]:
import pandas as pd
pd.set_option("display.max_columns",25) #設定pandas最多顯示出25個欄位資訊
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5") #15000筆
df.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
449,0.81,0.92,5,239,5,0,1,0,product_mng,low
6209,0.48,0.87,3,221,2,0,0,0,sales,medium
1171,0.11,0.91,6,302,4,0,1,0,accounting,low
1145,0.37,0.49,2,153,3,0,1,0,accounting,low
10172,0.53,0.64,5,281,4,0,0,0,sales,high


### One-hot Encoding

In [2]:
#針對工作職稱進行標籤化
df_job = pd.get_dummies(df['dept'])
df_job.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
5031,0,1,0,0,0,0,0,0,0,0
4528,0,0,0,0,0,0,0,1,0,0
11872,0,0,0,0,0,0,0,0,1,0
5236,0,0,0,0,0,0,0,0,1,0
12249,0,0,0,0,0,0,0,1,0,0


In [3]:
#將標籤化的dataframe與原先的資料進行結合
#使用index對準的結合方式
df_ml = pd.merge(df, df_job,left_index=True,right_index=True)
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
5313,0.55,0.58,5,262,3,0,0,0,support,medium,0,0,0,0,0,0,0,0,1,0
3117,0.75,0.96,3,138,2,0,0,0,IT,medium,1,0,0,0,0,0,0,0,0,0
14886,0.42,0.47,2,135,3,0,1,0,sales,low,0,0,0,0,0,0,0,1,0,0
10683,0.73,0.48,4,139,2,0,0,0,RandD,low,0,1,0,0,0,0,0,0,0,0
5158,0.86,0.96,4,167,3,0,0,0,support,high,0,0,0,0,0,0,0,0,1,0


### LabelEncoding

In [4]:
#將薪資改以整數形式代表
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical,salary_rank
1314,0.4,0.52,2,136,3,0,1,0,sales,medium,0,0,0,0,0,0,0,1,0,0,2
3349,0.93,0.67,5,237,4,0,0,0,product_mng,low,0,0,0,0,0,0,1,0,0,0,1
11785,0.53,0.91,4,167,4,0,0,0,management,high,0,0,0,0,1,0,0,0,0,0,3
14172,0.9,0.89,4,254,7,0,0,0,product_mng,low,0,0,0,0,0,0,1,0,0,0,1
7475,0.75,0.57,3,158,2,1,0,0,RandD,low,0,1,0,0,0,0,0,0,0,0,1


In [5]:
df_ml = df_ml.dropna().reset_index(drop=True)

In [6]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)
print(X_train[:5])
print(y_train[:5])


      satisfaction_level  last_evaluation  number_project  \
7341                0.95             0.65               3   
6328                0.60             0.76               5   
7041                0.59             0.82               4   
6449                0.88             0.99               3   
4616                0.65             0.53               5   

      average_montly_hours  time_spend_company  Work_accident  \
7341                   155                   2              1   
6328                   269                   2              0   
7041                   203                   4              1   
6449                   190                   5              0   
4616                   205                   3              1   

      promotion_last_5years  IT  RandD  accounting  hr  management  marketing  \
7341                      0   0      0           0   0           0          0   
6328                      0   1      0           0   0           0          0   

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
print(X_train_std[:5])

[[ 1.36455661 -0.39212331 -0.65609195 -0.91202756 -1.03092802  2.43141606
  -0.1439019  -0.3003978  -0.23759207 -0.22919714 -0.22804387 -0.21062969
  -0.25030346 -0.24814784  1.61951948 -0.4166499  -0.47021052  0.64174241]
 [-0.04266884  0.25018474  0.97215751  1.36340449 -1.03092802 -0.41128296
  -0.1439019   3.32891921 -0.23759207 -0.22919714 -0.22804387 -0.21062969
  -0.25030346 -0.24814784 -0.6174671  -0.4166499  -0.47021052  0.64174241]
 [-0.08287528  0.60053458  0.15803278  0.04604909  0.33587502  2.43141606
  -0.1439019  -0.3003978  -0.23759207 -0.22919714 -0.22804387 -0.21062969
  -0.25030346 -0.24814784  1.61951948 -0.4166499  -0.47021052 -0.9335799 ]
 [ 1.08311152  1.59319246 -0.65609195 -0.21343     1.01927654 -0.41128296
  -0.1439019  -0.3003978  -0.23759207 -0.22919714 -0.22804387 -0.21062969
  -0.25030346 -0.24814784 -0.6174671  -0.4166499   2.12670701  2.21706471]
 [ 0.15836337 -1.09282299  0.97215751  0.08596895 -0.3475265   2.43141606
  -0.1439019  -0.3003978  -0.23759

# 二、單一分類器

### 決策分類樹

In [8]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      3448
           1       0.96      0.93      0.95      1052

    accuracy                           0.97      4500
   macro avg       0.97      0.96      0.96      4500
weighted avg       0.97      0.97      0.97      4500



### KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      3448
           1       0.92      0.91      0.92      1052

    accuracy                           0.96      4500
   macro avg       0.95      0.94      0.94      4500
weighted avg       0.96      0.96      0.96      4500



### SVC

In [10]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train_std, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test_std)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3448
           1       0.89      0.89      0.89      1052

    accuracy                           0.95      4500
   macro avg       0.93      0.93      0.93      4500
weighted avg       0.95      0.95      0.95      4500



# AdaBoost

In [11]:
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(n_estimators=100)
adb.fit(X_train, y_train)
print(metrics.classification_report(y_test, adb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      3448
           1       0.91      0.91      0.91      1052

    accuracy                           0.96      4500
   macro avg       0.94      0.94      0.94      4500
weighted avg       0.96      0.96      0.96      4500

