# 1. 資料匯入

In [14]:
import pandas as pd
pd.set_option("display.max_columns",10) #設定pandas最多顯示出10個欄位資訊
df = pd.read_csv("HR_comma_sep.csv",encoding = "big5") #15000筆
df.head()
# 資料來源：https://www.kaggle.com/ludobenistant/hr-analytics¶

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,dept,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


# 2.預處理

In [15]:
#missing data
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum()/df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
salary,0,0.0
dept,0,0.0
promotion_last_5years,0,0.0
left,0,0.0
Work_accident,0,0.0
time_spend_company,0,0.0
average_montly_hours,0,0.0
number_project,0,0.0
last_evaluation,0,0.0
satisfaction_level,0,0.0


## One-hot Encoding

In [16]:
df_dept = pd.get_dummies(df['dept'])  # 哪一個部門工作 
df_dept.sample(5)

Unnamed: 0,IT,RandD,accounting,hr,management,marketing,product_mng,sales,support,technical
6445,0,0,0,0,0,0,0,0,0,1
4029,1,0,0,0,0,0,0,0,0,0
7717,0,0,0,0,0,0,0,1,0,0
14164,0,0,0,0,0,0,0,1,0,0
6434,0,0,0,0,0,0,0,1,0,0


In [17]:
df_ml = pd.merge(df, df_dept,left_index=True,right_index=True)  # 合併
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,...,marketing,product_mng,sales,support,technical
2610,0.78,0.5,4,212,2,...,0,0,1,0,0
12388,0.45,0.5,2,157,3,...,1,0,0,0,0
12383,0.38,0.51,2,159,3,...,0,0,1,0,0
8786,0.98,0.62,3,140,4,...,1,0,0,0,0
12807,0.2,0.97,4,237,5,...,0,0,0,0,0


### LabelEncoding

In [18]:
df_ml['salary_rank'] = df_ml['salary'].replace({'low':1,'medium':2,'high':3})    # 將薪水低, 中, 高轉換成數字1, 2, 3
df_ml.sample(5)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,...,product_mng,sales,support,technical,salary_rank
345,0.1,0.86,6,278,4,...,0,0,0,1,3
3117,0.75,0.96,3,138,2,...,0,0,0,0,2
8539,0.73,0.83,4,241,3,...,0,0,0,0,2
5864,0.8,0.87,4,217,3,...,0,0,0,0,2
8219,0.9,0.83,3,273,4,...,0,0,1,0,2


In [19]:
from sklearn.model_selection import train_test_split

X = df_ml[['satisfaction_level','last_evaluation','number_project','average_montly_hours','time_spend_company','Work_accident','promotion_last_5years','IT','RandD','accounting','hr','management','marketing','product_mng','sales','support','technical','salary_rank']]
y = df_ml['left']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

In [20]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# 2. 單一分類器

### 2.1 單純貝式分析

In [21]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print(metrics.classification_report(y_test, gnb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.90      0.73      0.80      3449
          1       0.45      0.74      0.56      1051

avg / total       0.80      0.73      0.75      4500



### 2.2 Decision tree

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

tree = DecisionTreeClassifier(criterion='gini',max_depth=5)
tree.fit(X_train, y_train)
print(metrics.classification_report(y_test, tree.predict(X_test)))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98      3449
          1       0.97      0.92      0.95      1051

avg / total       0.98      0.98      0.98      4500



### 2.3 LogisticRegression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

lr = LogisticRegression()
lr.fit(X_train, y_train)
print(metrics.classification_report(y_test, lr.predict(X_test)))

             precision    recall  f1-score   support

          0       0.83      0.92      0.87      3449
          1       0.59      0.39      0.47      1051

avg / total       0.77      0.79      0.78      4500



### 2.4 KNN

In [24]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)
print(metrics.classification_report(y_test, knn.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      0.98      0.97      3449
          1       0.92      0.90      0.91      1051

avg / total       0.96      0.96      0.96      4500



### 2.5 SVC

In [25]:
from sklearn.svm import SVC

svc = SVC(C=1.0, kernel="rbf")
svc.fit(X_train, y_train)
print(metrics.classification_report(y_test, svc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      0.97      0.97      3449
          1       0.89      0.89      0.89      1051

avg / total       0.95      0.95      0.95      4500



# 3. VotingClassifier

In [26]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

clf1 = DecisionTreeClassifier(max_depth=5)
clf2 = KNeighborsClassifier(n_neighbors=2)
clf3 = SVC(kernel='rbf',probability=True) #probability要設成True(同時計算每個分類的機率)，classification_report才能work

eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),('svc', clf3)], voting='soft', weights=[2, 1, 1])  # weights可以調整
eclf.fit(X_train, y_train)
print(metrics.classification_report(y_test, eclf.predict(X_test)))

             precision    recall  f1-score   support

          0       0.98      0.99      0.98      3449
          1       0.97      0.93      0.95      1051

avg / total       0.97      0.98      0.97      4500



  if diff:


# 4. Bagging

In [27]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100)
bagc.fit(X_train, y_train)
print(metrics.classification_report(y_test, bagc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      3449
          1       0.99      0.97      0.98      1051

avg / total       0.99      0.99      0.99      4500



#### OOB

In [28]:
from sklearn.ensemble import BaggingClassifier

bagc = BaggingClassifier(n_estimators=100, oob_score=True)   # 100顆tree
bagc.fit(X,y)
print("oob_score(accuary):",bagc.oob_score_)

oob_score(accuary): 0.99093272884859


# 5. 隨機森林(Random Forest)

In [29]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=100,  criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X_train,y_train)
print(metrics.classification_report(y_test, rfc.predict(X_test)))

             precision    recall  f1-score   support

          0       0.99      1.00      1.00      3449
          1       1.00      0.97      0.98      1051

avg / total       0.99      0.99      0.99      4500



#### OOB

In [30]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, criterion='gini', max_features='auto', oob_score=True)
rfc.fit(X,y)
print("oob_score(accuary):",rfc.oob_score_)

oob_score(accuary): 0.9923994932995533


# 6. AdaBoost

In [31]:
from sklearn.ensemble import AdaBoostClassifier
adb = AdaBoostClassifier(n_estimators=100)
adb.fit(X_train, y_train)
print(metrics.classification_report(y_test, adb.predict(X_test)))

             precision    recall  f1-score   support

          0       0.97      0.98      0.97      3449
          1       0.93      0.90      0.92      1051

avg / total       0.96      0.96      0.96      4500



## 7. Stacking

需要安裝 mlxtend: 
請安裝套件 pip install mlxtend

website: http://rasbt.github.io/mlxtend/

StackingClassifier: http://rasbt.github.io/mlxtend/user_guide/classifier/StackingClassifier/

StackingRegressor: http://rasbt.github.io/mlxtend/user_guide/regressor/StackingRegressor/

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier()
clf3 = GaussianNB()
clf4 = LogisticRegression()
meta_clf = SVC()
stacking_clf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4], meta_classifier=meta_clf)

clf1.fit(X_train, y_train)
clf2.fit(X_train, y_train)
clf3.fit(X_train, y_train)
clf4.fit(X_train, y_train)
stacking_clf.fit(X_train, y_train)

print('KNN Score:',clf1.score(X_test, y_test))
print('RF Score:',clf2.score(X_test, y_test))
print('GNB Score:',clf3.score(X_test, y_test))
print('Logistic Score:',clf4.score(X_test, y_test))
print('Stacking Score:',stacking_clf.score(X_test, y_test))

KNN Score: 0.9435555555555556
RF Score: 0.9893333333333333
GNB Score: 0.73
Logistic Score: 0.7928888888888889
Stacking Score: 0.9866666666666667


## 8. XGBoost

需要安裝XGBoost:
請安裝套件 conda install -c anaconda py-xgboost

In [33]:

import xgboost as xgb
xgbc = xgb.XGBClassifier()   # 若是迴歸問題, 則是xgbr = xgb.XGBRegressor()
xgbc.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [34]:
xgbc.score(X_test, y_test)

  if diff:


0.9771111111111112