## [作業重點]
了解如何使用 Sklearn 中的 hyper-parameter search 找出最佳的超參數

### 作業
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [17]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

import numpy as np
from sklearn import datasets, metrics
digits = datasets.load_digits()
#digits

x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.2, random_state=1234)

#gbdt = GradientBoostingClassifier(learning_rate=0.2, n_estimators=100, max_depth=3, random_state=0)
gbdt = GradientBoostingClassifier()
gbdt.fit(x_train, y_train)
y_pred = gbdt.predict(x_test)
#print(y_pred)

acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print("Misclassified sample: %d" % (y_test != y_pred).sum())

#print("Feature importance: ", gbdt.feature_importances_)

y_pred_ave = gbdt.predict_proba(x_test)
y_pred_ave = np.argmax(y_pred_ave, axis=1)
confmat = metrics.confusion_matrix(y_test, y_pred_ave)
print('target names: ' + str(digits.target_names))
print('confusion matrix:')
print(confmat)

#查看當前超參數的內容:
gbdt.get_params()

Acuuracy:  0.9722222222222222
Misclassified sample: 10
target names: [0 1 2 3 4 5 6 7 8 9]
confusion matrix:
[[28  0  0  0  0  0  0  0  0  0]
 [ 0 40  0  0  0  0  0  0  0  0]
 [ 0  0 37  0  0  0  0  0  1  0]
 [ 0  0  0 35  0  0  0  0  1  0]
 [ 0  0  0  0 42  1  0  1  0  0]
 [ 0  0  0  0  0 41  0  0  0  0]
 [ 0  0  0  0  0  0 32  0  1  0]
 [ 0  0  0  0  1  0  0 33  0  0]
 [ 0  1  0  0  0  1  0  0 30  0]
 [ 0  0  0  0  0  0  0  1  1 32]]


{'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'deviance',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'presort': 'auto',
 'random_state': None,
 'subsample': 1.0,
 'verbose': 0,
 'warm_start': False}

### 設定要訓練的超參數組合

In [21]:
 param_grid={
 'learning_rate': [0.1, 0.2, 0.3],
 'max_depth': [3,4,5],
 #'max_features': None,
 #'min_samples_leaf': [2, 3],
 #'min_samples_split': [2, 3, 4],
 'n_estimators': [100, 200, 300],
 }


## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(estimator=gbdt, param_grid=param_grid, scoring="neg_mean_squared_error", cv=3, n_jobs=-1, verbose=2)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 預設會跑 3-fold cross-validadtion，總共 4200 種參數組合，總共要 train  12600次模型

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  81 out of  81 | elapsed:  2.1min finished


In [22]:
# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best Accuracy: -0.794711 using {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}


In [23]:
# 使用最佳參數重新建立模型
clf_bestparam = GradientBoostingClassifier(learning_rate=grid_result.best_params_['learning_rate'],
                                          max_depth=grid_result.best_params_['max_depth'],
                                          n_estimators=grid_result.best_params_['n_estimators'])

# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = clf_bestparam.predict(x_test)

#print(y_pred)

acc = metrics.accuracy_score(y_test, y_pred)
print("Acuuracy: ", acc)
print("Misclassified sample: %d" % (y_test != y_pred).sum())

#print("Feature importance: ", gbdt.feature_importances_)

y_pred_ave = clf_bestparam.predict_proba(x_test)
y_pred_ave = np.argmax(y_pred_ave, axis=1)
confmat = metrics.confusion_matrix(y_test, y_pred_ave)
print('target names: ' + str(digits.target_names))
print('confusion matrix:')
print(confmat)

#查看最佳超參數的內容:
#clf_bestparam.get_params()

Acuuracy:  0.9805555555555555
Misclassified sample: 7
target names: [0 1 2 3 4 5 6 7 8 9]
confusion matrix:
[[28  0  0  0  0  0  0  0  0  0]
 [ 0 40  0  0  0  0  0  0  0  0]
 [ 0  0 38  0  0  0  0  0  0  0]
 [ 0  0  0 36  0  0  0  0  0  0]
 [ 0  0  0  0 42  1  0  1  0  0]
 [ 0  0  0  0  0 41  0  0  0  0]
 [ 0  0  0  0  0  0 32  0  1  0]
 [ 0  0  0  0  1  0  0 33  0  0]
 [ 0  1  0  0  0  1  0  0 30  0]
 [ 0  0  0  0  0  0  0  1  0 33]]
