# Day_047

### 練習時間
請使用不同的資料集，並使用 hyper-parameter search 的方式，看能不能找出最佳的超參數組合

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
# Load the Boston housing dataset (not "digits"): features live in
# boston.data and the regression target in boston.target.
# NOTE(review): load_boston was removed in scikit-learn 1.2, so this cell
# needs an older release -- confirm the pinned version.
boston = datasets.load_boston()

# Wrap the raw feature matrix in a DataFrame purely for inspection;
# no column names are supplied, so columns are labelled 0..12.
df_boston = pd.DataFrame(data=boston.data)
print(df_boston.shape)
df_boston.head()

(506, 13)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [3]:
# How often each distinct target value occurs (most frequent first).
target_counts = pd.Series(boston.target).value_counts()
print(target_counts)
print("")

# Sorted array of the distinct target values.
unique_targets = np.unique(boston.target)
print(f"Unique values in boston.target = {unique_targets}")

50.0    16
25.0     8
23.1     7
21.7     7
22.0     7
20.6     6
19.4     6
20.1     5
19.6     5
19.3     5
15.6     5
21.4     5
13.8     5
21.2     5
20.0     5
17.8     5
23.9     5
22.6     5
22.2     5
24.4     4
23.2     4
23.8     4
19.1     4
20.3     4
13.4     4
19.9     4
27.5     4
22.8     4
22.9     4
18.9     4
        ..
12.3     1
37.6     1
29.4     1
12.6     1
22.1     1
31.1     1
48.3     1
26.2     1
11.0     1
9.5      1
12.0     1
17.7     1
44.8     1
35.1     1
32.4     1
42.3     1
28.2     1
31.2     1
33.3     1
37.2     1
28.6     1
15.1     1
17.3     1
6.3      1
39.8     1
12.8     1
29.9     1
9.6      1
36.1     1
13.0     1
Length: 229, dtype: int64

Unique values in boston.target = [  5.    5.6   6.3   7.    7.2   7.4   7.5   8.1   8.3   8.4   8.5   8.7
   8.8   9.5   9.6   9.7  10.2  10.4  10.5  10.8  10.9  11.   11.3  11.5
  11.7  11.8  11.9  12.   12.1  12.3  12.5  12.6  12.7  12.8  13.   13.1
  13.2  13.3  13.4  13.5  13.6  13.8  13.9  14.   

In [4]:
# Split into training and test sets: 25% of the samples are held out,
# with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    boston.data,
    boston.target,
    test_size=0.25,
    random_state=42,
)

# Build and train the baseline model with default hyper-parameters.
# fit() returns the estimator itself, so GBR is the fitted regressor.
GBR = GradientBoostingRegressor(random_state=0).fit(x_train, y_train)

# Predict on the held-out test set.
y_pred = GBR.predict(x_test)

In [5]:
# Gap between predictions and ground truth on the test set, measured as MSE.
print(f"Mean squared error: {mean_squared_error(y_test, y_pred):.2f}")

# r2_score is the coefficient of determination (R^2); 1 is a perfect
# prediction. The printed label calls it "Variance score".
print(f"Variance score: {r2_score(y_test, y_pred):.2f}")

Mean squared error: 8.33
Variance score: 0.88


In [6]:
# Hyper-parameter values to search over.
# The grid has 3 * 6 * 3 = 54 candidate combinations.
n_estimators = [50, 100, 150]
max_depth = [1, 3, 5, 10, 15, 20]
min_samples_split = [2, 3, 4]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, min_samples_split = min_samples_split)

# Build the search object from the model and the parameter grid.
#   scoring: negated MSE, so that "higher is better" holds for the search.
#   cv = 3:  pinned explicitly. The implicit default was 3 folds when this
#            notebook was run, but became 5 in scikit-learn 0.22; pinning it
#            keeps the number of fits and the scores comparable across versions.
#   n_jobs = -1 uses every available CPU core in parallel.
grid_search = GridSearchCV(GBR, param_grid, scoring = "neg_mean_squared_error", cv = 3, n_jobs = -1, verbose = 1)

# Run the search for the best parameter combination.
grid_result = grid_search.fit(x_train, y_train)

# 3-fold cross-validation over 54 candidates trains 54 * 3 = 162 models.

Fitting 3 folds for each of 54 candidates, totalling 162 fits


[Parallel(n_jobs=-1)]: Done  85 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:    7.6s finished


In [7]:
# Print the best cross-validated score and the parameters that achieved it.
# The search was scored with "neg_mean_squared_error", so the value printed
# after the "Best Accuracy" label is a negated MSE (closer to 0 is better),
# not a classification accuracy.
best_score = grid_result.best_score_
best_params = grid_result.best_params_
print(f"Best Accuracy: {best_score:f} using {best_params}")

Best Accuracy: -12.733388 using {'max_depth': 3, 'min_samples_split': 3, 'n_estimators': 150}


In [8]:
# Rebuild the model with the best parameters found by the grid search.
# best_params_ holds exactly the searched keys (max_depth, n_estimators,
# min_samples_split), so it can be unpacked straight into the constructor.
GBR_bestparam = GradientBoostingRegressor(random_state=0, **grid_result.best_params_)

# Train the tuned model on the same training split.
GBR_bestparam.fit(x_train, y_train)

# Predict on the test set (this overwrites the baseline y_pred).
y_pred = GBR_bestparam.predict(x_test)

In [9]:
# Gap between the tuned model's predictions and ground truth, measured as MSE.
tuned_mse = mean_squared_error(y_test, y_pred)
print(f"After adjusting Parameter Mean squared error: {tuned_mse:.2f}")

# r2_score is the coefficient of determination (R^2); 1 is a perfect
# prediction. The printed label calls it "Variance score".
tuned_r2 = r2_score(y_test, y_pred)
print(f"Variance score: {tuned_r2:.2f}")

After adjusting Parameter Mean squared error: 8.16
Variance score: 0.88
