In [100]:
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [101]:
# Import 需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [102]:
# 讀取資料集

dir_data = './Day_048/data-science-london-scikit-learn'

app_train = pd.read_csv(os.path.join(dir_data, 'train.csv'), header=None)
app_train.head()

app_train_labels = pd.read_csv(os.path.join(dir_data, 'trainLabels.csv'), header=None)
app_train_labels.head()

app_test = pd.read_csv(os.path.join(dir_data, 'test.csv'), header=None)
app_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,2.808909,-0.242894,-0.546421,0.255162,1.749736,-0.030458,-1.322071,3.578071,-0.667578,-0.884257,...,-0.261688,-0.224375,-1.675606,-0.479584,-0.244388,-0.672355,0.51786,0.010665,-0.419214,2.818387
1,-0.374101,0.537669,0.081063,0.756773,0.915231,2.557282,3.703187,1.673835,-0.764122,-1.22804,...,-0.969463,0.574154,-2.200519,-1.61224,0.179031,-2.924596,0.64361,-1.470939,-0.067408,-0.976265
2,-0.08837,0.154743,0.380716,-1.176126,1.699867,-0.258627,-1.384999,1.093584,1.596633,0.230631,...,-0.769885,-0.005143,1.46749,0.483803,-3.542981,0.814561,-1.652948,1.265866,-1.749248,1.773784
3,-0.685635,0.501283,1.873375,0.215224,-3.983468,-0.103637,4.136113,-0.225431,-1.515015,-1.071763,...,0.968609,2.386412,-0.131219,0.285646,2.302069,1.255588,-1.56309,-0.125258,-1.030761,-2.945329
4,0.350867,0.721897,-0.477104,-1.748776,-2.627405,1.075433,4.954253,-3.293501,-0.760369,0.20436,...,0.260553,-2.04565,-2.173227,0.372992,0.4507,-0.211657,1.301359,-0.522164,2.484883,0.039213


In [103]:
# 切分訓練集/測試集
x_train, x_test, y_train, y_test = train_test_split(app_train, app_train_labels, test_size=0.25, random_state=42)


# 建立模型
clf = GradientBoostingClassifier(random_state=7)

# 先看看使用預設參數得到的結果，約為 1.41 的 MSE
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(metrics.mean_squared_error(y_test, y_pred))

  y = column_or_1d(y, warn=True)


0.12


In [104]:
# 設定要訓練的超參數組合
n_estimators = [10,50, 100, 200]
max_depth = [1, 3, 5, 7, 10]

# 建立 dictionary
param_grid = dict(n_estimators=n_estimators, max_depth=max_depth)

## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
grid_search = GridSearchCV(clf, param_grid, scoring="f1", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
grid_result = grid_search.fit(x_train, y_train)

# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# 使用最佳參數重新建立模型
clf_bestparam = GradientBoostingClassifier(max_depth=grid_result.best_params_['max_depth'],
                                           n_estimators=grid_result.best_params_['n_estimators'])

# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = clf_bestparam.predict(x_test)

# 調整參數後約可降至 0.96 的 MSE
print(metrics.mean_squared_error(y_test, y_pred))

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    4.3s finished
  y = column_or_1d(y, warn=True)


Best Accuracy: 0.872125 using {'max_depth': 5, 'n_estimators': 200}


  y = column_or_1d(y, warn=True)


0.112


In [105]:
# 預測測試集
# app_test_pred = clf_bestparam.predict(app_test)

estimator_pred = clf_bestparam.predict(app_test)

estimator_pred.shape

(9000,)

In [106]:
import pandas as pd

df = pd.DataFrame(estimator_pred)

# The Kaggle needs to find Id=9000, so we have to start with 1
df.index += 1

# First column name of CSV should be 'Id', while the 2nd should be 'Solution'
df.columns = ['Solution']
df.to_csv('./Day_048_GradientBoostClassfier_with_Grid.csv', sep=',', encoding='utf-8', index_label='Id', index=True)

### Use RandomizedSearchCV

In [107]:

from sklearn.model_selection import RandomizedSearchCV


param_dict = {
        'n_estimators':range(10,500,4),
        'max_depth':range(2,15,1),
        'learning_rate':np.linspace(0.001, 0.01, 2, 20),
        'subsample':np.linspace(0.5, 0.95, 20)
        }



## 建立搜尋物件，放入模型及參數組合字典 (n_jobs=-1 會使用全部 cpu 平行運算)
random_search = RandomizedSearchCV(clf, param_dict, scoring="f1", n_jobs=-1, verbose=1)

# 開始搜尋最佳參數
random_result = random_search.fit(x_train, y_train)

# 印出最佳結果與最佳參數
print("Best Accuracy: %f using %s" % (random_result.best_score_, random_result.best_params_))

# 使用最佳參數重新建立模型
clf_bestparam = GradientBoostingClassifier(max_depth=random_result.best_params_['max_depth'],
                                           n_estimators=random_result.best_params_['n_estimators'],
                                           learning_rate=random_result.best_params_['learning_rate'],
                                           subsample=random_result.best_params_['subsample']
                                          )

# 訓練模型
clf_bestparam.fit(x_train, y_train)

# 預測測試集
y_pred = clf_bestparam.predict(x_test)

# 調整參數後約可降至 0.96 的 MSE
print(metrics.mean_squared_error(y_test, y_pred))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  30 | elapsed:    4.8s remaining:    4.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    7.5s finished
  y = column_or_1d(y, warn=True)


Best Accuracy: 0.877255 using {'subsample': 0.8078947368421052, 'n_estimators': 442, 'max_depth': 5, 'learning_rate': 0.01}


  y = column_or_1d(y, warn=True)


0.112


In [108]:
# 預測測試集
# app_test_pred = clf_bestparam.predict(app_test)

estimator_pred = clf_bestparam.predict(app_test)

estimator_pred.shape

(9000,)

In [109]:
import pandas as pd

df = pd.DataFrame(estimator_pred)

# The Kaggle needs to find Id=9000, so we have to start with 1
df.index += 1

# First column name of CSV should be 'Id', while the 2nd should be 'Solution'
df.columns = ['Solution']
df.to_csv('./Day_048_GradientBoostClassfier_with_Random.csv', sep=',', encoding='utf-8', index_label='Id', index=True)