# 载入套件

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

from typing import Dict
from tqdm.auto import tqdm

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.cluster import DBSCAN

import numpy as np
import random
import pandas as pd
import ydata_profiling

import scienceplots
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

import torch
import os

%matplotlib inline

# Register pandas' formatters and converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# Configure the seaborn theme in ONE call. `sns.set` is an alias of
# `sns.set_theme`, which resets every theme argument that is not passed
# explicitly to its default (style="darkgrid", palette="deep"). Separate
# `set_style` / `set_palette` calls issued before it are therefore silently
# overridden, so the style and palette are passed here instead.
sns.set(
    style="whitegrid",
    palette="RdBu",
    font="serif",
    font_scale=1.2,
    rc={'text.usetex': True},  # render all figure text through LaTeX
)


In [11]:
# Load the pickled feature table used by the rest of the notebook
# (presumably a DataFrame: later cells select columns from it by label).
# NOTE(review): unpickling can execute arbitrary code - only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../Datasets/features_q2.pkl')


# 回归问题

In [12]:
# Column labels are strings: '1' .. '10' are the model inputs, '11' is the
# regression target.
cols = list(map(str, range(1, 11)))
X, y = df[cols], df['11']


In [13]:
def NMSE(y_pred, y_true):
    """Return the square root of a normalised squared-error ratio.

    The numerator is the sum of squared differences between ``y_pred`` and
    ``y_true``. The denominator is the sum of squared deviations of
    ``y_pred`` from the mean of ``y_true``. Because of that denominator the
    function is NOT symmetric in its two arguments.

    NOTE(review): the conventional normalisation centres ``y_true`` on its
    own mean; here ``y_pred`` is centred on the mean of ``y_true`` instead.
    Confirm which definition is intended before comparing these values with
    externally reported ones.

    Args:
        y_pred: Array-like supporting element-wise arithmetic and ``.sum()``
            (e.g. a NumPy array or pandas Series).
        y_true: Array-like of the same length as ``y_pred``.

    Returns:
        The square root of the ratio described above.
    """
    squared_error = ((y_pred - y_true) ** 2).sum()
    centred_spread = ((y_pred - np.mean(y_true)) ** 2).sum()
    return (squared_error / centred_spread) ** (1 / 2)


In [14]:
import itertools

# Exhaustive grid search over three XGBoost hyper-parameters, scoring each
# combination by its mean NMSE across a 10-fold cross-validation.

# Hyper-parameter grid (second iteration of the search, finer granularity).
param_space = {
    'n_estimators': [10, 20, 30],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1.0, 2.0, 4.0],
}

# Every (n_estimators, max_depth, min_child_weight) combination.
# Materialised as a list because itertools.product returns a one-shot
# iterator that would be empty on a second pass.
param_combinations = list(itertools.product(param_space['n_estimators'],
                                            param_space['max_depth'],
                                            param_space['min_child_weight']))

# 10-fold splitter, built once outside the loop. With a fixed integer
# random_state each call to kf.split() reproduces the same shuffled folds,
# so every combination is scored on identical splits.
kf = KFold(n_splits=10, shuffle=True, random_state=654321)

# Parallel lists: params[i] is the combination whose mean CV score is scores[i].
params = []
scores = []

for n_estimators, max_depth, min_child_weight in param_combinations:

    score_folds = []
    for tr_idx, va_idx in kf.split(X):
        # Split into the training part and the held-out validation part.
        tr_x, va_x = X.iloc[tr_idx], X.iloc[va_idx]
        tr_y, va_y = y.iloc[tr_idx], y.iloc[va_idx]

        # Fit an XGBoost regressor on the training part of this fold.
        model = XGBRegressor(n_estimators=n_estimators, random_state=123456,
                             max_depth=max_depth, min_child_weight=min_child_weight)
        model.fit(tr_x, tr_y)

        # Score the held-out fold with NMSE (lower is better).
        va_pred = model.predict(va_x)
        # NOTE(review): NMSE is declared as NMSE(y_pred, y_true) but is called
        # here with (va_y, va_pred), and NMSE is not symmetric in its
        # arguments. The call is left unchanged so the scores stay comparable
        # with earlier runs - confirm the intended argument order.
        score_folds.append(NMSE(va_y, va_pred))

    # Average the per-fold scores for this combination.
    score_mean = np.mean(score_folds)

    # Record the combination together with its mean score.
    params.append({'n_estimators': n_estimators,
                  'max_depth': max_depth, 'min_child_weight': min_child_weight})
    scores.append(score_mean)

# The best combination is the one with the lowest mean score.
best_idx = int(np.argmin(scores))
best_param = params[best_idx]

print(best_param)


{'n_estimators': 10, 'max_depth': 3, 'min_child_weight': 1.0}


In [15]:
# Mean cross-validated score of every parameter combination, listed in the
# same order as the entries of `params`.
scores


[0.8262052543514997,
 0.8266147390379152,
 0.8270969706492399,
 0.8318504763852129,
 0.8329782408431614,
 0.8341339204701624,
 0.8414697779991567,
 0.8487704263220739,
 0.8432766097182821,
 0.82882919172858,
 0.8301264848450035,
 0.8309663848059545,
 0.8342117858004858,
 0.8381875473660184,
 0.8409516654765035,
 0.8497599851673752,
 0.8570656442839418,
 0.8534541531480535,
 0.832568940728315,
 0.8342777833079049,
 0.8349489578488708,
 0.8387136781865884,
 0.8422540452961046,
 0.84477601972568,
 0.8555486551077054,
 0.862146949316525,
 0.8579868870212246]

# 模型存储

In [16]:
# Refit on the full dataset with the combination selected by the grid search.
# `best_param` is unpacked directly instead of re-typing its values by hand,
# so this cell cannot drift out of sync when the grid or the data changes.
# Its keys (n_estimators, max_depth, min_child_weight) are XGBRegressor
# keyword arguments.
model = XGBRegressor(random_state=123456, **best_param)

model.fit(X, y)


In [17]:
import joblib

# Persist the fitted model to disk. joblib.dump returns the list of file
# paths it wrote, which the notebook displays as this cell's output.
joblib.dump(model, "../Datasets/best_model_xg.joblib")


['../Datasets/best_model_xg.joblib']