# Grid Search for XGBoost model

this notebook is used to find the best parameters for XGBoost model

In [2]:
import os

os.chdir("/root/workspace/CCP/")

FileNotFoundError: [WinError 3] 系统找不到指定的路径。: '/root/workspace/CCP/'

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    train_test_split,
    GridSearchCV,
)
from utils.metrics import calculate_metrics, get_ccp_scoring
from utils.datasets import load_and_split_data

# ml
from xgboost import XGBRegressor, XGBRFRegressor

In [None]:
# 设置显示中文字体

from pylab import mpl

mpl.rcParams["font.sans-serif"] = ["SimHei"]
# 设置正常显示符号
mpl.rcParams["axes.unicode_minus"] = False

## 数据

### 读取数据

In [3]:
data_path = "../../data/processed/rdc_data_cleaned.csv"
x_train, x_test, y_train, y_test = load_and_split_data(
    data_path, test_size=0.1, random_state=42
)

# 打印划分后的数据集大小
print("训练集大小:", len(x_train))
print("测试集大小:", len(x_test))

训练集大小: 350
测试集大小: 39


## 网格搜索

### 定义模型类型和参数网格

使用默认的参数网格

In [4]:
def frange(a, b, n, int_flag=False):
    """生成从a到b,等差数列,共n个数，可以选择是否近似到整数,小数限制小数点后3位"""
    if int_flag:
        return np.linspace(a, b, n, endpoint=True).round(0).astype(int)
    else:
        return np.linspace(a, b, n, endpoint=True).round(3)

frange(0.001, 10, 100)

array([1.000e-03, 1.020e-01, 2.030e-01, 3.040e-01, 4.050e-01, 5.060e-01,
       6.070e-01, 7.080e-01, 8.090e-01, 9.100e-01, 1.011e+00, 1.112e+00,
       1.213e+00, 1.314e+00, 1.415e+00, 1.516e+00, 1.617e+00, 1.718e+00,
       1.819e+00, 1.920e+00, 2.021e+00, 2.122e+00, 2.223e+00, 2.324e+00,
       2.425e+00, 2.526e+00, 2.627e+00, 2.728e+00, 2.829e+00, 2.930e+00,
       3.031e+00, 3.132e+00, 3.233e+00, 3.334e+00, 3.435e+00, 3.536e+00,
       3.637e+00, 3.738e+00, 3.839e+00, 3.940e+00, 4.041e+00, 4.142e+00,
       4.243e+00, 4.344e+00, 4.445e+00, 4.546e+00, 4.647e+00, 4.748e+00,
       4.849e+00, 4.950e+00, 5.051e+00, 5.152e+00, 5.253e+00, 5.354e+00,
       5.455e+00, 5.556e+00, 5.657e+00, 5.758e+00, 5.859e+00, 5.960e+00,
       6.061e+00, 6.162e+00, 6.263e+00, 6.364e+00, 6.465e+00, 6.566e+00,
       6.667e+00, 6.768e+00, 6.869e+00, 6.970e+00, 7.071e+00, 7.172e+00,
       7.273e+00, 7.374e+00, 7.475e+00, 7.576e+00, 7.677e+00, 7.778e+00,
       7.879e+00, 7.980e+00, 8.081e+00, 8.182e+00, 

In [5]:
# 创建模型和参数网格
_basic_model = XGBRegressor()
param_grid = {
    # ------------------------ Primary ----------------------- #
    # Step size shrinkage used in boosting (Typical range: 0.01-0.3)
    "learning_rate": [0.0001, 0.01, 0.05, 0.1, 0.3],
    # Number of boosting rounds (Typical range: 100-1000)
    "n_estimators": [100, 200, 500, 1000, 3000],
    # Maximum depth of each tree (Typical range: 3-10)
    "max_depth": [3, 6, 10],
    # Subsample ratio of the training instances (Typical range: 0.6-1.0)
    "subsample": [0.6, 0.8, 1.0],
    # Subsample ratio of columns when constructing each tree (Typical range: 0.6-1.0)
    "colsample_bytree": [0.6, 0.8, 1.0],
    # ----------------------- Secondary ---------------------- #
    # Minimum sum of instance weight needed in a child (Typical range: 1-10)
    "min_child_weight": [1, 5],
    # Minimum loss reduction required to make a further partition on a leaf node (Typical range: 0-0.5)
    "gamma": [0, 1.0],
    # L1 regularization term on weights (Typical range: 0-0.1)
    "reg_alpha": [0, 0.1],
    # L2 regularization term on weights (Typical range: 0-0.1)
    "reg_lambda": [0, 0.1],
}

### 交叉验证策略

In [6]:
_cv = KFold(n_splits=5, shuffle=True, random_state=42)

### 网格搜索

In [7]:
# 创建GridSearchCV对象并执行网格搜索
grid_search = GridSearchCV(
    _basic_model,
    param_grid,
    scoring=get_ccp_scoring(),
    refit="score",
    cv=_cv,
    verbose=1,
    n_jobs=-1,
)

In [8]:
# 拟合GridSearchCV对象
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 10800 candidates, totalling 54000 fits




### 结果

In [22]:
# 打印最佳参数组合和得分
print("Best Parameters: ", grid_search.best_params_)
print("Best Score: ", grid_search.best_score_)
grid_search.best_params_

Best Parameters:  {'colsample_bytree': 1.0, 'gamma': 1.0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 3000, 'reg_alpha': 0.1, 'reg_lambda': 0, 'subsample': 0.6}
Best Score:  -104.16285803623501


{'colsample_bytree': 1.0,
 'gamma': 1.0,
 'learning_rate': 0.01,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 3000,
 'reg_alpha': 0.1,
 'reg_lambda': 0,
 'subsample': 0.6}

In [14]:
grid_search.cv_results_

{'mean_fit_time': array([0.29983206, 0.60419745, 0.90982456, 1.23454957, 1.54573689,
        1.86714492, 2.20739698, 2.52287426, 2.85685349, 3.17332649,
        0.30480781, 0.63106608, 0.95712075, 1.28961101, 1.63254213,
        1.94636374, 2.2696538 , 2.60490055, 2.90748105, 3.22721157,
        0.31140795, 0.6338273 , 0.95741749, 1.27726212, 1.59444213,
        1.91346574, 2.2360352 , 2.56217761, 2.89889002, 3.19816308,
        0.31133509, 0.64097657, 0.97381692, 1.3032773 , 1.58995824,
        1.92496295, 2.24158654, 2.61477027, 2.92696986, 3.18238883,
        0.33980241, 0.65718632, 0.95115223, 1.26875877, 1.5918292 ,
        1.90763679, 2.22573318, 2.54776111, 2.883531  , 3.20695639,
        0.32618823, 0.62662978, 0.96419649, 1.26605024, 1.59004869,
        1.91005187, 2.2405684 , 2.53964472, 2.85221777, 3.17442451,
        0.3151825 , 0.62778158, 0.94899931, 1.27150984, 1.59421597,
        1.90804362, 2.23317933, 2.55836811, 2.85121703, 3.18031521,
        0.31452332, 0.6249948 ,

In [23]:
calculate_metrics(
    grid_search.best_estimator_.predict(x_test),
    y_test,
    print_metrics=True,
)

Metric		MAE		MAPE		RMSE
-------------------------------------------------------------------------------------
Airflow:		82.5650		7.88%		131.0022
Pres:		3.9791		118.85%		6.4017
------------------


{'MAE': {'Airflow': 82.56504835495579, 'Pres': 3.979109816214977},
 'MAPE': {'Airflow': 0.0788469482149775, 'Pres': 1.1884848854985501},
 'RMSE': {'Airflow': 131.0022233660143, 'Pres': 6.4017492311151205}}

## 调整参数网格（2023年10月6日14点42分）

In [26]:
param_grid = {
    # ------------------------ Primary ----------------------- #
    # Step size shrinkage used in boosting (Typical range: 0.01-0.3)
    "learning_rate": [0.0005, 0.01, 0.02],
    # Number of boosting rounds (Typical range: 100-1000)
    "n_estimators": [2000, 3000, 4000, 5000],
    # Maximum depth of each tree (Typical range: 3-10)
    "max_depth": [3, 4],
    # Subsample ratio of the training instances (Typical range: 0.6-1.0)
    "subsample": [0.5, 0.6, 0.7],
    # Subsample ratio of columns when constructing each tree (Typical range: 0.6-1.0)
    "colsample_bytree": [0.9, 1.0],
    # ----------------------- Secondary ---------------------- #
    # Minimum sum of instance weight needed in a child (Typical range: 1-10)
    "min_child_weight": [5, 8, 10],
    # Minimum loss reduction required to make a further partition on a leaf node (Typical range: 0-0.5)
    "gamma": [0.5, 1.0],
    # L1 regularization term on weights (Typical range: 0-0.1)
    "reg_alpha": [0.05, 0.1],
    # L2 regularization term on weights (Typical range: 0-0.1)
    "reg_lambda": [0, 0.05],
}
grid_search_1 = GridSearchCV(
    _basic_model,
    param_grid,
    scoring=ccp_scoring,
    refit="score",
    cv=_cv,
    verbose=1,
    n_jobs=-1,
)

In [27]:
# 拟合GridSearchCV对象
grid_search_1.fit(x_train, y_train)

Fitting 5 folds for each of 3456 candidates, totalling 17280 fits




In [29]:
# 打印最佳参数组合和得分
print("Best Parameters: ", grid_search_1.best_params_)
print("Best Score: ", grid_search_1.best_score_)
calculate_metrics(
    grid_search_1.best_estimator_.predict(x_test),
    y_test,
    print_metrics=True,
)
grid_search_1.best_params_

Best Parameters:  {'colsample_bytree': 1.0, 'gamma': 0.5, 'learning_rate': 0.02, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 4000, 'reg_alpha': 0.1, 'reg_lambda': 0.05, 'subsample': 0.5}
Best Score:  -100.26277258813198
Metric		MAE		MAPE		RMSE
-------------------------------------------------------------------------------------
Airflow:		80.8499		7.87%		129.7089
Pres:		3.8950		77.56%		6.0188
------------------


{'colsample_bytree': 1.0,
 'gamma': 0.5,
 'learning_rate': 0.02,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 4000,
 'reg_alpha': 0.1,
 'reg_lambda': 0.05,
 'subsample': 0.5}

## 调整参数网格（2023年10月6日16点01分）

In [30]:
param_grid = {
    # ------------------------ Primary ----------------------- #
    # Step size shrinkage used in boosting (Typical range: 0.01-0.3)
    "learning_rate": [0.02, 0.05, 0.1, 0.3],
    # Number of boosting rounds (Typical range: 100-1000)
    "n_estimators": [3500, 4000, 4500],
    # Maximum depth of each tree (Typical range: 3-10)
    "max_depth": [3],
    # Subsample ratio of the training instances (Typical range: 0.6-1.0)
    "subsample": [0.3, 0.5, 0.55],
    # Subsample ratio of columns when constructing each tree (Typical range: 0.6-1.0)
    "colsample_bytree": [1.0],
    # ----------------------- Secondary ---------------------- #
    # Minimum sum of instance weight needed in a child (Typical range: 1-10)
    "min_child_weight": [5],
    # Minimum loss reduction required to make a further partition on a leaf node (Typical range: 0-0.5)
    "gamma": [0.3, 0.1, 0.5],
    # L1 regularization term on weights (Typical range: 0-0.1)
    "reg_alpha": [0.1],
    # L2 regularization term on weights (Typical range: 0-0.1)
    "reg_lambda": [0.07, 0.1],
}
grid_search_2 = GridSearchCV(
    _basic_model,
    param_grid,
    scoring=ccp_scoring,
    refit="score",
    cv=_cv,
    verbose=1,
    n_jobs=-1,
).fit(x_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits


In [31]:
# 打印最佳参数组合和得分
print("Best Parameters: ", grid_search_2.best_params_)
print("Best Score: ", grid_search_2.best_score_)
calculate_metrics(
    grid_search_2.best_estimator_.predict(x_test),
    y_test,
    print_metrics=True,
)
grid_search_2.best_params_

Best Parameters:  {'colsample_bytree': 1.0, 'gamma': 0.3, 'learning_rate': 0.02, 'max_depth': 3, 'min_child_weight': 5, 'n_estimators': 4500, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'subsample': 0.3}
Best Score:  -99.5036189317554
Metric		MAE		MAPE		RMSE
-------------------------------------------------------------------------------------
Airflow:		79.4788		7.68%		131.7325
Pres:		3.7829		98.22%		5.7704
------------------


{'colsample_bytree': 1.0,
 'gamma': 0.3,
 'learning_rate': 0.02,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 4500,
 'reg_alpha': 0.1,
 'reg_lambda': 0.1,
 'subsample': 0.3}