# Grid Search for XGBoost model

this notebook is used to find the best parameters for XGBoost model

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import (
    KFold,
    cross_val_score,
    train_test_split,
    GridSearchCV,
)
from utils.metrics import calculate_metrics, get_ccp_scoring
from utils.datasets import load_and_split_data

# ml
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor, XGBRFRegressor

In [None]:
# 设置显示中文字体

from pylab import mpl

mpl.rcParams["font.sans-serif"] = ["SimHei"]
# 设置正常显示符号
mpl.rcParams["axes.unicode_minus"] = False

In [3]:
data_path = "../../data/processed/rdc_data_cleaned.csv"
X_raw, Y_raw = load_and_split_data(data_path, test_size=0.1, is_split=False)


scaler_x = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_x.fit_transform(X_raw)
y_scaled = scaler_y.fit_transform(Y_raw)

# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

# 打印划分后的数据集大小
print("训练集大小:", len(x_train))
print("测试集大小:", len(x_test))

训练集大小: 350
测试集大小: 39


In [9]:
model_name = "XGBoost_scaled"

x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

grid_search = GridSearchCV(
    XGBRegressor(n_jobs=-1),
    param_grid={
        # ------------------------ Primary ----------------------- #
        # Step size shrinkage used in boosting (Typical range: 0.01-0.3)
        "learning_rate": [0.1],
        # Maximum depth of each tree (Typical range: 3-10)
        "max_depth": range(3, 10, 2),
        # Number of boosting rounds (Typical range: 100-1000)
        "n_estimators": range(100, 1000, 100),
        # Minimum sum of instance weight needed in a child (Typical range: 1-10)
        "min_child_weight": range(1, 10, 2),
        # -------------------------------------------------------- #
        # # Subsample ratio of the training instances (Typical range: 0.6-1.0)
        # "subsample": [0.3, 0.5, 0.55],
        # # Subsample ratio of columns when constructing each tree (Typical range: 0.6-1.0)
        # "colsample_bytree": [1.0],
        # # ----------------------- Secondary ---------------------- #
        # # Minimum loss reduction required to make a further partition on a leaf node (Typical range: 0-0.5)
        # "gamma": [0.3, 0.1, 0.5],
        # # L1 regularization term on weights (Typical range: 0-0.1)
        # "reg_alpha": [0.1],
        # # L2 regularization term on weights (Typical range: 0-0.1)
        # "reg_lambda": [0.07, 0.1],
    },
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)
print("=" * 47 + "\n" * 2)

Fitting 5 folds for each of 180 candidates, totalling 900 fits


Best Parameters for XGBoost_scaled : {'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 9, 'n_estimators': 900}
Best Score for XGBoost_scaled : -0.16530791756018723




In [10]:
model_name = "XGBoostRF_scaled"

x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

grid_search = GridSearchCV(
    XGBRFRegressor(n_jobs=-1),
    param_grid={
         # ------------------------ Primary ----------------------- #
        # Step size shrinkage used in boosting (Typical range: 0.01-0.3)
        "learning_rate": [0.1],
        # Maximum depth of each tree (Typical range: 3-10)
        "max_depth": range(3, 10, 2),
        # Number of boosting rounds (Typical range: 100-1000)
        "n_estimators": range(100, 1000, 100),
        # Minimum sum of instance weight needed in a child (Typical range: 1-10)
        "min_child_weight": range(1, 10, 2),
        # -------------------------------------------------------- #
        # # Subsample ratio of the training instances (Typical range: 0.6-1.0)
        # "subsample": [0.3, 0.5, 0.55],
        # # Subsample ratio of columns when constructing each tree (Typical range: 0.6-1.0)
        # "colsample_bytree": [1.0],
        # # ----------------------- Secondary ---------------------- #
        # # Minimum loss reduction required to make a further partition on a leaf node (Typical range: 0-0.5)
        # "gamma": [0.3, 0.1, 0.5],
        # # L1 regularization term on weights (Typical range: 0-0.1)
        # "reg_alpha": [0.1],
        # # L2 regularization term on weights (Typical range: 0-0.1)
        # "reg_lambda": [0.07, 0.1],
    },
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)
print("=" * 47 + "\n" * 2)

Fitting 5 folds for each of 180 candidates, totalling 900 fits




Best Parameters for XGBoostRF_scaled : {'learning_rate': 0.1, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 100}
Best Score for XGBoostRF_scaled : -0.8431200772536507


