# XGBoost 

In [14]:
import os

# Switch to the project root so that the relative paths used later in this
# notebook (the local `utils` package, the `data/` directory) resolve.
# The location defaults to the original workspace path, but can be overridden
# with the CCP_ROOT environment variable when the repository lives elsewhere.
os.chdir(os.environ.get("CCP_ROOT", "/root/workspace/CCP/"))

In [15]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as mse
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score

from utils.metrics import calculate_metrics, get_ccp_scoring, print_results_table
from utils.datasets import load_and_split_data

# ml
from xgboost import XGBRegressor, XGBRFRegressor

In [16]:
# Configure matplotlib so that Chinese text is displayed in figures.

from pylab import mpl

mpl.rcParams.update(
    {
        # Font used for sans-serif text (chosen for displaying Chinese labels).
        "font.sans-serif": ["SimHei"],
        # Make signs (e.g. the minus sign on axes) display correctly.
        "axes.unicode_minus": False,
    }
)

## 数据

### 读取数据

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split


def load_and_split_data(
    data_file,
    test_size=0.1,
    random_state=42,
    decision_features=None,
    controlled_features=None,
):
    """
    Load a CSV file, select the input and target columns, and split the
    rows into shuffled training and testing sets.

    Args:
        data_file (str): The path to the CSV file containing the data.
        test_size (float, optional): The proportion of the dataset to
            include in the testing set. Defaults to 0.1.
        random_state (int, optional): Controls the shuffling applied
            to the data before splitting. Defaults to 42.
        decision_features (sequence of str, optional): Column names used
            as model inputs. Defaults to the built-in list of fan
            frequency and damper columns defined below.
        controlled_features (sequence of str, optional): Column names used
            as model targets. Defaults to the built-in list of air volume
            and room pressure columns defined below.

    Returns:
        x_train (ndarray): The feature values for the training set.
        x_test (ndarray): The feature values for the testing set.
        y_train (ndarray): The target values for the training set.
        y_test (ndarray): The target values for the testing set.

    Raises:
        KeyError: If any requested column is not present in the CSV file.

    """
    default_decision_features = [
        "MAU_FREQ",
        "AHU_FREQ",
        "EF_FREQ",
        "RM1_SUPP_DMPR_0",
        "RM2_SUPP_DMPR_0",
        "RM6_SUPP_DMPR_0",
        "RM6_SUPP_DMPR_1",
        "RM3_SUPP_DMPR_0",
        "RM4_SUPP_DMPR_0",
        "RM5_SUPP_DMPR_0",
        "RM2_RET_DMPR_0",
        "RM6_RET_DMPR_0",
        "RM3_RET_DMPR_0",
        "RM4_RET_DMPR_0",
        "RM5_EXH_DMPR_1",
        "RM3_EXH_DMPR_0",
        "RM4_EXH_DMPR_0",
        "RM5_EXH_DMPR_0",
    ]
    default_controlled_features = [
        "TOT_FRSH_VOL",
        "TOT_SUPP_VOL",
        "TOT_RET_VOL",
        "TOT_EXH_VOL",
        "RM1_PRES",
        "RM2_PRES",
        "RM3_PRES",
        "RM4_PRES",
        "RM5_PRES",
        "RM6_PRES",
    ]

    # Fall back to the built-in column lists when the caller does not
    # provide its own; copy into lists so any sequence type is accepted.
    input_columns = list(
        default_decision_features if decision_features is None else decision_features
    )
    target_columns = list(
        default_controlled_features
        if controlled_features is None
        else controlled_features
    )

    _df_raw = pd.read_csv(data_file)

    # Fail early with a message that names the file and every missing column.
    # KeyError is the same exception type pandas raises for unknown columns.
    missing_columns = [
        column
        for column in input_columns + target_columns
        if column not in _df_raw.columns
    ]
    if missing_columns:
        raise KeyError(f"Columns missing from {data_file!r}: {missing_columns}")

    _df_data = _df_raw[input_columns]
    _df_label = _df_raw[target_columns]

    # Split on plain ndarrays; shuffling is seeded by random_state so the
    # same split is reproduced on every run.
    x_train, x_test, y_train, y_test = train_test_split(
        _df_data.values,
        _df_label.values,
        test_size=test_size,
        shuffle=True,
        random_state=random_state,
    )

    return x_train, x_test, y_train, y_test


In [55]:
# Location of the cleaned dataset, relative to the working directory.
data_path = "data/rdc_data_cleaned.csv"
x_train, x_test, y_train, y_test = load_and_split_data(
    data_path,
    test_size=0.1,
    random_state=42,
)

# Report the number of samples in each split, then show the training inputs.
print("训练集大小:", len(x_train))
print("测试集大小:", len(x_test))
x_train

训练集大小: 350
测试集大小: 39


array([[27.39, 46.49, 22.36, ..., 90.  , 90.  , 15.  ],
       [27.36, 46.46, 22.36, ..., 90.  , 90.  , 15.  ],
       [27.41, 50.  , 22.38, ..., 90.  , 90.  , 15.  ],
       ...,
       [31.51, 46.45, 22.32, ..., 80.  , 85.  , 20.  ],
       [41.77, 20.89, 50.  , ..., 40.  , 25.  , 20.  ],
       [27.45, 41.34, 22.36, ..., 90.  , 90.  , 15.  ]])

## 单次训练

### XGBoost

随便使用一组参数进行训练

In [17]:
# Fit one gradient-boosted regressor with an arbitrary (untuned) set of
# hyper-parameters on the training split returned by load_and_split_data.
# The previous `once_*` variables are not defined anywhere in this notebook,
# so the cell failed with NameError on a fresh kernel.
once_xgbRegr = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    min_child_weight=5,
    max_depth=4,
).fit(x_train, y_train)

# Evaluate on the held-out test split; arguments are (predictions, labels).
calculate_metrics(once_xgbRegr.predict(x_test), y_test, print_metrics=True)


{'MAE': {'Airflow': 71.69197373756995, 'Pres': 3.9573271577493254},
 'MAPE': {'Airflow': 0.07034244855662888, 'Pres': 1.285285206848656},
 'RMSE': {'Airflow': 112.41522371459511, 'Pres': 6.190046628304861}}

保存模型

In [None]:
# once_xgbRegr.save_model("path")  # save_model is a method of the fitted model, not of the xgb module

### XGBoost RF

随便使用一组参数进行训练

In [18]:
# Fit one XGBoost random-forest regressor with an arbitrary (untuned) set of
# hyper-parameters on the training split returned by load_and_split_data.
# The previous `once_*` variables are not defined anywhere in this notebook,
# so the cell failed with NameError on a fresh kernel.
once_xgbRFRegr = XGBRFRegressor(
    n_estimators=500,
    learning_rate=0.995,
    min_child_weight=5,
    max_depth=4,
).fit(x_train, y_train)

# Evaluate on the held-out test split; arguments are (predictions, labels).
calculate_metrics(once_xgbRFRegr.predict(x_test), y_test, print_metrics=True)

{'MAE': {'Airflow': 119.15700913942777, 'Pres': 8.733706241219469},
 'MAPE': {'Airflow': 0.10092172034751415, 'Pres': 3.83867485324668},
 'RMSE': {'Airflow': 176.01226336190948, 'Pres': 13.019013071228498}}

保存模型

In [16]:
# once_xgbRFRegr.save_model("path")  # save_model is a method of the fitted model, not of the xgb module

## 交叉验证

In [26]:
# Cross-validate an XGBRegressor configured with the hyper-parameters below,
# scoring with calculate_metrics and k=5.
# NOTE(review): `cross_validation_kfold`, `df_data` and `df_label` are not
# defined or imported in any cell above this one — confirm they exist on a
# fresh kernel (Restart & Run All); otherwise this cell raises NameError.
# NOTE(review): presumably `df_data` / `df_label` are the full (unsplit)
# input and target DataFrames — verify against where they are created.
xgbRegr_kfold = XGBRegressor(
    n_estimators=3000,
    learning_rate=0.029,
    min_child_weight=1,
    max_depth=3,
    colsample_bynode=1,
    gamma=0,
    subsample=0.8,
)
fold_metrics = cross_validation_kfold(
    xgbRegr_kfold, df_data.values, df_label.values, eval_method=calculate_metrics, k=5
)

In [28]:
# Display the value returned by cross_validation_kfold for inspection.
fold_metrics

{0: {'MAE': {'Airflow': 67.1016258125631, 'Pres': 3.597080020049953},
  'MAPE': {'Airflow': 0.10808708777394264, 'Pres': 0.28284571582502743},
  'RMSE': {'Airflow': 105.64022295635996, 'Pres': 5.6154924518800176}},
 1: {'MAE': {'Airflow': 66.76243203423981, 'Pres': 3.5890163936867165},
  'MAPE': {'Airflow': 0.1288482058556604, 'Pres': 0.4379841559455478},
  'RMSE': {'Airflow': 90.01240765318626, 'Pres': 5.242327974893376}},
 2: {'MAE': {'Airflow': 65.87792605897305, 'Pres': 3.7482413500152583},
  'MAPE': {'Airflow': 0.08816133746983042, 'Pres': 0.5124017342855407},
  'RMSE': {'Airflow': 93.44458352982255, 'Pres': 5.594949105911531}},
 3: {'MAE': {'Airflow': 58.73592417659921, 'Pres': 3.5966929816206297},
  'MAPE': {'Airflow': 0.06908901801785684, 'Pres': 0.3355270837157467},
  'RMSE': {'Airflow': 81.6422487275536, 'Pres': 5.241525422415325}},
 4: {'MAE': {'Airflow': 71.99241014753072, 'Pres': 4.539200940539826},
  'MAPE': {'Airflow': 0.0711788539936246, 'Pres': 0.41558167244261107},
  