#  Random Forest Regression

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_absolute_percentage_error as mape
from sklearn.metrics import mean_squared_error as msef
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_validate

from utils.metrics import calculate_metrics, get_ccp_scoring, print_results_table
from utils.datasets import load_and_split_data
from sklearn.preprocessing import StandardScaler
# ml
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [16]:
# 设置显示中文字体

from pylab import mpl

mpl.rcParams["font.sans-serif"] = ["SimHei"]
# 设置正常显示符号
mpl.rcParams["axes.unicode_minus"] = False

In [2]:
data_path = "../../data/processed/rdc_data_cleaned.csv"
X_raw, Y_raw = load_and_split_data(data_path, test_size=0.1, is_split=False)


scaler_x = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_x.fit_transform(X_raw)
y_scaled = scaler_y.fit_transform(Y_raw)

# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

# 打印划分后的数据集大小
print("训练集大小:", len(x_train))
print("测试集大小:", len(x_test))

训练集大小: 350
测试集大小: 39


## Single Training

In [4]:
x_train, x_test, y_train, y_test = load_and_split_data(data_path, test_size=0.1)

_model = RandomForestRegressor(criterion="absolute_error").fit(x_train, y_train)
calculate_metrics(_model.predict(x_test), y_test, print_metrics=True)

保存模型

In [7]:
# xgb.save_model("path")

## Grid Search

1. criterion

Gird

In [16]:
model_name = "RF"
x_train, x_test, y_train, y_test = load_and_split_data(data_path, test_size=0.1)
grid_search = GridSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=42),
    param_grid=[
        {
            "n_estimators": range(70, 90),
            "criterion": ["absolute_error"],
            "max_depth": range(12, 17),
            "min_samples_split": range(1, 5),
            "max_features": [1.0],
        },
    ],
    scoring=get_ccp_scoring(),
    refit="pres_rmse",
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    grid_search.best_estimator_.predict(x_test),
    y_test,
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


500 fits failed out of a total of 2000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
500 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterError(

Best Parameters for RF : {'criterion': 'absolute_error', 'max_depth': 13, 'max_features': 1.0, 'min_samples_split': 2, 'n_estimators': 86}
Best Score for RF : -12.01710970947072


## Grid Search (scaled)

In [4]:
model_name = "RF_scaled"
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)
grid_search = GridSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=42),
    param_grid=[
        {
            "n_estimators": range(70, 90),
            "criterion": ["absolute_error"],
            "max_depth": range(12, 17),
            "min_samples_split": range(1, 5),
            "min_samples_leaf": range(1, 5),
            "max_features": [1.0],
        },
    ],
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)

Fitting 10 folds for each of 1600 candidates, totalling 16000 fits


4000 fits failed out of a total of 16000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4000 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/ensemble/_forest.py", line 340, in fit
    self._validate_params()
  File "/usr/local/lib/python3.10/site-packages/sklearn/base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterErr

Best Parameters for RF_scaled : {'criterion': 'absolute_error', 'max_depth': 14, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 87}
Best Score for RF_scaled : -0.2505924990153884
