# SVR

support vector regression

In [14]:
import numpy as np
import random

# ml
from sklearn.model_selection import (
    KFold,
    cross_validate,
    GridSearchCV,
    RepeatedKFold,
    train_test_split,
)
from sklearn.svm import LinearSVR, SVR
from sklearn.multioutput import MultiOutputRegressor, RegressorChain
from sklearn.preprocessing import StandardScaler
from utils.metrics import calculate_metrics, get_ccp_scoring, print_results_table
from utils.datasets import load_and_split_data

In [2]:
data_path = "../../data/processed/rdc_data_cleaned.csv"
X_raw, Y_raw = load_and_split_data(data_path, test_size=0.1, is_split=False)


scaler_x = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_x.fit_transform(X_raw)
y_scaled = scaler_y.fit_transform(Y_raw)

# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

# 打印划分后的数据集大小
print("训练集大小:", len(x_train))
print("测试集大小:", len(x_test))

训练集大小: 350
测试集大小: 39


## LinerSVR

### MultiOutputRegressor wrapped

In [6]:
model_name = "MR_LinerSVR"

x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

grid_search = GridSearchCV(
    MultiOutputRegressor(LinearSVR(), n_jobs=-1),
    param_grid=[
        {
            "estimator__C": [0.001, 0.1, 1, 10, 100, 1000],
            "estimator__loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
            "estimator__dual": [True, False],
        },
    ],
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)
print("=" * 47 + "\n" * 2)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/joblib/_parallel_backends.py", line 273, in _wrap_func_call
    return func()
  File "/usr/local/lib/python3.10/site-packages/joblib/parallel.py", line 589, in __call__
    return [func(*args, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/joblib/parallel.py", line 589, in <listcomp>
    return [func(*args, **kwargs)
  File "/usr/local/lib/python3.10/site-packages/sklearn/utils/parallel.py", line 123, in __call__
    return self.function(*args, **kwargs)
  Fil

Best Parameters for MR_LinerSVR : {'estimator__C': 0.1, 'estimator__dual': True, 'estimator__loss': 'epsilon_insensitive'}
Best Score for MR_LinerSVR : -0.2156492085279084




### RegressorChain wrapped

In [13]:
model_name = "RC_LinerSVR"

x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

grid_search = GridSearchCV(
    RegressorChain(LinearSVR(), order="random"),
    param_grid=[
        {
            "base_estimator__C": [0.001, 0.1, 1, 10, 100, 1000],
            "base_estimator__loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
            "base_estimator__dual": [True, False],
        },
    ],
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)
print("=" * 47 + "\n" * 2)

Fitting 10 folds for each of 24 candidates, totalling 240 fits


60 fits failed out of a total of 240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/multioutput.py", line 1007, in fit
    super().fit(X, Y, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/multioutput.py", line 632, in fit
    estimator.fit(X_aug[:, : (X.shape[1] + chain_idx)], y, **fit_params)
  File "/usr/local/lib/python3.10/site-packages/sklearn/svm/_classes.py", line 518, in fit
    self.coef_, self.intercept_, n_it

Best Parameters for RC_LinerSVR : {'base_estimator__C': 0.1, 'base_estimator__dual': True, 'base_estimator__loss': 'epsilon_insensitive'}
Best Score for RC_LinerSVR : -0.21604322438853493




## SVR

### MultiOutputRegressor wrapped

In [16]:
model_name = "MR_SVR"

x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

grid_search = GridSearchCV(
    MultiOutputRegressor(SVR(), n_jobs=-1),
    param_grid=[
        {
            "estimator__kernel": ["poly", "rbf", "sigmoid"],
            "estimator__degree": range(1, 5),
            "estimator__C": [0.001, 0.1, 1, 10, 100, 1000],
            "estimator__epsilon": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        },
    ],
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)
print("=" * 47 + "\n" * 2)

Fitting 10 folds for each of 504 candidates, totalling 5040 fits


Best Parameters for MR_SVR : {'estimator__C': 100, 'estimator__degree': 2, 'estimator__epsilon': 0.01, 'estimator__kernel': 'poly'}
Best Score for MR_SVR : -0.13197371766492028




### RegressorChain wrapped

In [17]:
model_name = "RC_SVR"

x_train, x_test, y_train, y_test = train_test_split(
    X_scaled, y_scaled, test_size=0.1, random_state=42
)

grid_search = GridSearchCV(
    RegressorChain(SVR(), order="random"),
    param_grid=[
        {
            "base_estimator__kernel": ["poly", "rbf", "sigmoid"],
            "base_estimator__degree": range(1, 5),
            "base_estimator__C": [0.001, 0.1, 1, 10, 100, 1000],
            "base_estimator__epsilon": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        },
    ],
    scoring="neg_mean_absolute_error",
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(x_train, y_train)
calculate_metrics(
    scaler_y.inverse_transform(grid_search.best_estimator_.predict(x_test)),
    scaler_y.inverse_transform(y_test),
    print_metrics=True,
    title=model_name,
)
print("Best Parameters for", model_name, ":", grid_search.best_params_)
print("Best Score for", model_name, ":", grid_search.best_score_)
print("=" * 47 + "\n" * 2)

Fitting 10 folds for each of 504 candidates, totalling 5040 fits


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/usr/local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 234, in __call__
    return self._score(
  File "/usr/local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 282, in _score
    return self._sign * self._score_func(y_true, y_pred, **self._kwargs)
  File "/usr/local/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 196, in mean_absolute_error
    y_type, y_true, y_pred, multioutput = _check_reg_targets(
  File "/usr/local/lib/python3.10/site-packages/sklearn/metrics/_regression.py", line 102, in _check_reg_targets
    y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
  File "/usr/local/lib/python3.10/site-packages/sklearn/utils/validation.py", line 921, in check_array
    _assert_all_finite(
  File "/usr/local/lib/python3.10/sit

Best Parameters for RC_SVR : {'base_estimator__C': 10, 'base_estimator__degree': 2, 'base_estimator__epsilon': 0.01, 'base_estimator__kernel': 'poly'}
Best Score for RC_SVR : -0.14155108020622803


