In [1]:
!pip install ray[tune] tune-sklearn

In [1]:
import torch
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingRegressor

data, y = torch.load('data/house_price_v2.pkl')
train_data, test_data = data[:len(y)], data[len(y):]

np.random.seed(42)

std = StandardScaler()
model = BaggingRegressor(base_estimator=SVR())
pipe = Pipeline(
    [('std', std),
     ('model', model)]
)

cross_val_score(pipe, train_data, y, cv=10, scoring=make_scorer(mean_squared_error, greater_is_better=False), n_jobs=-1,
                verbose=1).mean()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    5.2s finished


-0.03807787487348134

In [2]:
# 超参数搜索
from ray import tune
# from ray.tune.sklearn import TuneSearchCV
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingRandomSearchCV
from scipy.stats import randint, loguniform

param = {
    'model__n_estimators': randint(1, 500),
    # 集成器参数，越多模型越复杂
    'model__base_estimator__C': loguniform(1e-2, 100),
    'model__base_estimator__gamma': loguniform(1e-6, 1),
}

optim = HalvingRandomSearchCV(pipe, param, cv=10, n_jobs=-1, verbose=1,
                              scoring=make_scorer(mean_squared_error, greater_is_better=False))
optim.fit(train_data, y)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 20
max_resources_: 1460
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 73
n_resources: 20
Fitting 10 folds for each of 73 candidates, totalling 730 fits
----------
iter: 1
n_candidates: 25
n_resources: 60
Fitting 10 folds for each of 25 candidates, totalling 250 fits
----------
iter: 2
n_candidates: 9
n_resources: 180
Fitting 10 folds for each of 9 candidates, totalling 90 fits
----------
iter: 3
n_candidates: 3
n_resources: 540
Fitting 10 folds for each of 3 candidates, totalling 30 fits


HalvingRandomSearchCV(cv=10,
                      estimator=Pipeline(steps=[('std', StandardScaler()),
                                                ('model',
                                                 BaggingRegressor(base_estimator=SVR()))]),
                      n_jobs=-1,
                      param_distributions={'model__base_estimator__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002335AA53790>,
                                           'model__base_estimator__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002335AA53F40>,
                                           'model__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000023334C5AD60>},
                      scoring=make_scorer(mean_squared_error, greater_is_better=False),
                      verbose=1)

In [7]:
print(optim.best_params_)

{'model__base_estimator__C': 59.1069861908854, 'model__base_estimator__gamma': 8.693959087324277e-05, 'model__n_estimators': 96}


In [6]:
import pandas as pd

best_model = optim.best_estimator_
y_test = best_model.predict(test_data.values)
print(mean_absolute_error(best_model.predict(train_data), y))
result = pd.DataFrame(np.exp(y_test), index=test_data.index, columns=['SalePrice'], )
result.to_csv('Bagging_SVR.csv')



0.06833498654637198
