In [7]:
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

np.random.seed(42)
X, y = torch.load('data/house_price_v2.pkl')
train_data, test_data = X.iloc[:len(y)], X.iloc[len(y):]

In [55]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_log_error, make_scorer, mean_squared_error

# baseline
scaler = StandardScaler()
svr = SVR()
pipeline = Pipeline([
    ('std', scaler),
    ('svr', svr)
])
cross_val_score(svr, train_data, y, scoring=make_scorer(mean_squared_error, greater_is_better=False), cv=10).mean()


-0.06562895798929026

In [56]:
# from ray.tune.sklearn import TuneSearchCV
from ray import tune
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from scipy.stats import loguniform

params = {
    'svr__C': loguniform(1e-3, 1e3),
    'svr__gamma': loguniform(1e-6, 10)
}
# search = TuneSearchCV(
#     pipeline,
#     params,
#     search_optimization="bayesian",
#     n_trial=-1,
#     max_iters=10,
#     early_stopping=True,
#     verbose=1,
#     scoring=make_scorer(mean_absolute_percentage_error, greater_is_better=False),
#     return_train_score=True,
#     mode='max',
#     time_budget_s=600,
#     error_score=np.nan,
#     use_gpu=True
# )
# search.fit(train_data, y)
# print(search.best_params_)
# print(search.best_score_)

In [57]:
# from ray.tune.sklearn import TuneSearchCV
from ray import tune
from sklearn.experimental import enable_halving_search_cv
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV, HalvingRandomSearchCV

grid_search = HalvingRandomSearchCV(pipeline, params, n_jobs=-1, cv=10, verbose=2,
                                    scoring=make_scorer(mean_squared_error, greater_is_better=False))
grid_search.fit(train_data, y)

n_iterations: 4
n_required_iterations: 4
n_possible_iterations: 4
min_resources_: 20
max_resources_: 1460
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 73
n_resources: 20
Fitting 10 folds for each of 73 candidates, totalling 730 fits
----------
iter: 1
n_candidates: 25
n_resources: 60
Fitting 10 folds for each of 25 candidates, totalling 250 fits
----------
iter: 2
n_candidates: 9
n_resources: 180
Fitting 10 folds for each of 9 candidates, totalling 90 fits
----------
iter: 3
n_candidates: 3
n_resources: 540
Fitting 10 folds for each of 3 candidates, totalling 30 fits


HalvingRandomSearchCV(cv=10,
                      estimator=Pipeline(steps=[('std', StandardScaler()),
                                                ('svr', SVR())]),
                      n_jobs=-1,
                      param_distributions={'svr__C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BC6864CBB0>,
                                           'svr__gamma': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BC68674F70>},
                      scoring=make_scorer(mean_squared_error, greater_is_better=False),
                      verbose=2)

In [60]:
grid_search.best_score_, grid_search.best_params_

(-0.022462666426156682,
 {'svr__C': 11.169480627782885, 'svr__gamma': 0.00013315071186570328})

In [61]:
import pandas as pd

# best_model = tune_search.best_estimator
best_model = grid_search.best_estimator_
print(mean_squared_error(best_model.predict(train_data), y))
y_test = best_model.predict(test_data.values)
result = pd.DataFrame(np.exp(y_test), index=test_data.index, columns=['SalePrice'], )
result.to_csv('SVR.csv')

0.009527187970577425


