In [1]:
!pip install ray[tune] tune-sklearn


Collecting tune-sklearn
  Downloading tune_sklearn-0.4.3-py3-none-any.whl (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 KB[0m [31m428.6 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: tune-sklearn
Successfully installed tune-sklearn-0.4.3
[0m

In [4]:
import torch
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import StandardScaler

data, y = torch.load('data/house_price_v2.pkl')
train_data, test_data = data[:len(y)], data[len(y):]

np.random.seed(42)

In [5]:
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import make_scorer, mean_squared_log_error, mean_squared_error

std = StandardScaler()
model = DecisionTreeRegressor()
pipe = Pipeline(
    [('std', std),
     ('model', model)]
)

cross_val_score(pipe, train_data, y, cv=10, n_jobs=-1, verbose=1,
                scoring=make_scorer(mean_squared_error, greater_is_better=False)).mean()


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.6s finished


-0.03659129979773332

In [4]:
# 超参数搜索
from ray import tune
from ray.tune.sklearn import TuneSearchCV
from sklearn.metrics import mean_squared_log_error

param = {
    'model__max_depth': tune.randint(3, 500),
    'model__min_samples_split': tune.randint(2, len(train_data) + 1),
    'model__min_samples_leaf': tune.randint(1, len(train_data) + 1),
    'model__max_features': tune.randint(2, train_data.shape[1] + 1)}

tune_search = TuneSearchCV(
    pipe,
    param,
    search_optimization="bayesian",
    n_trials=-1,
    verbose=1,
    return_train_score=True,
    mode='max',
    time_budget_s=1800,
    error_score=np.nan,
    scoring=make_scorer(mean_squared_log_error, greater_is_better=False),
    use_gpu=False,
    cv=10
)
tune_search.fit(train_data, y)
print(tune_search.best_params_)

2022-05-18 04:20:26,036	INFO tune.py:702 -- Total run time: 1810.31 seconds (1808.93 seconds for the tuning loop).


{'model__max_depth': 377, 'model__min_samples_split': 2, 'model__min_samples_leaf': 19, 'model__max_features': 158}


In [7]:
pipe.set_params(**{'model__max_depth': 377, 'model__min_samples_split': 2, 'model__min_samples_leaf': 19,
                   'model__max_features': 158})
pipe.fit(train_data, y)
mean_squared_error(pipe.predict(train_data), y)


0.023134470588509683

In [5]:
best_model = tune_search.best_estimator
best_model.score(train_data, y)

0.8577228771048372

In [6]:
import pandas as pd

best_model = tune_search.best_estimator
y_test = best_model.predict(test_data.values)
result = pd.DataFrame(np.exp(y_test), index=test_data.index, columns=['SalePrice'], )
result.to_csv('tree.csv')
torch.save(tune_search.best_params_, 'tree.pkl')

  "X does not have valid feature names, but"
