# Домашняя работа
Взять датасет Boston house-prices (`sklearn.datasets.load_boston`) и сделать то же самое для задачи регрессии: попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество.



In [1]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor, cv, Pool
import optuna
import gc
import numpy as np

# Random seed shared by the notebook; it is passed as random_state to
# train_test_split so the hold-out split is the same on every run.
SEED = 42

In [2]:
# Load the Boston house-prices data and split it into the feature matrix X
# and the regression target y.
dataset = load_boston()
X, y = dataset.data, dataset.target

In [3]:
# Sanity-check the dimensions of the feature matrix and the target vector.
X.shape, y.shape

((506, 13), (506,))

In [4]:
# Hold out a test set; random_state=SEED keeps the split identical across runs.
# NOTE(review): an integer test_size is an absolute number of samples, so only
# 10 of the 506 rows are held out here and the final test R^2 is very noisy.
# If a 10% split was intended, this should be test_size=0.1 — confirm before
# changing, since it alters every score reported below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=10, random_state=SEED)

In [5]:
# Number of folds passed as cv= to every cross_val_score call in this notebook.
k_fold = 10

### Считаем базовый скор

In [6]:
# Baseline 1: random forest with default hyperparameters, scored as the mean
# R^2 over k_fold cross-validation folds of the training data.
# random_state=SEED fixes the forest's internal randomness so that the
# baseline score can be reproduced on a re-run.
model = RandomForestRegressor(random_state=SEED)
cross_val_score(model, X_train, y_train, cv=k_fold, scoring='r2').mean()

0.8765021335752474

In [7]:
# Baseline 2: ordinary least-squares regression, scored as the mean R^2 over
# k_fold cross-validation folds of the training data.
model = LinearRegression()
np.mean(cross_val_score(model, X_train, y_train, scoring='r2', cv=k_fold))

0.7121876358951333

### Optuna

In [8]:
def objective(trial):
    """Optuna objective: mean cross-validated R^2 of a sampled regressor.

    Each trial first picks a model family ('RF', 'GBR' or 'cat') and then
    samples that family's hyperparameters from ``trial``. The model is scored
    on the module-level ``X_train`` / ``y_train`` with ``k_fold`` folds.

    Args:
        trial: Optuna trial object used to sample the hyperparameters.

    Returns:
        For 'RF' and 'GBR', the mean R^2 returned by ``cross_val_score``.
        For 'cat', the last value of the ``'test-R2-mean'`` column produced
        by CatBoost's ``cv``.

    Raises:
        ValueError: If the sampled model name is not one of the handled
            families (guards against extending the choice list without
            adding a matching branch).
    """
    regressor = trial.suggest_categorical('model', ['RF', 'GBR', 'cat'])

    if regressor == "RF":
        n_estimators = 200
        max_depth = trial.suggest_int('max_depth', 8, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        # random_state=SEED: without a fixed seed the same hyperparameters
        # give a different score on every evaluation, which adds noise to
        # the search and makes the study impossible to reproduce.
        regressor_obj = RandomForestRegressor(n_estimators=n_estimators,
                                              max_depth=max_depth,
                                              min_samples_split=min_samples_split,
                                              min_samples_leaf=min_samples_leaf,
                                              random_state=SEED,
                                              )

    elif regressor == "GBR":
        loss = trial.suggest_categorical('loss', ['ls', 'lad', 'huber', 'quantile'])
        learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 1)
        n_estimators = 200
        criterion = trial.suggest_categorical('criterion', ['friedman_mse', 'mse', 'mae'])
        regressor_obj = GradientBoostingRegressor(loss=loss,
                                                  learning_rate=learning_rate,
                                                  n_estimators=n_estimators,
                                                  criterion=criterion,
                                                  random_state=SEED,
                                                  )

    elif regressor == "cat":
        params = {"iterations": 50,
                  "depth": trial.suggest_int('depth', 4, 12),
                  "loss_function": trial.suggest_categorical("loss_function", ["RMSE", "MAE", "MAPE"]),
                  # Registered under its own name: the GBR branch samples
                  # 'learning_rate' from a different range (0.01-1), and
                  # sharing one parameter name would make the sampler mix
                  # observations drawn from two different search spaces.
                  "learning_rate": trial.suggest_loguniform('cat_learning_rate', 0.1, 0.5),
                  "verbose": False,
                  'eval_metric': 'R2'}

        # Use the same fold count as the scikit-learn branches so that the
        # scores of all three model families are comparable.
        score = cv(Pool(X_train, y_train),
                   params,
                   fold_count=k_fold,
                   early_stopping_rounds=2)['test-R2-mean'].values[-1]
        gc.collect()
        return score

    else:
        raise ValueError(f"Unknown model family: {regressor!r}")

    gc.collect()
    score = cross_val_score(regressor_obj, X_train, y_train, cv=k_fold, scoring='r2').mean()
    return score


In [None]:
%%time
# Search for the configuration that maximizes the objective's mean R^2,
# running 200 trials in total.
# NOTE(review): n_jobs=-1 lets Optuna run trials concurrently, so the order in
# which results reach the sampler can vary — presumably the search is not
# reproducible from run to run; confirm if exact repeatability matters.
study = optuna.create_study(direction="maximize") 
study.optimize(objective, n_trials=200, n_jobs=-1, show_progress_bar=True);

In [12]:
# Plot the objective value of every trial to see how the search progressed.
optuna.visualization.plot_optimization_history(study)

In [13]:
# Hyperparameters of the best-scoring trial of the study.
study.best_params

{'model': 'GBR',
 'loss': 'ls',
 'learning_rate': 0.13896115733294043,
 'criterion': 'friedman_mse'}

### Проверка на тестовой выборке

In [17]:
# Rebuild the winning configuration found by the study (gradient boosting).
# learning_rate is the tuned value (0.13896...) rounded to two decimals.
loss = 'ls'
learning_rate = 0.14
n_estimators = 200
criterion = 'friedman_mse'
# random_state=SEED fixes the model's internal randomness so the
# cross-validation score below and the later test score are reproducible.
regressor = GradientBoostingRegressor(loss=loss,
                                      learning_rate=learning_rate,
                                      n_estimators=n_estimators,
                                      criterion=criterion,
                                      random_state=SEED,
                                      )

# Mean R^2 over k_fold folds of the training data for the selected model.
cross_val_score(regressor, X_train, y_train, cv=k_fold, scoring='r2').mean()

0.9006745191552146

In [20]:
# Fit the selected model on the full training set and report its score on the
# held-out test samples (fit returns the estimator, so the calls are chained).
regressor.fit(X_train, y_train).score(X_test, y_test)

0.9229820010785151

На тестовой выборке достаточно неплохой результат

При этом выбраны очень узкие пространства параметров для оптимизации.

Увеличив диапазоны параметров, можно повысить качество модели, но при этом значительно увеличится время оптимизации.