## Модули

In [91]:
import pandas as pd
import seaborn as sns
import optuna
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error as mse
from matplotlib import pyplot as plt
from sklearn.metrics import r2_score

## Постановка задачи

Мы предскажем стоимость подержанного авто по его описанию. Но теперь подойдём к задаче, вооружившись более сильным методом - градиентным бустингом. Снова предположим, что вы работаете в компании, которая занимается размещением объявлений. Но в этот раз вы собрали выборку объявлений, размещённых в США, и должны предсказать цену в долларах. 

В качестве метрики качества у нас будет $RMSE$, и нам необходимо получить модель с $RMSE < 7550$.

## Загрузка данных 

Данные уже подготовлены:
- удалены аномалии в таргете (замена 99 перцентилем)
- пропуски заполнены наиболее популярными значениями
- категориальные признаки не тронуты

In [143]:
# Load the preprocessed listings; the `target` column (split off in the next
# cell) is the value the model is trained to predict.
df = pd.read_csv(filepath_or_buffer='car_pricing_preprocessed.csv')

Разделим данные на трейн и тест

In [144]:
# Separate the features from the regression target. The column is dropped by
# keyword: the positional form `df.drop('target', 1)` is deprecated (it raises
# a FutureWarning on pandas 1.x) and is a TypeError on pandas 2.x.
X = df.drop(columns='target')
y = df['target']

# Hold out 20% of the rows as the test split; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

  X = df.drop('target', 1)


In [94]:
# Replace every column with ordinal codes. Categories are learned from the
# training split only; a value that first appears in the test split is encoded
# as -1 instead of raising.
# NOTE: this cell overwrites X_train / X_test, so re-running it on its own
# would re-encode the already encoded arrays -- re-run the split cell first.
encoding = OrdinalEncoder(unknown_value=-1, handle_unknown='use_encoded_value')
encoding.fit(X_train)
X_train, X_test = encoding.transform(X_train), encoding.transform(X_test)

Неплохим начальным приближением для обучения модели может быть запуск алгоритма без передачи в него параметров

In [135]:
# Baseline: CatBoost with no arguments passed, i.e. library defaults.
cbr = CatBoostRegressor()
cbr.fit(X_train, y_train)

# RMSE on the held-out split (square root of the mean squared error).
test_pred = cbr.predict(X_test)
baseline_rmse = mse(y_test, test_pred) ** .5
print(baseline_rmse)

# R^2 on the training split, to see how closely the model fits the data it saw.
train_pred = cbr.predict(X_train)
baseline_r2_train = r2_score(y_train, train_pred)
print(baseline_r2_train)

7891.952156022228
0.8203702896524507


### Далее поработайте с optuna:
- создайте сетку из основных гиперпараметров для алгоритма
- дополните функцию objective проверкой качества модели на тестовой выборке
- не забудьте, что objective должна возвращать итоговое качество модели

И запустите оптимизатор

In [140]:
def objective(trial):
    """Optuna objective: fit CatBoost with sampled hyperparameters, return RMSE.

    The template placeholders (`model = ###`, `accuracy = ###`) were syntax
    errors; this is a working implementation. The parameter names match the
    ones recorded in the study log (objective, depth, n_estimators,
    learning_rate); the search ranges are a reasonable choice covering the
    logged values -- adjust them as needed.

    Args:
        trial: the `optuna.trial.Trial` used to sample hyperparameters.

    Returns:
        RMSE of the fitted model on (X_test, y_test). Lower is better, so the
        study must be created with direction='minimize'.

    Note:
        Following the task statement, candidates are scored on the test split.
        Because that split drives the selection, the best value is an
        optimistic estimate of real out-of-sample error.
    """
    param = {
        # Single-choice categorical so the loss shows up in trial.params.
        "objective": trial.suggest_categorical("objective", ["RMSE"]),
        "depth": trial.suggest_int("depth", 5, 8),
        "n_estimators": trial.suggest_int("n_estimators", 500, 3000),
        # Log scale: learning rates matter multiplicatively.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
    }

    # verbose=0 silences per-iteration training logs for every trial.
    model = CatBoostRegressor(**param, verbose=0)
    model.fit(X_train, y_train)

    # Same metric as the baseline cell: square root of the test MSE.
    rmse = mse(y_test, model.predict(X_test)) ** .5

    return rmse

In [141]:
%%time
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=500, timeout=600, n_jobs=-1)

[32m[I 2022-08-22 14:51:08,005][0m A new study created in memory with name: no-name-332d3b59-f252-4522-9cab-9a8886c6f2c6[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.Custom logger is already specified. Specify more than one logger at same time is not thread safe.

[32m[I 2022-08-22 14:52:03,959][0m Trial 25 finished with value: 7520.746103037355 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2897, 'learning_rate': 0.07291719116567735}. Best is trial 15 with value: 7476.929221375097.[0m
[32m[I 2022-08-22 14:52:07,396][0m Trial 29 finished with value: 7618.350569675734 and parameters: {'objective': 'RMSE', 'depth': 7, 'n_estimators': 2289, 'learning_rate': 0.06850753120327935}. Best is trial 15 with value: 7476.929221375097.[0m
[32m[I 2022-08-22 14:52:09,484][0m Trial 28 finished with value: 7625.325952200094 and parameters: {'objective': 'RMSE', 'depth': 7, 'n_estimators': 2996, 'learning_rate': 0.06525382119446618}. Best is trial 15 with value: 7476.929221375097.[0m
[32m[I 2022-08-22 14:52:10,823][0m Trial 30 finished with value: 7630.66893306071 and parameters: {'objective': 'RMSE', 'depth': 7, 'n_estimators': 2445, 'learning_rate': 0.059841261832796484}. Best is trial 15 with value: 7476.929221375097.[0m
[32m[I 

[32m[I 2022-08-22 14:53:14,826][0m Trial 58 finished with value: 7864.02483661008 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2047, 'learning_rate': 0.16453822326793485}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 2022-08-22 14:53:21,971][0m Trial 59 finished with value: 7625.715021720525 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2095, 'learning_rate': 0.1781434424897509}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 2022-08-22 14:53:23,763][0m Trial 60 finished with value: 7598.210341005203 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2121, 'learning_rate': 0.18776567207351919}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 2022-08-22 14:53:23,943][0m Trial 61 finished with value: 7697.206667509515 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2130, 'learning_rate': 0.15857044233934914}. Best is trial 45 with value: 7372.664077418726.[0m
Custom log

[32m[I 2022-08-22 14:54:17,853][0m Trial 89 finished with value: 7587.703519519214 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2483, 'learning_rate': 0.04861034877597294}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 2022-08-22 14:54:30,069][0m Trial 90 finished with value: 7450.08652754123 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2449, 'learning_rate': 0.049378709595340076}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 2022-08-22 14:54:31,410][0m Trial 91 finished with value: 7550.718928527533 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2469, 'learning_rate': 0.04012177473162759}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 2022-08-22 14:54:33,250][0m Trial 93 finished with value: 7587.5583102272985 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2441, 'learning_rate': 0.0496134828370788}. Best is trial 45 with value: 7372.664077418726.[0m
[32m[I 

[32m[I 2022-08-22 14:55:48,125][0m Trial 122 finished with value: 7557.83074834727 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2542, 'learning_rate': 0.054345637414041026}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:55:51,187][0m Trial 123 finished with value: 7495.748005918362 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2537, 'learning_rate': 0.05493425733540716}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:55:53,215][0m Trial 125 finished with value: 7503.192510496138 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2675, 'learning_rate': 0.09223869397108671}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:55:53,587][0m Trial 126 finished with value: 7439.266999250298 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2640, 'learning_rate': 0.06836699731579357}. Best is trial 110 with value: 7369.925521565112.[0m


[32m[I 2022-08-22 14:56:59,065][0m Trial 157 finished with value: 7639.056100458166 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 1393, 'learning_rate': 0.05146135938941713}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:57:02,070][0m Trial 151 finished with value: 7404.263266402815 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2833, 'learning_rate': 0.047116592796634646}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:57:05,134][0m Trial 153 finished with value: 7511.717794602177 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2867, 'learning_rate': 0.03959899129789085}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:57:05,355][0m Trial 152 finished with value: 7455.90855919724 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2876, 'learning_rate': 0.03891585646334435}. Best is trial 110 with value: 7369.925521565112.[0m


[32m[I 2022-08-22 14:58:21,945][0m Trial 187 finished with value: 7536.9330349497795 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2593, 'learning_rate': 0.0577962367090374}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:58:22,796][0m Trial 188 finished with value: 7491.396729828956 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2592, 'learning_rate': 0.037846238826091394}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:58:26,955][0m Trial 189 finished with value: 7514.5174506559515 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2601, 'learning_rate': 0.044326069565152657}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:58:28,038][0m Trial 190 finished with value: 7484.861208055789 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2604, 'learning_rate': 0.037722060173226026}. Best is trial 110 with value: 7369.925521565112.

[32m[I 2022-08-22 14:59:50,800][0m Trial 220 finished with value: 7488.754357556243 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2859, 'learning_rate': 0.04209908358075723}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:59:51,389][0m Trial 219 finished with value: 7459.9289095361255 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2905, 'learning_rate': 0.04688255059458876}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 14:59:59,328][0m Trial 221 finished with value: 7525.640121017343 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2993, 'learning_rate': 0.06630176514111938}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 15:00:01,181][0m Trial 222 finished with value: 7523.404149279698 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2971, 'learning_rate': 0.06554958666182314}. Best is trial 110 with value: 7369.925521565112.[0m

[32m[I 2022-08-22 15:01:11,333][0m Trial 250 finished with value: 7514.399386608373 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2797, 'learning_rate': 0.050750664818474026}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 15:01:12,941][0m Trial 253 finished with value: 7490.434743745271 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2513, 'learning_rate': 0.05350620318894378}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 15:01:13,817][0m Trial 252 finished with value: 7502.527630605924 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2784, 'learning_rate': 0.05278843500480975}. Best is trial 110 with value: 7369.925521565112.[0m
[32m[I 2022-08-22 15:01:14,948][0m Trial 254 finished with value: 7496.46694936107 and parameters: {'objective': 'RMSE', 'depth': 8, 'n_estimators': 2786, 'learning_rate': 0.05490239685097793}. Best is trial 110 with value: 7369.925521565112.[0m


CPU times: user 1h 31min 49s, sys: 1min 31s, total: 1h 33min 20s
Wall time: 10min 13s


### Теперь попробуйте улучшить результат самостоятельно: для этого можно пересмотреть подход к работе с категориальными признаками

Домашнюю работу считаем принятой, если вы получите результат на тесте ниже 7300