In [1]:
import pandas as pd
import numpy as np

import catboost as cb

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.exceptions import NotFittedError
from raif_hack.data_transformers import SmoothedTargetEncoding

from raif_hack.model import BenchmarkModel
from raif_hack.settings import MODEL_PARAMS, LOGGING_CONFIG, NUM_FEATURES, CATEGORICAL_OHE_FEATURES,CATEGORICAL_STE_FEATURES,TARGET
from raif_hack.utils import PriceTypeEnum
from raif_hack.metrics import metrics_stat
from raif_hack.features import prepare_categorical

In [2]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
def deviation_metric_one_sample(y_true, y_pred):
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9

In [4]:
def deviation_metric(y_true, y_pred):
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()

In [5]:
X_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
y_offer = train_df[train_df.price_type == PriceTypeEnum.OFFER_PRICE][TARGET]
X_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
y_manual = train_df[train_df.price_type == PriceTypeEnum.MANUAL_PRICE][TARGET]


In [45]:
MODEL_PARAMS = dict(
            n_estimators=2000,
            learning_rate=0.01,
            reg_alpha=1,
            num_leaves=40,
            min_child_samples=5,
            importance_type="gain",
            n_jobs=1,
            random_state=563,
        )

In [6]:
model = cb.CatBoostRegressor( task_type="GPU",
                           devices='0:1')

In [7]:
model.fit(X_offer, y_offer, cat_features=[67, 68, 69])

Learning rate set to 0.0867
0:	learn: 167861.0102505	total: 25.9ms	remaining: 25.8s
1:	learn: 163458.2226067	total: 48.6ms	remaining: 24.3s
2:	learn: 159691.5182340	total: 71.9ms	remaining: 23.9s
3:	learn: 156473.6402614	total: 95.3ms	remaining: 23.7s
4:	learn: 153739.3564704	total: 118ms	remaining: 23.4s
5:	learn: 151415.2287962	total: 141ms	remaining: 23.3s
6:	learn: 147417.1160428	total: 167ms	remaining: 23.7s
7:	learn: 143948.3986601	total: 189ms	remaining: 23.4s
8:	learn: 140965.0480867	total: 212ms	remaining: 23.3s
9:	learn: 138399.3409781	total: 234ms	remaining: 23.2s
10:	learn: 136162.6153661	total: 258ms	remaining: 23.2s
11:	learn: 135261.4788991	total: 278ms	remaining: 22.9s
12:	learn: 134503.5230172	total: 297ms	remaining: 22.6s
13:	learn: 132900.7896111	total: 320ms	remaining: 22.5s
14:	learn: 131470.8401840	total: 339ms	remaining: 22.2s
15:	learn: 130253.4922727	total: 356ms	remaining: 21.9s
16:	learn: 129110.9943091	total: 373ms	remaining: 21.6s
17:	learn: 127879.8712220	

151:	learn: 105385.3696631	total: 3s	remaining: 16.7s
152:	learn: 105381.7241501	total: 3.01s	remaining: 16.7s
153:	learn: 105334.5015088	total: 3.03s	remaining: 16.7s
154:	learn: 105322.1382209	total: 3.05s	remaining: 16.6s
155:	learn: 105308.4540583	total: 3.07s	remaining: 16.6s
156:	learn: 105300.8705259	total: 3.09s	remaining: 16.6s
157:	learn: 105241.0674145	total: 3.11s	remaining: 16.6s
158:	learn: 105215.5389743	total: 3.13s	remaining: 16.6s
159:	learn: 105186.6394217	total: 3.15s	remaining: 16.5s
160:	learn: 105174.4858915	total: 3.17s	remaining: 16.5s
161:	learn: 105159.6327600	total: 3.19s	remaining: 16.5s
162:	learn: 105119.8054440	total: 3.21s	remaining: 16.5s
163:	learn: 105110.4782279	total: 3.23s	remaining: 16.5s
164:	learn: 105081.2389291	total: 3.25s	remaining: 16.4s
165:	learn: 104968.5662749	total: 3.27s	remaining: 16.4s
166:	learn: 104926.7660659	total: 3.29s	remaining: 16.4s
167:	learn: 104892.7440704	total: 3.31s	remaining: 16.4s
168:	learn: 104877.8882265	total: 

301:	learn: 100897.3858623	total: 5.97s	remaining: 13.8s
302:	learn: 100869.9848020	total: 5.99s	remaining: 13.8s
303:	learn: 100867.4569571	total: 6.01s	remaining: 13.8s
304:	learn: 100835.4304916	total: 6.03s	remaining: 13.7s
305:	learn: 100805.6331198	total: 6.05s	remaining: 13.7s
306:	learn: 100782.8366816	total: 6.07s	remaining: 13.7s
307:	learn: 100780.2147365	total: 6.09s	remaining: 13.7s
308:	learn: 100737.6348600	total: 6.11s	remaining: 13.7s
309:	learn: 100708.5831221	total: 6.13s	remaining: 13.6s
310:	learn: 100680.4575874	total: 6.15s	remaining: 13.6s
311:	learn: 100668.2733884	total: 6.17s	remaining: 13.6s
312:	learn: 100661.9531017	total: 6.19s	remaining: 13.6s
313:	learn: 100622.2594101	total: 6.21s	remaining: 13.6s
314:	learn: 100613.3196298	total: 6.23s	remaining: 13.5s
315:	learn: 100549.8413046	total: 6.25s	remaining: 13.5s
316:	learn: 100514.6676491	total: 6.27s	remaining: 13.5s
317:	learn: 100502.7932163	total: 6.29s	remaining: 13.5s
318:	learn: 100477.5893523	tota

455:	learn: 97575.8488676	total: 9.39s	remaining: 11.2s
456:	learn: 97535.6490166	total: 9.42s	remaining: 11.2s
457:	learn: 97523.3568689	total: 9.45s	remaining: 11.2s
458:	learn: 97508.9132404	total: 9.47s	remaining: 11.2s
459:	learn: 97490.8369424	total: 9.5s	remaining: 11.2s
460:	learn: 97485.5359263	total: 9.53s	remaining: 11.1s
461:	learn: 97473.8526694	total: 9.55s	remaining: 11.1s
462:	learn: 97473.2374591	total: 9.57s	remaining: 11.1s
463:	learn: 97471.4768358	total: 9.59s	remaining: 11.1s
464:	learn: 97469.2610061	total: 9.61s	remaining: 11.1s
465:	learn: 97452.9233348	total: 9.64s	remaining: 11s
466:	learn: 97447.8854170	total: 9.66s	remaining: 11s
467:	learn: 97382.4739031	total: 9.69s	remaining: 11s
468:	learn: 97369.5816024	total: 9.71s	remaining: 11s
469:	learn: 97353.0218792	total: 9.73s	remaining: 11s
470:	learn: 97338.7833729	total: 9.75s	remaining: 11s
471:	learn: 97306.9833670	total: 9.78s	remaining: 10.9s
472:	learn: 97291.0894885	total: 9.8s	remaining: 10.9s
473:	l

612:	learn: 95323.7640622	total: 12.7s	remaining: 8.03s
613:	learn: 95321.0021865	total: 12.7s	remaining: 8.01s
614:	learn: 95311.7237443	total: 12.8s	remaining: 7.99s
615:	learn: 95310.6290945	total: 12.8s	remaining: 7.97s
616:	learn: 95304.9510364	total: 12.8s	remaining: 7.95s
617:	learn: 95304.3422876	total: 12.8s	remaining: 7.93s
618:	learn: 95300.9199214	total: 12.8s	remaining: 7.91s
619:	learn: 95293.8599437	total: 12.9s	remaining: 7.89s
620:	learn: 95280.5980499	total: 12.9s	remaining: 7.87s
621:	learn: 95267.0272569	total: 12.9s	remaining: 7.84s
622:	learn: 95247.3942877	total: 12.9s	remaining: 7.82s
623:	learn: 95245.2802778	total: 12.9s	remaining: 7.8s
624:	learn: 95236.5268560	total: 13s	remaining: 7.78s
625:	learn: 95234.9962043	total: 13s	remaining: 7.76s
626:	learn: 95213.1036570	total: 13s	remaining: 7.74s
627:	learn: 95198.6116674	total: 13s	remaining: 7.72s
628:	learn: 95192.2918546	total: 13s	remaining: 7.7s
629:	learn: 95187.4620877	total: 13.1s	remaining: 7.67s
630:

764:	learn: 93744.2932115	total: 15.9s	remaining: 4.88s
765:	learn: 93736.3982418	total: 15.9s	remaining: 4.86s
766:	learn: 93714.6134141	total: 15.9s	remaining: 4.84s
767:	learn: 93710.8104334	total: 15.9s	remaining: 4.82s
768:	learn: 93695.4564756	total: 16s	remaining: 4.8s
769:	learn: 93676.5402286	total: 16s	remaining: 4.78s
770:	learn: 93662.1280123	total: 16s	remaining: 4.76s
771:	learn: 93660.2957479	total: 16s	remaining: 4.74s
772:	learn: 93631.9327204	total: 16.1s	remaining: 4.72s
773:	learn: 93630.9486065	total: 16.1s	remaining: 4.7s
774:	learn: 93630.5945313	total: 16.1s	remaining: 4.67s
775:	learn: 93623.6950038	total: 16.1s	remaining: 4.65s
776:	learn: 93598.8785376	total: 16.1s	remaining: 4.63s
777:	learn: 93597.6023820	total: 16.2s	remaining: 4.61s
778:	learn: 93596.9929464	total: 16.2s	remaining: 4.59s
779:	learn: 93582.3236245	total: 16.2s	remaining: 4.57s
780:	learn: 93567.8031072	total: 16.2s	remaining: 4.55s
781:	learn: 93558.4498048	total: 16.2s	remaining: 4.53s
78

914:	learn: 92266.1298583	total: 19s	remaining: 1.77s
915:	learn: 92259.0226024	total: 19.1s	remaining: 1.75s
916:	learn: 92236.1699888	total: 19.1s	remaining: 1.73s
917:	learn: 92215.2731785	total: 19.1s	remaining: 1.71s
918:	learn: 92207.2895900	total: 19.1s	remaining: 1.69s
919:	learn: 92204.1488367	total: 19.2s	remaining: 1.67s
920:	learn: 92203.7046811	total: 19.2s	remaining: 1.64s
921:	learn: 92201.1507450	total: 19.2s	remaining: 1.62s
922:	learn: 92189.5117264	total: 19.2s	remaining: 1.6s
923:	learn: 92188.7660600	total: 19.2s	remaining: 1.58s
924:	learn: 92188.5175031	total: 19.2s	remaining: 1.56s
925:	learn: 92168.9217125	total: 19.3s	remaining: 1.54s
926:	learn: 92168.3186991	total: 19.3s	remaining: 1.52s
927:	learn: 92165.2771236	total: 19.3s	remaining: 1.5s
928:	learn: 92163.9811175	total: 19.3s	remaining: 1.48s
929:	learn: 92160.3998140	total: 19.3s	remaining: 1.46s
930:	learn: 92150.0994844	total: 19.4s	remaining: 1.44s
931:	learn: 92149.4381501	total: 19.4s	remaining: 1.

<catboost.core.CatBoostRegressor at 0x7f737c30a940>

In [9]:
 def find_corr_coefficient(model, X_manual, y_manual):
        """Вычисление корректирующего коэффициента

        :param X_manual: pd.DataFrame с ручными оценками
        :param y_manual: pd.Series - цены ручника
        """
        predictions = model.predict(X_manual)
        deviation = ((y_manual - predictions)/predictions).median()
        return deviation

In [11]:
corr_coef = find_corr_coefficient(model, X_manual, y_manual)

In [14]:
predictions_offer = model.predict(X_offer)
metrics = metrics_stat(y_offer.values, predictions_offer/(1+corr_coef))

In [15]:
predictions_manual = model.predict(X_manual)
metrics = metrics_stat(y_manual.values, predictions_manual)

In [16]:
X_sub = test_df[NUM_FEATURES+CATEGORICAL_OHE_FEATURES+CATEGORICAL_STE_FEATURES]
predictions = model.predict()
corrected_price = predictions * (1 + corr_coef)

NameError: name 'predictions' is not defined