In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
# нужные библиотеки
import numpy as np
import pandas as pd
import random
import numpy as np

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from preprocessing import preprocess_df

import lightgbm as lgbm

pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

In [4]:
THRESHOLD = 0.15
NEGATIVE_WEIGHT = 1.1

def deviation_metric_one_sample(y_true: typing.Union[float, int], y_pred: typing.Union[float, int]) -> float:
    """
    Реализация кастомной метрики для хакатона.

    :param y_true: float, реальная цена
    :param y_pred: float, предсказанная цена
    :return: float, значение метрики
    """
    deviation = (y_pred - y_true) / np.maximum(1e-8, y_true)
    if np.abs(deviation) <= THRESHOLD:
        return 0
    elif deviation <= - 4 * THRESHOLD:
        return 9 * NEGATIVE_WEIGHT
    elif deviation < -THRESHOLD:
        return NEGATIVE_WEIGHT * ((deviation / THRESHOLD) + 1) ** 2
    elif deviation < 4 * THRESHOLD:
        return ((deviation / THRESHOLD) - 1) ** 2
    else:
        return 9


def deviation_metric(y_true: np.array, y_pred: np.array) -> float:
    return np.array([deviation_metric_one_sample(y_true[n], y_pred[n]) for n in range(len(y_true))]).mean()


In [20]:
# загрузка данных
train = pd.read_csv('../../../Данные хакатона Raifhack DS/data/train.csv')
test = pd.read_csv('../../../Данные хакатона Raifhack DS/data/test.csv')

test_submission = pd.read_csv('../../../Данные хакатона Raifhack DS/data/test_submission.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [21]:
train, target = preprocess_df(train)
test, _= preprocess_df(test, use_target=False)

In [19]:
# подбор параметров для LightGBM
import optuna
from optuna.samplers import TPESampler
sampler = TPESampler(seed=13)

def create_model(trial):
    num_leaves = trial.suggest_int("num_leaves", 2, 1500)
    n_estimators = trial.suggest_int("n_estimators", 10, 2000)
    max_depth = trial.suggest_int('max_depth', 2, 25)
    min_child_samples = trial.suggest_int('min_child_samples', 2, 300)
    learning_rate = trial.suggest_uniform('learning_rate', 0.00001, 0.99)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 2, 300)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.00001, 1.0)
    
    model = lgbm.LGBMRegressor(
        num_leaves=num_leaves,
        n_estimators=n_estimators, 
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        learning_rate=learning_rate,
        feature_fraction=feature_fraction,
        random_state=13,
        n_jobs=-1,
        objective='regression_l1'
)
    return model

def objective(trial):
    model = create_model(trial)
    X_train, X_test, y_train, y_test = train_test_split(train, target, random_state=random.randint(1, 10000))
    model.fit(X_train, y_train)
    result = model.predict(X_test)
    score = deviation_metric(np.exp(y_test), np.exp(result))
    return score

In [20]:
study = optuna.create_study(direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=100)
params_lgbm = study.best_params

[32m[I 2021-09-25 21:49:15,031][0m A new study created in memory with name: no-name-0318f32f-1e2e-4934-98b9-d819afcb8107[0m
[32m[I 2021-09-25 21:49:15,722][0m Trial 0 finished with value: 1.8648288530003048 and parameters: {'num_leaves': 1167, 'n_estimators': 482, 'max_depth': 21, 'min_child_samples': 290, 'learning_rate': 0.9628753767547054, 'min_data_in_leaf': 137, 'feature_fraction': 0.6090463723366503}. Best is trial 0 with value: 1.8648288530003048.[0m




[32m[I 2021-09-25 21:49:23,809][0m Trial 1 finished with value: 1.279002387915969 and parameters: {'num_leaves': 1164, 'n_estimators': 1287, 'max_depth': 19, 'min_child_samples': 12, 'learning_rate': 0.29547199168557875, 'min_data_in_leaf': 19, 'feature_fraction': 0.857062371977773}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:49:25,570][0m Trial 2 finished with value: 1.5856707639814585 and parameters: {'num_leaves': 560, 'n_estimators': 1363, 'max_depth': 8, 'min_child_samples': 105, 'learning_rate': 0.009328548252458909, 'min_data_in_leaf': 109, 'feature_fraction': 0.9490946907363935}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:49:26,124][0m Trial 3 finished with value: 1.4937792854949226 and parameters: {'num_leaves': 328, 'n_estimators': 645, 'max_depth': 24, 'min_child_samples': 11, 'learning_rate': 0.06444304082671892, 'min_data_in_leaf': 190, 'feature_fraction': 0.8738147051451058}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:49:27,425][0m Trial 4 finished with value: 1.5980713878620614 and parameters: {'num_leaves': 15, 'n_estimators': 1496, 'max_depth': 21, 'min_child_samples': 24, 'learning_rate': 0.6498942166976422, 'min_data_in_leaf': 154, 'feature_fraction': 0.4798885926624682}. Best is trial 1 with value: 1.279002387915969.[0m
[32m[I 2021-09-25 21:49:27,464][0m Trial 5 finished with value: 1.5536610217065976 and parameters: {'num_leaves': 1434, 'n_estimators': 10, 'max_depth': 7, 'min_child_samples': 214, 'learning_rate': 0.32134298346834816, 'min_data_in_leaf': 84, 'feature_fraction': 0.695448498098949}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:49:27,991][0m Trial 6 finished with value: 1.462981257773771 and parameters: {'num_leaves': 1378, 'n_estimators': 496, 'max_depth': 12, 'min_child_samples': 77, 'learning_rate': 0.3755461652353207, 'min_data_in_leaf': 182, 'feature_fraction': 0.7723810358929613}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:49:30,102][0m Trial 7 finished with value: 1.3668622258674148 and parameters: {'num_leaves': 103, 'n_estimators': 1375, 'max_depth': 15, 'min_child_samples': 43, 'learning_rate': 0.09777469948067719, 'min_data_in_leaf': 75, 'feature_fraction': 0.15179514471784677}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:49:32,162][0m Trial 8 finished with value: 1.3960896138954626 and parameters: {'num_leaves': 1390, 'n_estimators': 1364, 'max_depth': 7, 'min_child_samples': 172, 'learning_rate': 0.5510701645995443, 'min_data_in_leaf': 23, 'feature_fraction': 0.8397101133309667}. Best is trial 1 with value: 1.279002387915969.[0m
[32m[I 2021-09-25 21:49:32,363][0m Trial 9 finished with value: 1.6330418268942806 and parameters: {'num_leaves': 609, 'n_estimators': 298, 'max_depth': 6, 'min_child_samples': 148, 'learning_rate': 0.7049070102635132, 'min_data_in_leaf': 296, 'feature_fraction': 0.8747877538700272}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:03,214][0m Trial 10 finished with value: 1.457547416319019 and parameters: {'num_leaves': 977, 'n_estimators': 1924, 'max_depth': 16, 'min_child_samples': 238, 'learning_rate': 0.25705954100791883, 'min_data_in_leaf': 2, 'feature_fraction': 0.3819815987191735}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:04,646][0m Trial 11 finished with value: 1.53593207838843 and parameters: {'num_leaves': 868, 'n_estimators': 1041, 'max_depth': 15, 'min_child_samples': 65, 'learning_rate': 0.17784248646849737, 'min_data_in_leaf': 57, 'feature_fraction': 0.06642147817702584}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:06,403][0m Trial 12 finished with value: 1.3506296391153476 and parameters: {'num_leaves': 101, 'n_estimators': 978, 'max_depth': 18, 'min_child_samples': 8, 'learning_rate': 0.1608641645919679, 'min_data_in_leaf': 51, 'feature_fraction': 0.19474660479899453}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:09,836][0m Trial 13 finished with value: 1.3955863669254625 and parameters: {'num_leaves': 1102, 'n_estimators': 949, 'max_depth': 19, 'min_child_samples': 106, 'learning_rate': 0.4212408580200607, 'min_data_in_leaf': 35, 'feature_fraction': 0.27362337037810347}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:10,529][0m Trial 14 finished with value: 1.5047190502484427 and parameters: {'num_leaves': 651, 'n_estimators': 962, 'max_depth': 11, 'min_child_samples': 21, 'learning_rate': 0.2314911356158592, 'min_data_in_leaf': 238, 'feature_fraction': 0.2725724774800815}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:11,647][0m Trial 15 finished with value: 1.417868560125277 and parameters: {'num_leaves': 407, 'n_estimators': 1819, 'max_depth': 2, 'min_child_samples': 7, 'learning_rate': 0.5044838005056169, 'min_data_in_leaf': 48, 'feature_fraction': 0.5503536985299209}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:20,630][0m Trial 16 finished with value: 1.363145776039315 and parameters: {'num_leaves': 250, 'n_estimators': 803, 'max_depth': 25, 'min_child_samples': 70, 'learning_rate': 0.15556912056294556, 'min_data_in_leaf': 4, 'feature_fraction': 0.42015067357857266}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:22,967][0m Trial 17 finished with value: 1.4476298508953191 and parameters: {'num_leaves': 811, 'n_estimators': 1650, 'max_depth': 19, 'min_child_samples': 126, 'learning_rate': 0.3616859043064977, 'min_data_in_leaf': 85, 'feature_fraction': 0.20157457580993482}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:23,829][0m Trial 18 finished with value: 1.8878191713581394 and parameters: {'num_leaves': 1208, 'n_estimators': 1138, 'max_depth': 17, 'min_child_samples': 49, 'learning_rate': 0.928751204683859, 'min_data_in_leaf': 120, 'feature_fraction': 0.03862669142154326}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:27,275][0m Trial 19 finished with value: 1.3209863915084088 and parameters: {'num_leaves': 951, 'n_estimators': 1180, 'max_depth': 22, 'min_child_samples': 178, 'learning_rate': 0.23825967254026786, 'min_data_in_leaf': 41, 'feature_fraction': 0.6366830027790831}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:29,056][0m Trial 20 finished with value: 1.5900565552775827 and parameters: {'num_leaves': 1009, 'n_estimators': 1187, 'max_depth': 22, 'min_child_samples': 173, 'learning_rate': 0.6147800524797495, 'min_data_in_leaf': 101, 'feature_fraction': 0.6901342343327235}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:31,534][0m Trial 21 finished with value: 1.4157879236690492 and parameters: {'num_leaves': 1279, 'n_estimators': 789, 'max_depth': 19, 'min_child_samples': 228, 'learning_rate': 0.2671721797330473, 'min_data_in_leaf': 51, 'feature_fraction': 0.9675638304963605}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:37,746][0m Trial 22 finished with value: 1.4440174141386066 and parameters: {'num_leaves': 947, 'n_estimators': 1215, 'max_depth': 18, 'min_child_samples': 178, 'learning_rate': 0.15603401565172265, 'min_data_in_leaf': 23, 'feature_fraction': 0.6138302297193262}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:41,669][0m Trial 23 finished with value: 1.471257745560183 and parameters: {'num_leaves': 718, 'n_estimators': 1647, 'max_depth': 23, 'min_child_samples': 95, 'learning_rate': 0.44561901126913617, 'min_data_in_leaf': 61, 'feature_fraction': 0.7777413126936814}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:44,955][0m Trial 24 finished with value: 1.4222723468436307 and parameters: {'num_leaves': 809, 'n_estimators': 812, 'max_depth': 21, 'min_child_samples': 273, 'learning_rate': 0.28414804178516034, 'min_data_in_leaf': 20, 'feature_fraction': 0.36073961747547967}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:50,247][0m Trial 25 finished with value: 1.4047623130308087 and parameters: {'num_leaves': 1067, 'n_estimators': 1489, 'max_depth': 20, 'min_child_samples': 198, 'learning_rate': 0.008785598003291845, 'min_data_in_leaf': 38, 'feature_fraction': 0.5163440281347065}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:52,061][0m Trial 26 finished with value: 1.362706938662278 and parameters: {'num_leaves': 492, 'n_estimators': 1117, 'max_depth': 13, 'min_child_samples': 141, 'learning_rate': 0.19653070331819408, 'min_data_in_leaf': 73, 'feature_fraction': 0.7143272358429812}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:54,122][0m Trial 27 finished with value: 1.3186777562483114 and parameters: {'num_leaves': 1264, 'n_estimators': 1233, 'max_depth': 23, 'min_child_samples': 122, 'learning_rate': 0.10234004276601344, 'min_data_in_leaf': 98, 'feature_fraction': 0.6065673700069292}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:56,109][0m Trial 28 finished with value: 1.5044494152152745 and parameters: {'num_leaves': 1282, 'n_estimators': 1599, 'max_depth': 23, 'min_child_samples': 125, 'learning_rate': 0.10636388666464669, 'min_data_in_leaf': 141, 'feature_fraction': 0.6098915387402182}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:50:58,129][0m Trial 29 finished with value: 1.342949852297675 and parameters: {'num_leaves': 1233, 'n_estimators': 1276, 'max_depth': 25, 'min_child_samples': 262, 'learning_rate': 0.33588589525376306, 'min_data_in_leaf': 122, 'feature_fraction': 0.6336039502334289}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:00,683][0m Trial 30 finished with value: 1.6201140101445608 and parameters: {'num_leaves': 1150, 'n_estimators': 1473, 'max_depth': 22, 'min_child_samples': 194, 'learning_rate': 0.8265249095930645, 'min_data_in_leaf': 94, 'feature_fraction': 0.8005793569838318}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:02,494][0m Trial 31 finished with value: 1.4569552458142971 and parameters: {'num_leaves': 1261, 'n_estimators': 1300, 'max_depth': 24, 'min_child_samples': 287, 'learning_rate': 0.32476220780446413, 'min_data_in_leaf': 121, 'feature_fraction': 0.6523613996591063}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:03,792][0m Trial 32 finished with value: 1.5771780076968844 and parameters: {'num_leaves': 1176, 'n_estimators': 1258, 'max_depth': 25, 'min_child_samples': 269, 'learning_rate': 0.4175239111979021, 'min_data_in_leaf': 161, 'feature_fraction': 0.547751904744258}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:05,537][0m Trial 33 finished with value: 1.3730428473899576 and parameters: {'num_leaves': 1355, 'n_estimators': 1082, 'max_depth': 23, 'min_child_samples': 256, 'learning_rate': 0.05371173314372063, 'min_data_in_leaf': 123, 'feature_fraction': 0.898140877026526}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:06,609][0m Trial 34 finished with value: 1.537964360579399 and parameters: {'num_leaves': 1466, 'n_estimators': 1357, 'max_depth': 25, 'min_child_samples': 298, 'learning_rate': 0.2330991013311963, 'min_data_in_leaf': 220, 'feature_fraction': 0.45290031107632034}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:09,081][0m Trial 35 finished with value: 1.3576546735405952 and parameters: {'num_leaves': 1072, 'n_estimators': 1810, 'max_depth': 21, 'min_child_samples': 234, 'learning_rate': 0.10232591295061044, 'min_data_in_leaf': 166, 'feature_fraction': 0.7498493861282296}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:10,363][0m Trial 36 finished with value: 1.506199890451697 and parameters: {'num_leaves': 882, 'n_estimators': 890, 'max_depth': 22, 'min_child_samples': 95, 'learning_rate': 0.32576621577243003, 'min_data_in_leaf': 112, 'feature_fraction': 0.6579512097862885}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:12,168][0m Trial 37 finished with value: 1.5191810668456958 and parameters: {'num_leaves': 1207, 'n_estimators': 1439, 'max_depth': 24, 'min_child_samples': 248, 'learning_rate': 0.47196343468995117, 'min_data_in_leaf': 137, 'feature_fraction': 0.5663568220467351}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:13,874][0m Trial 38 finished with value: 1.387634520109774 and parameters: {'num_leaves': 1332, 'n_estimators': 625, 'max_depth': 20, 'min_child_samples': 214, 'learning_rate': 0.36902453215527775, 'min_data_in_leaf': 73, 'feature_fraction': 0.9032142144588594}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:16,500][0m Trial 39 finished with value: 1.7052538370812167 and parameters: {'num_leaves': 1465, 'n_estimators': 1557, 'max_depth': 23, 'min_child_samples': 41, 'learning_rate': 0.061615246914422134, 'min_data_in_leaf': 188, 'feature_fraction': 0.9996195844111103}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:19,508][0m Trial 40 finished with value: 1.3807547809136957 and parameters: {'num_leaves': 1054, 'n_estimators': 1272, 'max_depth': 20, 'min_child_samples': 161, 'learning_rate': 0.29425149282190005, 'min_data_in_leaf': 88, 'feature_fraction': 0.8179581369120846}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:23,101][0m Trial 41 finished with value: 1.3609572058591255 and parameters: {'num_leaves': 102, 'n_estimators': 1059, 'max_depth': 17, 'min_child_samples': 24, 'learning_rate': 0.136792941861556, 'min_data_in_leaf': 34, 'feature_fraction': 0.7303813080885686}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:24,379][0m Trial 42 finished with value: 1.3723821258472466 and parameters: {'num_leaves': 942, 'n_estimators': 639, 'max_depth': 10, 'min_child_samples': 5, 'learning_rate': 0.20856152418317916, 'min_data_in_leaf': 65, 'feature_fraction': 0.4716746329011525}. Best is trial 1 with value: 1.279002387915969.[0m




[32m[I 2021-09-25 21:51:33,372][0m Trial 43 finished with value: 1.2255143151507202 and parameters: {'num_leaves': 1149, 'n_estimators': 1161, 'max_depth': 15, 'min_child_samples': 50, 'learning_rate': 0.04185199859407067, 'min_data_in_leaf': 14, 'feature_fraction': 0.6494305066563237}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:51:49,057][0m Trial 44 finished with value: 1.3264428170093001 and parameters: {'num_leaves': 1132, 'n_estimators': 1370, 'max_depth': 14, 'min_child_samples': 60, 'learning_rate': 0.039063050415408, 'min_data_in_leaf': 9, 'feature_fraction': 0.6596739426688524}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:51:58,204][0m Trial 45 finished with value: 1.3758943178859862 and parameters: {'num_leaves': 1143, 'n_estimators': 1384, 'max_depth': 14, 'min_child_samples': 57, 'learning_rate': 0.0348845379464207, 'min_data_in_leaf': 15, 'feature_fraction': 0.6846231013881594}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:18,387][0m Trial 46 finished with value: 1.5096089904325096 and parameters: {'num_leaves': 988, 'n_estimators': 1184, 'max_depth': 15, 'min_child_samples': 32, 'learning_rate': 0.00660666402381449, 'min_data_in_leaf': 3, 'feature_fraction': 0.8529130911181614}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:19,240][0m Trial 47 finished with value: 1.3577748749673861 and parameters: {'num_leaves': 1119, 'n_estimators': 240, 'max_depth': 13, 'min_child_samples': 83, 'learning_rate': 0.09570460056055191, 'min_data_in_leaf': 35, 'feature_fraction': 0.5879897170853902}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:25,428][0m Trial 48 finished with value: 1.2964409303071112 and parameters: {'num_leaves': 1395, 'n_estimators': 1758, 'max_depth': 10, 'min_child_samples': 112, 'learning_rate': 0.10382990970252155, 'min_data_in_leaf': 13, 'feature_fraction': 0.7633241129482708}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:30,272][0m Trial 49 finished with value: 1.2828958847797474 and parameters: {'num_leaves': 1378, 'n_estimators': 1887, 'max_depth': 10, 'min_child_samples': 130, 'learning_rate': 0.13281228895514802, 'min_data_in_leaf': 28, 'feature_fraction': 0.7495347086626277}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:35,040][0m Trial 50 finished with value: 1.277217139582604 and parameters: {'num_leaves': 1423, 'n_estimators': 1930, 'max_depth': 9, 'min_child_samples': 125, 'learning_rate': 0.1267005476215353, 'min_data_in_leaf': 25, 'feature_fraction': 0.9288287640506715}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:39,786][0m Trial 51 finished with value: 1.2478460947440504 and parameters: {'num_leaves': 1410, 'n_estimators': 1953, 'max_depth': 9, 'min_child_samples': 125, 'learning_rate': 0.12435709133347866, 'min_data_in_leaf': 23, 'feature_fraction': 0.9360930765440548}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:43,632][0m Trial 52 finished with value: 1.3382092033681134 and parameters: {'num_leaves': 1406, 'n_estimators': 1999, 'max_depth': 9, 'min_child_samples': 138, 'learning_rate': 0.13085538841425712, 'min_data_in_leaf': 28, 'feature_fraction': 0.9362329884299375}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:45,697][0m Trial 53 finished with value: 1.3549906411498738 and parameters: {'num_leaves': 1480, 'n_estimators': 1778, 'max_depth': 6, 'min_child_samples': 114, 'learning_rate': 0.07407874284490626, 'min_data_in_leaf': 17, 'feature_fraction': 0.9302085601304618}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:49,358][0m Trial 54 finished with value: 1.2384535609299803 and parameters: {'num_leaves': 1343, 'n_estimators': 1912, 'max_depth': 8, 'min_child_samples': 158, 'learning_rate': 0.19615690300871735, 'min_data_in_leaf': 45, 'feature_fraction': 0.8536736803296285}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:50,859][0m Trial 55 finished with value: 1.4784131570166597 and parameters: {'num_leaves': 1328, 'n_estimators': 1993, 'max_depth': 4, 'min_child_samples': 158, 'learning_rate': 0.18681004427702738, 'min_data_in_leaf': 51, 'feature_fraction': 0.838170061222302}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:52:54,934][0m Trial 56 finished with value: 1.4132236356840813 and parameters: {'num_leaves': 1420, 'n_estimators': 1888, 'max_depth': 8, 'min_child_samples': 140, 'learning_rate': 0.16751693244301977, 'min_data_in_leaf': 26, 'feature_fraction': 0.883582370331327}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:00,431][0m Trial 57 finished with value: 1.3314579463677079 and parameters: {'num_leaves': 1496, 'n_estimators': 1718, 'max_depth': 11, 'min_child_samples': 91, 'learning_rate': 0.22231200681556396, 'min_data_in_leaf': 45, 'feature_fraction': 0.958371581799182}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:01,794][0m Trial 58 finished with value: 1.4230627904426478 and parameters: {'num_leaves': 1332, 'n_estimators': 1903, 'max_depth': 8, 'min_child_samples': 75, 'learning_rate': 0.1419888359359698, 'min_data_in_leaf': 286, 'feature_fraction': 0.9980283416596186}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:03,954][0m Trial 59 finished with value: 1.4208459831323748 and parameters: {'num_leaves': 1369, 'n_estimators': 1906, 'max_depth': 6, 'min_child_samples': 151, 'learning_rate': 0.27430514488412083, 'min_data_in_leaf': 63, 'feature_fraction': 0.8075854972943961}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:06,854][0m Trial 60 finished with value: 1.523868739651436 and parameters: {'num_leaves': 1294, 'n_estimators': 1714, 'max_depth': 7, 'min_child_samples': 17, 'learning_rate': 0.5374266550199853, 'min_data_in_leaf': 27, 'feature_fraction': 0.8739799724309208}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:13,030][0m Trial 61 finished with value: 1.3246596933098111 and parameters: {'num_leaves': 1375, 'n_estimators': 1844, 'max_depth': 10, 'min_child_samples': 108, 'learning_rate': 0.12438983539236498, 'min_data_in_leaf': 12, 'feature_fraction': 0.8453490606100437}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:27,986][0m Trial 62 finished with value: 1.2725825593850995 and parameters: {'num_leaves': 1427, 'n_estimators': 1757, 'max_depth': 12, 'min_child_samples': 135, 'learning_rate': 0.0793167681532989, 'min_data_in_leaf': 3, 'feature_fraction': 0.7736571424417895}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:53:45,861][0m Trial 63 finished with value: 1.2663670721370317 and parameters: {'num_leaves': 1425, 'n_estimators': 1963, 'max_depth': 12, 'min_child_samples': 129, 'learning_rate': 0.06387924021449339, 'min_data_in_leaf': 5, 'feature_fraction': 0.919995621046176}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:54:11,648][0m Trial 64 finished with value: 1.3684479762999806 and parameters: {'num_leaves': 1429, 'n_estimators': 1957, 'max_depth': 12, 'min_child_samples': 189, 'learning_rate': 0.03322996924576549, 'min_data_in_leaf': 2, 'feature_fraction': 0.9240425286303147}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:54:17,074][0m Trial 65 finished with value: 1.3383056002945268 and parameters: {'num_leaves': 1219, 'n_estimators': 1682, 'max_depth': 12, 'min_child_samples': 164, 'learning_rate': 0.07808240162454624, 'min_data_in_leaf': 40, 'feature_fraction': 0.9672051610331945}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:54:30,307][0m Trial 66 finished with value: 1.2743321946326613 and parameters: {'num_leaves': 1443, 'n_estimators': 1565, 'max_depth': 16, 'min_child_samples': 120, 'learning_rate': 0.18147306495439214, 'min_data_in_leaf': 18, 'feature_fraction': 0.9095210127765246}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:54:40,506][0m Trial 67 finished with value: 1.3660313428549113 and parameters: {'num_leaves': 1446, 'n_estimators': 1604, 'max_depth': 16, 'min_child_samples': 150, 'learning_rate': 0.17885730011834172, 'min_data_in_leaf': 20, 'feature_fraction': 0.9051739374669053}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:54:47,524][0m Trial 68 finished with value: 1.4035163779407274 and parameters: {'num_leaves': 1423, 'n_estimators': 1843, 'max_depth': 9, 'min_child_samples': 132, 'learning_rate': 0.009026290777955137, 'min_data_in_leaf': 11, 'feature_fraction': 0.79046409600517}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:54:52,055][0m Trial 69 finished with value: 1.6994376016479888 and parameters: {'num_leaves': 1330, 'n_estimators': 1757, 'max_depth': 11, 'min_child_samples': 121, 'learning_rate': 0.7298142868059517, 'min_data_in_leaf': 51, 'feature_fraction': 0.8270496032464169}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:55:33,296][0m Trial 70 finished with value: 1.541353690959196 and parameters: {'num_leaves': 1494, 'n_estimators': 1946, 'max_depth': 15, 'min_child_samples': 102, 'learning_rate': 0.06920773089041897, 'min_data_in_leaf': 4, 'feature_fraction': 0.8623023775462797}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:55:42,036][0m Trial 71 finished with value: 1.3563685642795607 and parameters: {'num_leaves': 1304, 'n_estimators': 1853, 'max_depth': 16, 'min_child_samples': 117, 'learning_rate': 0.2522680358172402, 'min_data_in_leaf': 35, 'feature_fraction': 0.9533333211931533}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:55:43,339][0m Trial 72 finished with value: 1.5232489785313592 and parameters: {'num_leaves': 1239, 'n_estimators': 1558, 'max_depth': 5, 'min_child_samples': 144, 'learning_rate': 0.16854594461614505, 'min_data_in_leaf': 20, 'feature_fraction': 0.9068244620449136}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:55:45,905][0m Trial 73 finished with value: 1.3217190553079614 and parameters: {'num_leaves': 1195, 'n_estimators': 1800, 'max_depth': 8, 'min_child_samples': 132, 'learning_rate': 0.21106815002696178, 'min_data_in_leaf': 46, 'feature_fraction': 0.8812425811220032}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:55:52,443][0m Trial 74 finished with value: 1.4497635761267527 and parameters: {'num_leaves': 224, 'n_estimators': 1954, 'max_depth': 13, 'min_child_samples': 169, 'learning_rate': 0.04996668644911925, 'min_data_in_leaf': 31, 'feature_fraction': 0.9756857229495234}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:03,811][0m Trial 75 finished with value: 1.364022068607569 and parameters: {'num_leaves': 1270, 'n_estimators': 1647, 'max_depth': 12, 'min_child_samples': 155, 'learning_rate': 0.085030376140282, 'min_data_in_leaf': 10, 'feature_fraction': 0.9228057292430698}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:13,324][0m Trial 76 finished with value: 1.4799939413182668 and parameters: {'num_leaves': 1443, 'n_estimators': 1723, 'max_depth': 17, 'min_child_samples': 83, 'learning_rate': 0.29765668908670617, 'min_data_in_leaf': 21, 'feature_fraction': 0.7225920886012255}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:17,107][0m Trial 77 finished with value: 1.49363189136944 and parameters: {'num_leaves': 1348, 'n_estimators': 1513, 'max_depth': 18, 'min_child_samples': 100, 'learning_rate': 0.11808918280794578, 'min_data_in_leaf': 57, 'feature_fraction': 0.7899557342769858}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:18,965][0m Trial 78 finished with value: 1.3752168014309196 and parameters: {'num_leaves': 1499, 'n_estimators': 992, 'max_depth': 9, 'min_child_samples': 185, 'learning_rate': 0.192981128838253, 'min_data_in_leaf': 42, 'feature_fraction': 0.8273822373802698}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:33,048][0m Trial 79 finished with value: 1.321883994978485 and parameters: {'num_leaves': 1401, 'n_estimators': 1859, 'max_depth': 14, 'min_child_samples': 38, 'learning_rate': 0.15520560285579638, 'min_data_in_leaf': 10, 'feature_fraction': 0.8656902388280522}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:42,634][0m Trial 80 finished with value: 1.3333284695389704 and parameters: {'num_leaves': 519, 'n_estimators': 1938, 'max_depth': 15, 'min_child_samples': 48, 'learning_rate': 0.031686074521567326, 'min_data_in_leaf': 22, 'feature_fraction': 0.9865057030735376}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:46,704][0m Trial 81 finished with value: 1.3591912290791959 and parameters: {'num_leaves': 1368, 'n_estimators': 1881, 'max_depth': 10, 'min_child_samples': 133, 'learning_rate': 0.13521025406497716, 'min_data_in_leaf': 30, 'feature_fraction': 0.7542533261871279}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:56:58,585][0m Trial 82 finished with value: 1.4447550826885687 and parameters: {'num_leaves': 1300, 'n_estimators': 1800, 'max_depth': 11, 'min_child_samples': 126, 'learning_rate': 0.254254484043325, 'min_data_in_leaf': 3, 'feature_fraction': 0.7406149636446904}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:02,629][0m Trial 83 finished with value: 1.309047178602242 and parameters: {'num_leaves': 1457, 'n_estimators': 1976, 'max_depth': 9, 'min_child_samples': 146, 'learning_rate': 0.10428408368650736, 'min_data_in_leaf': 27, 'feature_fraction': 0.7010488421200642}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:09,747][0m Trial 84 finished with value: 1.3394603125880988 and parameters: {'num_leaves': 1376, 'n_estimators': 1920, 'max_depth': 11, 'min_child_samples': 132, 'learning_rate': 0.3966130082004946, 'min_data_in_leaf': 38, 'feature_fraction': 0.9480352918192578}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:11,720][0m Trial 85 finished with value: 1.522717861850051 and parameters: {'num_leaves': 1405, 'n_estimators': 1436, 'max_depth': 7, 'min_child_samples': 15, 'learning_rate': 0.1517588599568542, 'min_data_in_leaf': 218, 'feature_fraction': 0.78154999583138}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:18,652][0m Trial 86 finished with value: 1.9178158200948432 and parameters: {'num_leaves': 1242, 'n_estimators': 1132, 'max_depth': 12, 'min_child_samples': 29, 'learning_rate': 0.9645548066925347, 'min_data_in_leaf': 16, 'feature_fraction': 0.8988459977133689}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:22,351][0m Trial 87 finished with value: 1.3460006318922526 and parameters: {'num_leaves': 1041, 'n_estimators': 1666, 'max_depth': 13, 'min_child_samples': 2, 'learning_rate': 0.07329980535886982, 'min_data_in_leaf': 77, 'feature_fraction': 0.8146773655574335}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:26,748][0m Trial 88 finished with value: 1.4293939369615312 and parameters: {'num_leaves': 1453, 'n_estimators': 1604, 'max_depth': 14, 'min_child_samples': 205, 'learning_rate': 0.22533861778000341, 'min_data_in_leaf': 56, 'feature_fraction': 0.8468272008911153}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:29,508][0m Trial 89 finished with value: 1.9241070364720199 and parameters: {'num_leaves': 1171, 'n_estimators': 1314, 'max_depth': 10, 'min_child_samples': 111, 'learning_rate': 0.0012106298041133084, 'min_data_in_leaf': 69, 'feature_fraction': 0.7726351785641045}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:49,010][0m Trial 90 finished with value: 1.3646788393660252 and parameters: {'num_leaves': 671, 'n_estimators': 1748, 'max_depth': 16, 'min_child_samples': 124, 'learning_rate': 0.19706117685444063, 'min_data_in_leaf': 8, 'feature_fraction': 0.8851446192200852}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:54,918][0m Trial 91 finished with value: 1.249647140359484 and parameters: {'num_leaves': 1405, 'n_estimators': 1788, 'max_depth': 10, 'min_child_samples': 117, 'learning_rate': 0.11482204716032343, 'min_data_in_leaf': 15, 'feature_fraction': 0.7636494980154119}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:57:57,439][0m Trial 92 finished with value: 1.3633784321631544 and parameters: {'num_leaves': 1303, 'n_estimators': 1821, 'max_depth': 9, 'min_child_samples': 90, 'learning_rate': 0.12346114683031581, 'min_data_in_leaf': 16, 'feature_fraction': 0.11011018545646606}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:00,275][0m Trial 93 finished with value: 1.4253117158689133 and parameters: {'num_leaves': 1350, 'n_estimators': 1995, 'max_depth': 7, 'min_child_samples': 138, 'learning_rate': 0.035064936015349005, 'min_data_in_leaf': 31, 'feature_fraction': 0.9229235865846321}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:02,139][0m Trial 94 finished with value: 1.3394292426808316 and parameters: {'num_leaves': 1395, 'n_estimators': 919, 'max_depth': 8, 'min_child_samples': 120, 'learning_rate': 0.09017618365145329, 'min_data_in_leaf': 24, 'feature_fraction': 0.6701728321782814}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:21,114][0m Trial 95 finished with value: 1.3704879678169686 and parameters: {'num_leaves': 1431, 'n_estimators': 1876, 'max_depth': 11, 'min_child_samples': 103, 'learning_rate': 0.060021087806730065, 'min_data_in_leaf': 2, 'feature_fraction': 0.8603363118075582}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:26,228][0m Trial 96 finished with value: 1.2557265285681447 and parameters: {'num_leaves': 1467, 'n_estimators': 1912, 'max_depth': 10, 'min_child_samples': 166, 'learning_rate': 0.17168876691495163, 'min_data_in_leaf': 42, 'feature_fraction': 0.708828363721257}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:32,976][0m Trial 97 finished with value: 1.4019915585431637 and parameters: {'num_leaves': 1480, 'n_estimators': 1693, 'max_depth': 18, 'min_child_samples': 163, 'learning_rate': 0.16839421570301558, 'min_data_in_leaf': 42, 'feature_fraction': 0.7095752631997853}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:41,740][0m Trial 98 finished with value: 1.4332307805586382 and parameters: {'num_leaves': 871, 'n_estimators': 1784, 'max_depth': 12, 'min_child_samples': 182, 'learning_rate': 0.11744610415560885, 'min_data_in_leaf': 14, 'feature_fraction': 0.8061499465737154}. Best is trial 43 with value: 1.2255143151507202.[0m




[32m[I 2021-09-25 21:58:46,672][0m Trial 99 finished with value: 1.3513143805058778 and parameters: {'num_leaves': 1465, 'n_estimators': 1923, 'max_depth': 13, 'min_child_samples': 170, 'learning_rate': 0.2374004905299345, 'min_data_in_leaf': 49, 'feature_fraction': 0.6817290030902752}. Best is trial 43 with value: 1.2255143151507202.[0m


In [21]:
params_lgbm = study.best_params
params_lgbm['objective'] = 'regression_l1'
params_lgbm

{'num_leaves': 1149,
 'n_estimators': 1161,
 'max_depth': 15,
 'min_child_samples': 50,
 'learning_rate': 0.04185199859407067,
 'min_data_in_leaf': 14,
 'feature_fraction': 0.6494305066563237,
 'objective': 'regression_l1'}

In [22]:
# заведем модель
model = lgbm.LGBMRegressor(**params_lgbm)

In [24]:
%%time

# параметры валидации, обучение будет идти на 5х10 фолдах
kFold_random_state = [13, 666, 228, 777, 42]
n_splits = 10

# финальный показатель метрики
final_loss = list()

# файл для записи финального результата
submission = test_submission.copy()
submission.iloc[:, 1] = 0


for ind_k, random_state in enumerate(kFold_random_state):
    kFold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    total_loss = list()

    for iteration, (train_index, valid_index) in enumerate(kFold.split(train, target)):

        X_train, X_valid = train.iloc[train_index].reset_index(drop=True), train.iloc[valid_index].reset_index(drop=True)
        y_train, y_valid = target[train_index], target[valid_index]

        model.fit(X_train, y_train)
        valid_pred = model.predict(X_valid)
        loss = deviation_metric(np.exp(y_valid), np.exp(valid_pred))

        predict = model.predict(test)
        submission['per_square_meter_price'] = submission['per_square_meter_price'] + np.exp(predict) / 50

        total_loss.append(np.mean(loss))

    final_loss.append(np.mean(total_loss))
    print(f'Fold({["1-10", "11-20", "21-30", "31-40", "41-50"][ind_k]}) deviation_metric: {np.mean(total_loss)}')
print(f'Final deviation_metric: {np.mean(final_loss)}')


Fold(1-10) deviation_metric: 1.2356818290967024
Fold(11-20) deviation_metric: 1.2524945826409912
Fold(21-30) deviation_metric: 1.2303302715286943
Fold(31-40) deviation_metric: 1.247937673580259
Fold(41-50) deviation_metric: 1.252988043050103
Final deviation_metric: 1.24388647997935
CPU times: user 59min 6s, sys: 4min 12s, total: 1h 3min 18s
Wall time: 8min 31s


In [29]:
np.mean(final_loss)

1.24388647997935

In [30]:
submission_raw = submission.copy()

In [31]:
# немного подрежем выбросы сверху
submission_raw['per_square_meter_price'] = submission_raw['per_square_meter_price'] * 0.9

submission_raw.loc[submission_raw['per_square_meter_price'] >= 200000, 'per_square_meter_price'] \
    = submission_raw.loc[submission_raw['per_square_meter_price'] >= 200000, 'per_square_meter_price'] * 0.9

In [32]:
submission_raw.to_csv('final_submission_log_target_and_feat_imp.csv', index=False)

In [None]:
#features_imp = pd.DataFrame({
#    'name' : train.columns,
#    'imp' : model.feature_importances_
#})
#features_imp.to_csv('9th_place_sol_feat_imp.csv', index=False)