In [1]:
import lightgbm as lgb

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score, mean_squared_error


In [2]:
df_train = pd.read_csv("processed/processed_train_2.csv")
df_test = pd.read_csv("processed/processed_test_2.csv")
df_train

Unnamed: 0,full_sq,life_sq,floor,max_floor,build_year,num_room,kitch_sq,state,area_m,raion_popul,...,railroad_1line_no,railroad_1line_yes,material_1.0,material_2.0,material_3.0,material_4.0,material_5.0,material_6.0,material_nan,price_doc
0,43,27.000000,4.0,12.559171,1879,1.909844,6.399244,2.105145,6.407578e+06,155572,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.668660e+06
1,34,19.000000,3.0,12.559171,1879,1.909844,6.399244,2.105145,9.589337e+06,115352,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.814010e+06
2,43,29.000000,2.0,12.559171,1879,1.909844,6.399244,2.105145,4.808270e+06,101708,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,5.523310e+06
3,89,50.000000,9.0,12.559171,1879,1.909844,6.399244,2.105145,1.258354e+07,178473,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.269391e+07
4,77,77.000000,4.0,12.559171,1879,1.909844,6.399244,2.105145,8.398461e+06,108171,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.582519e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30464,44,27.000000,7.0,9.000000,1975,2.000000,6.000000,3.000000,1.005305e+07,175518,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,7.170610e+06
30465,86,59.000000,3.0,9.000000,1935,4.000000,10.000000,3.000000,7.307411e+06,75377,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.422501e+07
30466,45,34.404467,10.0,20.000000,1879,1.000000,1.000000,1.000000,2.553630e+07,4001,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,6.754869e+06
30467,64,32.000000,5.0,15.000000,2003,2.000000,11.000000,2.000000,6.050065e+06,78616,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.308151e+07


In [3]:
X_train, X_val, y_train, y_val = train_test_split(df_train.drop(columns=['price_doc']),
                                                 df_train['price_doc'], test_size=0.2, random_state=42)

In [4]:
## https://neptune.ai/blog/lightgbm-parameters-guide

In [5]:
from sklearn.metrics import mean_squared_error
import optuna

def objective(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "n_estimators": trial.suggest_int("n_estimators", 600, 1000),
        "boosting": trial.suggest_categorical("boosting", ["gbdt", "rf", "dart"]),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 10),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0, 1),
        "bagging_freq": 1,
        "num_leaves": trial.suggest_int("num_leaves", 2, 2**10),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1),
        "max_depth": trial.suggest_int("max_depth", 1, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "subsample": trial.suggest_float("subsample", 0.05, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.05, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 10, 100),
#         "max_bin": trial.suggest_int("min_data_in_leaf", 128, 512), 
    }

    model = lgb.LGBMRegressor(**params)
    model.fit(X_train, y_train, verbose=False)
    predictions = model.predict(X_val)
    rmse = mean_squared_error(y_val, predictions, squared=False)
    return rmse

In [6]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

[I 2023-10-31 23:46:19,321] A new study created in memory with name: no-name-31188165-b9ef-40a2-9a69-cd4017794e9f




[I 2023-10-31 23:46:24,710] Trial 0 finished with value: 2863400.6909811897 and parameters: {'n_estimators': 884, 'boosting': 'rf', 'lambda_l2': 4.549772008900098, 'bagging_fraction': 0.6609260239396396, 'num_leaves': 369, 'feature_fraction': 0.7120774182829188, 'max_depth': 8, 'learning_rate': 0.00946101777543181, 'subsample': 0.7164299078572999, 'colsample_bytree': 0.056934028582603824, 'min_data_in_leaf': 91}. Best is trial 0 with value: 2863400.6909811897.




[I 2023-10-31 23:46:31,463] Trial 1 finished with value: 2845374.8494213633 and parameters: {'n_estimators': 648, 'boosting': 'rf', 'lambda_l2': 6.080764956985289, 'bagging_fraction': 0.7210517631410952, 'num_leaves': 676, 'feature_fraction': 0.6040203111978999, 'max_depth': 24, 'learning_rate': 0.05700731900442083, 'subsample': 0.9402151401080291, 'colsample_bytree': 0.3898370022175877, 'min_data_in_leaf': 50}. Best is trial 1 with value: 2845374.8494213633.




[I 2023-10-31 23:46:32,769] Trial 2 finished with value: 3925391.5552007584 and parameters: {'n_estimators': 829, 'boosting': 'rf', 'lambda_l2': 7.316629576915456, 'bagging_fraction': 0.7001312703763286, 'num_leaves': 437, 'feature_fraction': 0.6617248478082394, 'max_depth': 1, 'learning_rate': 0.0011236986784880388, 'subsample': 0.9355455264163016, 'colsample_bytree': 0.8985230054639697, 'min_data_in_leaf': 81}. Best is trial 1 with value: 2845374.8494213633.




[I 2023-10-31 23:46:45,022] Trial 3 finished with value: 2751805.617122333 and parameters: {'n_estimators': 730, 'boosting': 'rf', 'lambda_l2': 0.8626060741267128, 'bagging_fraction': 0.9527760014003483, 'num_leaves': 235, 'feature_fraction': 0.8074450285973257, 'max_depth': 38, 'learning_rate': 0.003488198594690326, 'subsample': 0.7195337099605792, 'colsample_bytree': 0.45951255747976044, 'min_data_in_leaf': 58}. Best is trial 3 with value: 2751805.617122333.




[I 2023-10-31 23:46:47,901] Trial 4 finished with value: 2971396.746552503 and parameters: {'n_estimators': 747, 'boosting': 'rf', 'lambda_l2': 5.020933069747892, 'bagging_fraction': 0.26729401193494884, 'num_leaves': 656, 'feature_fraction': 0.7150990039135441, 'max_depth': 8, 'learning_rate': 0.007786817825468732, 'subsample': 0.7422737784667073, 'colsample_bytree': 0.922962123715317, 'min_data_in_leaf': 81}. Best is trial 3 with value: 2751805.617122333.




[I 2023-10-31 23:46:48,773] Trial 5 finished with value: 3068661.0462123263 and parameters: {'n_estimators': 640, 'boosting': 'gbdt', 'lambda_l2': 5.195989934710149, 'bagging_fraction': 0.37156873309448024, 'num_leaves': 444, 'feature_fraction': 0.6913155837579501, 'max_depth': 1, 'learning_rate': 0.01477355018305495, 'subsample': 0.9641132306812399, 'colsample_bytree': 0.1905189642893505, 'min_data_in_leaf': 56}. Best is trial 3 with value: 2751805.617122333.




[I 2023-10-31 23:46:49,902] Trial 6 finished with value: 3190107.553936197 and parameters: {'n_estimators': 764, 'boosting': 'rf', 'lambda_l2': 1.9462338016494596, 'bagging_fraction': 0.19739168492344306, 'num_leaves': 833, 'feature_fraction': 0.6866771480137023, 'max_depth': 3, 'learning_rate': 0.04815868993090242, 'subsample': 0.2144719271451614, 'colsample_bytree': 0.19583804018552387, 'min_data_in_leaf': 89}. Best is trial 3 with value: 2751805.617122333.




[I 2023-10-31 23:46:58,019] Trial 7 finished with value: 2559889.5175262755 and parameters: {'n_estimators': 840, 'boosting': 'gbdt', 'lambda_l2': 2.1540304272924935, 'bagging_fraction': 0.4778138012297697, 'num_leaves': 909, 'feature_fraction': 0.557786250850199, 'max_depth': 32, 'learning_rate': 0.02020461429841087, 'subsample': 0.1401062272652583, 'colsample_bytree': 0.4420997404979104, 'min_data_in_leaf': 69}. Best is trial 7 with value: 2559889.5175262755.




[I 2023-10-31 23:46:59,906] Trial 8 finished with value: 3106018.5450713034 and parameters: {'n_estimators': 629, 'boosting': 'rf', 'lambda_l2': 8.488263818098266, 'bagging_fraction': 0.1934201953651662, 'num_leaves': 248, 'feature_fraction': 0.5941955951567797, 'max_depth': 5, 'learning_rate': 0.00356616910606939, 'subsample': 0.3694691996127346, 'colsample_bytree': 0.9719760053687785, 'min_data_in_leaf': 69}. Best is trial 7 with value: 2559889.5175262755.




[I 2023-10-31 23:47:07,667] Trial 9 finished with value: 2556486.5598787987 and parameters: {'n_estimators': 722, 'boosting': 'gbdt', 'lambda_l2': 4.766488000333901, 'bagging_fraction': 0.34029079298098897, 'num_leaves': 202, 'feature_fraction': 0.6918543179634338, 'max_depth': 11, 'learning_rate': 0.014685714888192957, 'subsample': 0.22102738559923885, 'colsample_bytree': 0.14555173454518583, 'min_data_in_leaf': 55}. Best is trial 9 with value: 2556486.5598787987.




[I 2023-10-31 23:47:16,332] Trial 10 finished with value: 7461566.581810966 and parameters: {'n_estimators': 977, 'boosting': 'dart', 'lambda_l2': 9.8771721499795, 'bagging_fraction': 0.0305031030090539, 'num_leaves': 8, 'feature_fraction': 0.9450830746277032, 'max_depth': 50, 'learning_rate': 0.07426478816386113, 'subsample': 0.07269914240647646, 'colsample_bytree': 0.6482027444855839, 'min_data_in_leaf': 24}. Best is trial 9 with value: 2556486.5598787987.




[I 2023-10-31 23:47:25,189] Trial 11 finished with value: 2578647.282990236 and parameters: {'n_estimators': 858, 'boosting': 'gbdt', 'lambda_l2': 3.29953639807103, 'bagging_fraction': 0.47337703582044055, 'num_leaves': 925, 'feature_fraction': 0.5272091149044836, 'max_depth': 21, 'learning_rate': 0.02737859810426921, 'subsample': 0.27846998328514994, 'colsample_bytree': 0.31766467754817707, 'min_data_in_leaf': 34}. Best is trial 9 with value: 2556486.5598787987.




[I 2023-10-31 23:47:36,429] Trial 12 finished with value: 2550037.9282531478 and parameters: {'n_estimators': 927, 'boosting': 'gbdt', 'lambda_l2': 2.7075749015503976, 'bagging_fraction': 0.46212477478478925, 'num_leaves': 1018, 'feature_fraction': 0.5268111166778772, 'max_depth': 17, 'learning_rate': 0.023186041147684088, 'subsample': 0.0833060135506827, 'colsample_bytree': 0.5731802318215027, 'min_data_in_leaf': 41}. Best is trial 12 with value: 2550037.9282531478.




[I 2023-10-31 23:47:41,117] Trial 13 finished with value: 2543674.1459705946 and parameters: {'n_estimators': 954, 'boosting': 'gbdt', 'lambda_l2': 3.4924158306742084, 'bagging_fraction': 0.3620401843801442, 'num_leaves': 11, 'feature_fraction': 0.5291591896225345, 'max_depth': 15, 'learning_rate': 0.03233506291440375, 'subsample': 0.07658616186825701, 'colsample_bytree': 0.6146341902440479, 'min_data_in_leaf': 34}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:47:45,528] Trial 14 finished with value: 2570558.405519616 and parameters: {'n_estimators': 999, 'boosting': 'gbdt', 'lambda_l2': 0.12332424275952825, 'bagging_fraction': 0.5741527188231365, 'num_leaves': 9, 'feature_fraction': 0.5207912556316326, 'max_depth': 16, 'learning_rate': 0.09548027110140685, 'subsample': 0.051875131559441845, 'colsample_bytree': 0.679681894261704, 'min_data_in_leaf': 17}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:48:14,006] Trial 15 finished with value: 2615317.5468316167 and parameters: {'n_estimators': 927, 'boosting': 'dart', 'lambda_l2': 2.9261277279830282, 'bagging_fraction': 0.43644925743024887, 'num_leaves': 717, 'feature_fraction': 0.5225209913369183, 'max_depth': 16, 'learning_rate': 0.03207684070522562, 'subsample': 0.3964016896950622, 'colsample_bytree': 0.6070202159806803, 'min_data_in_leaf': 37}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:48:24,690] Trial 16 finished with value: 2602195.2760014185 and parameters: {'n_estimators': 918, 'boosting': 'gbdt', 'lambda_l2': 3.562553785516128, 'bagging_fraction': 0.5325899597695471, 'num_leaves': 595, 'feature_fraction': 0.61112974183797, 'max_depth': 30, 'learning_rate': 0.03991988316424315, 'subsample': 0.050242877861677894, 'colsample_bytree': 0.7464574402119944, 'min_data_in_leaf': 39}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:48:36,072] Trial 17 finished with value: 2678000.3063820023 and parameters: {'n_estimators': 941, 'boosting': 'gbdt', 'lambda_l2': 1.4948097864672416, 'bagging_fraction': 0.3549354290413654, 'num_leaves': 1012, 'feature_fraction': 0.5755520917911177, 'max_depth': 17, 'learning_rate': 0.026184462237095498, 'subsample': 0.15940048568692783, 'colsample_bytree': 0.5543114657470196, 'min_data_in_leaf': 14}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:48:42,232] Trial 18 finished with value: 2739663.1138139926 and parameters: {'n_estimators': 960, 'boosting': 'gbdt', 'lambda_l2': 2.861526647515919, 'bagging_fraction': 0.0874015384252001, 'num_leaves': 127, 'feature_fraction': 0.5192385818600354, 'max_depth': 29, 'learning_rate': 0.05064150902987403, 'subsample': 0.32228453015394276, 'colsample_bytree': 0.5257517283237202, 'min_data_in_leaf': 28}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:48:58,171] Trial 19 finished with value: 2587231.8313773023 and parameters: {'n_estimators': 894, 'boosting': 'dart', 'lambda_l2': 3.995969944008705, 'bagging_fraction': 0.27919570211973976, 'num_leaves': 771, 'feature_fraction': 0.5026467374095397, 'max_depth': 12, 'learning_rate': 0.08436016549192175, 'subsample': 0.44770591263499804, 'colsample_bytree': 0.7606151643361236, 'min_data_in_leaf': 45}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:12,905] Trial 20 finished with value: 2652836.2855748087 and parameters: {'n_estimators': 795, 'boosting': 'gbdt', 'lambda_l2': 2.4035399535438233, 'bagging_fraction': 0.5899680518096617, 'num_leaves': 543, 'feature_fraction': 0.6318935112482494, 'max_depth': 38, 'learning_rate': 0.037407721170942376, 'subsample': 0.27894074291354787, 'colsample_bytree': 0.8073792675612996, 'min_data_in_leaf': 23}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:16,250] Trial 21 finished with value: 2549954.6279778695 and parameters: {'n_estimators': 694, 'boosting': 'gbdt', 'lambda_l2': 4.507272078798394, 'bagging_fraction': 0.3911332261532661, 'num_leaves': 153, 'feature_fraction': 0.7720274440732794, 'max_depth': 11, 'learning_rate': 0.01865793665410861, 'subsample': 0.1865505481107415, 'colsample_bytree': 0.5846927549661936, 'min_data_in_leaf': 63}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:21,539] Trial 22 finished with value: 2545921.1296109846 and parameters: {'n_estimators': 691, 'boosting': 'gbdt', 'lambda_l2': 3.9454682121534046, 'bagging_fraction': 0.42354930339467983, 'num_leaves': 95, 'feature_fraction': 0.7790443722325884, 'max_depth': 21, 'learning_rate': 0.018877941831227717, 'subsample': 0.15076545562924135, 'colsample_bytree': 0.5824325433558268, 'min_data_in_leaf': 67}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:26,618] Trial 23 finished with value: 2554513.4585848358 and parameters: {'n_estimators': 683, 'boosting': 'gbdt', 'lambda_l2': 3.9690790473884956, 'bagging_fraction': 0.39757677154489823, 'num_leaves': 106, 'feature_fraction': 0.7820389214847312, 'max_depth': 22, 'learning_rate': 0.016910463826061886, 'subsample': 0.169918802510154, 'colsample_bytree': 0.6819859252964456, 'min_data_in_leaf': 65}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:29,782] Trial 24 finished with value: 2586460.583877783 and parameters: {'n_estimators': 685, 'boosting': 'gbdt', 'lambda_l2': 5.930084008967265, 'bagging_fraction': 0.2982322966891817, 'num_leaves': 104, 'feature_fraction': 0.7770614477417034, 'max_depth': 12, 'learning_rate': 0.013874873655126574, 'subsample': 0.2252602695426389, 'colsample_bytree': 0.5936015247003733, 'min_data_in_leaf': 63}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:34,744] Trial 25 finished with value: 2588932.0307640834 and parameters: {'n_estimators': 692, 'boosting': 'gbdt', 'lambda_l2': 3.6563584384747654, 'bagging_fraction': 0.4097618634854202, 'num_leaves': 318, 'feature_fraction': 0.8401694120224635, 'max_depth': 20, 'learning_rate': 0.032829281895816104, 'subsample': 0.1430157708014631, 'colsample_bytree': 0.49159105076464366, 'min_data_in_leaf': 77}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:41,292] Trial 26 finished with value: 2954829.5094460016 and parameters: {'n_estimators': 607, 'boosting': 'dart', 'lambda_l2': 4.152728853399619, 'bagging_fraction': 0.20066454240201517, 'num_leaves': 165, 'feature_fraction': 0.7455844675158699, 'max_depth': 28, 'learning_rate': 0.02223154872572287, 'subsample': 0.28453258983470386, 'colsample_bytree': 0.529919939765005, 'min_data_in_leaf': 99}. Best is trial 13 with value: 2543674.1459705946.




[I 2023-10-31 23:49:46,034] Trial 27 finished with value: 2526666.188072689 and parameters: {'n_estimators': 782, 'boosting': 'gbdt', 'lambda_l2': 5.595392677159854, 'bagging_fraction': 0.5505931871388583, 'num_leaves': 63, 'feature_fraction': 0.8505571073668988, 'max_depth': 13, 'learning_rate': 0.011808676657772877, 'subsample': 0.12026016550567586, 'colsample_bytree': 0.6356072325123403, 'min_data_in_leaf': 50}. Best is trial 27 with value: 2526666.188072689.




[I 2023-10-31 23:49:50,197] Trial 28 finished with value: 2519846.4308479535 and parameters: {'n_estimators': 790, 'boosting': 'gbdt', 'lambda_l2': 5.578385317628879, 'bagging_fraction': 0.5302233659352622, 'num_leaves': 40, 'feature_fraction': 0.8531415188033153, 'max_depth': 35, 'learning_rate': 0.009004057463787135, 'subsample': 0.4919418238102024, 'colsample_bytree': 0.658324129092106, 'min_data_in_leaf': 47}. Best is trial 28 with value: 2519846.4308479535.




[I 2023-10-31 23:50:02,073] Trial 29 finished with value: 2515807.602266377 and parameters: {'n_estimators': 800, 'boosting': 'gbdt', 'lambda_l2': 5.996982775064544, 'bagging_fraction': 0.6450399761076904, 'num_leaves': 284, 'feature_fraction': 0.865036163633459, 'max_depth': 35, 'learning_rate': 0.010202931921913601, 'subsample': 0.4707445739952065, 'colsample_bytree': 0.6624560778482443, 'min_data_in_leaf': 45}. Best is trial 29 with value: 2515807.602266377.




[I 2023-10-31 23:50:18,959] Trial 30 finished with value: 3127859.098819492 and parameters: {'n_estimators': 795, 'boosting': 'dart', 'lambda_l2': 6.070371987949416, 'bagging_fraction': 0.627704496043268, 'num_leaves': 303, 'feature_fraction': 0.880008330215021, 'max_depth': 37, 'learning_rate': 0.009129331541421409, 'subsample': 0.5860702318204281, 'colsample_bytree': 0.7182372509933835, 'min_data_in_leaf': 48}. Best is trial 29 with value: 2515807.602266377.




[I 2023-10-31 23:50:22,157] Trial 31 finished with value: 2531066.864046717 and parameters: {'n_estimators': 777, 'boosting': 'gbdt', 'lambda_l2': 5.539326426336983, 'bagging_fraction': 0.5145247426613451, 'num_leaves': 28, 'feature_fraction': 0.8843084651588956, 'max_depth': 43, 'learning_rate': 0.011264824488949906, 'subsample': 0.5069514215845916, 'colsample_bytree': 0.6472416037996005, 'min_data_in_leaf': 31}. Best is trial 29 with value: 2515807.602266377.




[I 2023-10-31 23:50:27,534] Trial 32 finished with value: 2515081.7474339763 and parameters: {'n_estimators': 777, 'boosting': 'gbdt', 'lambda_l2': 5.580160779406434, 'bagging_fraction': 0.5316225412489028, 'num_leaves': 63, 'feature_fraction': 0.8834161837506106, 'max_depth': 44, 'learning_rate': 0.011089672956973792, 'subsample': 0.49566146601559297, 'colsample_bytree': 0.675113736504296, 'min_data_in_leaf': 44}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:50:40,843] Trial 33 finished with value: 2524048.4947136063 and parameters: {'n_estimators': 827, 'boosting': 'gbdt', 'lambda_l2': 6.5967942449494235, 'bagging_fraction': 0.6635588405328683, 'num_leaves': 342, 'feature_fraction': 0.9309273797510103, 'max_depth': 44, 'learning_rate': 0.007089313652312696, 'subsample': 0.5712918944837906, 'colsample_bytree': 0.815837004678386, 'min_data_in_leaf': 49}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:51:01,066] Trial 34 finished with value: 2515491.071883315 and parameters: {'n_estimators': 826, 'boosting': 'gbdt', 'lambda_l2': 6.699988333688191, 'bagging_fraction': 0.7276119914030368, 'num_leaves': 404, 'feature_fraction': 0.950291759271441, 'max_depth': 45, 'learning_rate': 0.006580515220019528, 'subsample': 0.5620852772020885, 'colsample_bytree': 0.7852122111593142, 'min_data_in_leaf': 45}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:51:17,349] Trial 35 finished with value: 2522823.25953767 and parameters: {'n_estimators': 818, 'boosting': 'gbdt', 'lambda_l2': 6.696570622933998, 'bagging_fraction': 0.7598979555802613, 'num_leaves': 426, 'feature_fraction': 0.9983648045026464, 'max_depth': 43, 'learning_rate': 0.006575530962368297, 'subsample': 0.4896105631727351, 'colsample_bytree': 0.8056062333097402, 'min_data_in_leaf': 45}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:51:34,666] Trial 36 finished with value: 2803519.858019527 and parameters: {'n_estimators': 869, 'boosting': 'rf', 'lambda_l2': 7.269333134981618, 'bagging_fraction': 0.7507881658702188, 'num_leaves': 389, 'feature_fraction': 0.8309877471983551, 'max_depth': 50, 'learning_rate': 0.00557203965673906, 'subsample': 0.6305100163686141, 'colsample_bytree': 0.7088310543640504, 'min_data_in_leaf': 43}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:51:50,480] Trial 37 finished with value: 2554009.187686665 and parameters: {'n_estimators': 748, 'boosting': 'gbdt', 'lambda_l2': 5.389224116017548, 'bagging_fraction': 0.7079283749194386, 'num_leaves': 262, 'feature_fraction': 0.898087411864533, 'max_depth': 35, 'learning_rate': 0.005226844026987056, 'subsample': 0.4447575624328822, 'colsample_bytree': 0.8709507230735815, 'min_data_in_leaf': 53}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:52:04,060] Trial 38 finished with value: 2777595.179276292 and parameters: {'n_estimators': 808, 'boosting': 'rf', 'lambda_l2': 4.857048509470389, 'bagging_fraction': 0.8070767697362164, 'num_leaves': 499, 'feature_fraction': 0.8646897335370932, 'max_depth': 46, 'learning_rate': 0.008458708760337136, 'subsample': 0.6553202218938636, 'colsample_bytree': 0.7536231781890387, 'min_data_in_leaf': 60}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:52:15,952] Trial 39 finished with value: 2541410.6024796832 and parameters: {'n_estimators': 847, 'boosting': 'gbdt', 'lambda_l2': 6.4020595133024765, 'bagging_fraction': 0.6167654689827647, 'num_leaves': 195, 'feature_fraction': 0.8207732157349579, 'max_depth': 40, 'learning_rate': 0.01117575025982837, 'subsample': 0.5662964303341868, 'colsample_bytree': 0.8532227077147667, 'min_data_in_leaf': 53}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:52:26,291] Trial 40 finished with value: 2552171.0963601107 and parameters: {'n_estimators': 729, 'boosting': 'gbdt', 'lambda_l2': 7.18817154513973, 'bagging_fraction': 0.6673880768191095, 'num_leaves': 504, 'feature_fraction': 0.9130041836001075, 'max_depth': 47, 'learning_rate': 0.00924056963887123, 'subsample': 0.5195708198816987, 'colsample_bytree': 0.89955567383473, 'min_data_in_leaf': 75}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:52:43,252] Trial 41 finished with value: 2524226.7434377056 and parameters: {'n_estimators': 818, 'boosting': 'gbdt', 'lambda_l2': 6.683305620020365, 'bagging_fraction': 0.8212696258781176, 'num_leaves': 408, 'feature_fraction': 0.983350995216822, 'max_depth': 41, 'learning_rate': 0.005986836676527584, 'subsample': 0.46707776001928214, 'colsample_bytree': 0.7754404828797333, 'min_data_in_leaf': 46}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:52:55,919] Trial 42 finished with value: 2562679.4217664907 and parameters: {'n_estimators': 760, 'boosting': 'gbdt', 'lambda_l2': 5.822147763542464, 'bagging_fraction': 0.6155854804109671, 'num_leaves': 439, 'feature_fraction': 0.9960099363375609, 'max_depth': 33, 'learning_rate': 0.004689600679612642, 'subsample': 0.49143050050678444, 'colsample_bytree': 0.703699978973796, 'min_data_in_leaf': 42}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:53:11,614] Trial 43 finished with value: 2530660.159102656 and parameters: {'n_estimators': 891, 'boosting': 'gbdt', 'lambda_l2': 5.054138612257448, 'bagging_fraction': 0.7590511648471931, 'num_leaves': 284, 'feature_fraction': 0.9601439395020058, 'max_depth': 42, 'learning_rate': 0.006908704650028227, 'subsample': 0.40285699029297795, 'colsample_bytree': 0.8188882858763519, 'min_data_in_leaf': 58}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:53:20,247] Trial 44 finished with value: 2805612.6597282705 and parameters: {'n_estimators': 836, 'boosting': 'rf', 'lambda_l2': 7.652109160395393, 'bagging_fraction': 0.5303940669725868, 'num_leaves': 363, 'feature_fraction': 0.9263015501077533, 'max_depth': 47, 'learning_rate': 0.0043384817549882585, 'subsample': 0.5426464306437522, 'colsample_bytree': 0.9915021313937105, 'min_data_in_leaf': 52}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:53:35,397] Trial 45 finished with value: 2530877.3669480677 and parameters: {'n_estimators': 811, 'boosting': 'gbdt', 'lambda_l2': 6.231308271492301, 'bagging_fraction': 0.6861844607747319, 'num_leaves': 576, 'feature_fraction': 0.9617893672594334, 'max_depth': 35, 'learning_rate': 0.007451088840458703, 'subsample': 0.48772192830511046, 'colsample_bytree': 0.9288963294617173, 'min_data_in_leaf': 38}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:53:47,042] Trial 46 finished with value: 2661254.411302382 and parameters: {'n_estimators': 869, 'boosting': 'gbdt', 'lambda_l2': 5.4173038574118815, 'bagging_fraction': 0.49575455244554634, 'num_leaves': 220, 'feature_fraction': 0.9057316918462048, 'max_depth': 40, 'learning_rate': 0.0028337163767740694, 'subsample': 0.42850180076637767, 'colsample_bytree': 0.6661812595980686, 'min_data_in_leaf': 34}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:53:59,049] Trial 47 finished with value: 2519907.242566575 and parameters: {'n_estimators': 772, 'boosting': 'gbdt', 'lambda_l2': 6.767121130851216, 'bagging_fraction': 0.5814535678607654, 'num_leaves': 453, 'feature_fraction': 0.9375877358593894, 'max_depth': 45, 'learning_rate': 0.01293251578625176, 'subsample': 0.3631421483445886, 'colsample_bytree': 0.7334319494394973, 'min_data_in_leaf': 45}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:54:16,787] Trial 48 finished with value: 2563895.2229857943 and parameters: {'n_estimators': 785, 'boosting': 'gbdt', 'lambda_l2': 4.546752854781143, 'bagging_fraction': 0.5794850195521745, 'num_leaves': 625, 'feature_fraction': 0.8853694514323546, 'max_depth': 45, 'learning_rate': 0.013446294642321199, 'subsample': 0.35875146979540595, 'colsample_bytree': 0.7171966273556752, 'min_data_in_leaf': 26}. Best is trial 32 with value: 2515081.7474339763.




[I 2023-10-31 23:54:32,367] Trial 49 finished with value: 3077874.192933537 and parameters: {'n_estimators': 761, 'boosting': 'dart', 'lambda_l2': 5.140712736705392, 'bagging_fraction': 0.470501202096104, 'num_leaves': 491, 'feature_fraction': 0.935445572458924, 'max_depth': 25, 'learning_rate': 0.00992953211803417, 'subsample': 0.4206566649992301, 'colsample_bytree': 0.6185137577726177, 'min_data_in_leaf': 37}. Best is trial 32 with value: 2515081.7474339763.


In [7]:
print('Best hyperparameters:', study.best_params)
print('Best RMSE:', study.best_value)

Best hyperparameters: {'n_estimators': 777, 'boosting': 'gbdt', 'lambda_l2': 5.580160779406434, 'bagging_fraction': 0.5316225412489028, 'num_leaves': 63, 'feature_fraction': 0.8834161837506106, 'max_depth': 44, 'learning_rate': 0.011089672956973792, 'subsample': 0.49566146601559297, 'colsample_bytree': 0.675113736504296, 'min_data_in_leaf': 44}
Best RMSE: 2515081.7474339763


In [8]:
X = df_train.drop(['price_doc'], axis=1)
y = df_train['price_doc']

In [9]:
params = study.best_params
params["objective"] = "regression"
params["metric"] = "rmse"
params["verbosity"] = -1

In [10]:
model = lgb.LGBMRegressor(**params)

model.fit(X, y)



In [11]:
df_test_copy = df_test.copy()

df_test = df_test.drop(['id'], axis=1)

In [12]:
pred = model.predict(df_test)
prediction_df = pd.DataFrame({
    'id': df_test_copy['id'],
    'price_doc': pred
})
prediction_df

Unnamed: 0,id,price_doc
0,30474,5.446781e+06
1,30475,8.053563e+06
2,30476,5.218428e+06
3,30477,5.697789e+06
4,30478,4.972711e+06
...,...,...
7657,38131,8.018036e+06
7658,38132,4.971977e+06
7659,38133,4.879705e+06
7660,38134,5.381328e+06


In [13]:
prediction_df.to_csv('./output/lgbm_pred_seed.csv', index=False)