In [89]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from joblib import dump, load
def validate(model, X_train, X_valid, y_train, y_valid):
    '''
    return validation scores of training data & validation data
    '''
    preds_valid = model.predict(X_valid)
    preds_train = model.predict(X_train)
    mae_valid = mean_absolute_error(np.exp(preds_valid), y_valid)
    mae_train = mean_absolute_error(np.exp(preds_train), y_train)
    print("Validation result:")
    print("train set mae on training set is {}".format(mae_train))
    print("valid set mae on validation set is {}".format(mae_valid))

In [90]:
data = pd.read_csv('./data/used_car_train_20200313_revised.csv', index_col='SaleID')
X = data.copy()
X.head()

Unnamed: 0_level_0,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,...,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,736,20040402,30.0,6,1.0,0.0,0.0,60,12.5,0.0,...,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762
1,2262,20030301,40.0,1,2.0,0.0,0.0,0,15.0,-,...,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522
2,14874,20040403,115.0,15,1.0,0.0,0.0,163,12.5,0.0,...,0.25141,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.56533,-0.832687,-0.229963
3,71865,19960908,109.0,10,0.0,0.0,1.0,193,15.0,0.0,...,0.274293,0.1103,0.121964,0.033395,0.0,-4.509599,1.28594,-0.501868,-2.438353,-0.478699
4,111080,20120103,110.0,5,1.0,0.0,0.0,68,5.0,0.0,...,0.228036,0.073205,0.09188,0.078819,0.121534,-1.89624,0.910783,0.93111,2.834518,1.923482


In [91]:
# date fetures process
date_cols = ['regDate', 'creatDate']
for col in date_cols:
    X[col] = pd.to_datetime(X[col], format='%Y%m%d',errors='coerce')
    X[col+'_month'] = X[col].dt.month
    X[col+'_day'] = X[col].dt.day
    X[col+'_year'] = X[col].dt.year

X.head()

Unnamed: 0_level_0,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,...,v_11,v_12,v_13,v_14,regDate_month,regDate_day,regDate_year,creatDate_month,creatDate_day,creatDate_year
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,736,2004-04-02,30.0,6,1.0,0.0,0.0,60,12.5,0.0,...,2.804097,-2.420821,0.795292,0.914762,4.0,2.0,2004.0,4,4,2016
1,2262,2003-03-01,40.0,1,2.0,0.0,0.0,0,15.0,-,...,2.096338,-1.030483,-1.722674,0.245522,3.0,1.0,2003.0,3,9,2016
2,14874,2004-04-03,115.0,15,1.0,0.0,0.0,163,12.5,0.0,...,1.803559,1.56533,-0.832687,-0.229963,4.0,3.0,2004.0,4,2,2016
3,71865,1996-09-08,109.0,10,0.0,0.0,1.0,193,15.0,0.0,...,1.28594,-0.501868,-2.438353,-0.478699,9.0,8.0,1996.0,3,12,2016
4,111080,2012-01-03,110.0,5,1.0,0.0,0.0,68,5.0,0.0,...,0.910783,0.93111,2.834518,1.923482,1.0,3.0,2012.0,3,13,2016


In [93]:
# categorical features process
bad_cols = ['seller', 'offerType']
cat_cols = ['model','brand','bodyType','fuelType','gearbox','notRepairedDamage']

X['notRepairedDamage'] = X['notRepairedDamage'].replace('-','NaN').astype('float64')
X.drop(bad_cols,inplace=True, axis=1)
X.drop(date_cols, inplace=True, axis=1)

y = X['price']
X.drop('price', inplace=True, axis=1)
X.head()

Unnamed: 0_level_0,name,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,...,v_11,v_12,v_13,v_14,regDate_month,regDate_day,regDate_year,creatDate_month,creatDate_day,creatDate_year
SaleID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,736,30.0,6,1.0,0.0,0.0,60,12.5,0.0,1046,...,2.804097,-2.420821,0.795292,0.914762,4.0,2.0,2004.0,4,4,2016
1,2262,40.0,1,2.0,0.0,0.0,0,15.0,,4366,...,2.096338,-1.030483,-1.722674,0.245522,3.0,1.0,2003.0,3,9,2016
2,14874,115.0,15,1.0,0.0,0.0,163,12.5,0.0,2806,...,1.803559,1.56533,-0.832687,-0.229963,4.0,3.0,2004.0,4,2,2016
3,71865,109.0,10,0.0,0.0,1.0,193,15.0,0.0,434,...,1.28594,-0.501868,-2.438353,-0.478699,9.0,8.0,1996.0,3,12,2016
4,111080,110.0,5,1.0,0.0,0.0,68,5.0,0.0,6977,...,0.910783,0.93111,2.834518,1.923482,1.0,3.0,2012.0,3,13,2016


In [94]:
%%time
from lightgbm import LGBMRegressor
#Do NOT modify code blocks above
#train baseline

# - Baseline Performance Summary:

# Early stopping, best iteration is:
# [5146]	valid_0's l1: 522.858
# CPU times: user 4min 14s, sys: 5.65 s, total: 4min 20s
# Wall time: 1min 24s

(X_train, X_valid, y_train, y_valid) = train_test_split(X, y, test_size=0.1)

CPU times: user 48.4 ms, sys: 4.52 ms, total: 53 ms
Wall time: 52.9 ms


In [8]:
lgb_model = LGBMRegressor(n_estimators=10000, 
                          n_jobs=4, 
                          random_state=42,
                          objective='mae')

lgb_model.fit(X_train, y_train,
              eval_set=[(X_valid, y_valid)], 
              eval_metric='mae', 
              early_stopping_rounds=42)

[1]	valid_0's l1: 4183.89
Training until validation scores don't improve for 42 rounds
[2]	valid_0's l1: 3878.8
[3]	valid_0's l1: 3613.09
[4]	valid_0's l1: 3356.12
[5]	valid_0's l1: 3137.46
[6]	valid_0's l1: 2930.05
[7]	valid_0's l1: 2740.26
[8]	valid_0's l1: 2569.39
[9]	valid_0's l1: 2412.03
[10]	valid_0's l1: 2270.85
[11]	valid_0's l1: 2140.7
[12]	valid_0's l1: 2025.01
[13]	valid_0's l1: 1913.41
[14]	valid_0's l1: 1815.27
[15]	valid_0's l1: 1728.63
[16]	valid_0's l1: 1647.63
[17]	valid_0's l1: 1574.33
[18]	valid_0's l1: 1505.9
[19]	valid_0's l1: 1443.94
[20]	valid_0's l1: 1395.66
[21]	valid_0's l1: 1344.03
[22]	valid_0's l1: 1307.21
[23]	valid_0's l1: 1262.19
[24]	valid_0's l1: 1227.87
[25]	valid_0's l1: 1191.88
[26]	valid_0's l1: 1166.21
[27]	valid_0's l1: 1140.69
[28]	valid_0's l1: 1114.33
[29]	valid_0's l1: 1091.48
[30]	valid_0's l1: 1071.03
[31]	valid_0's l1: 1051.67
[32]	valid_0's l1: 1034.83
[33]	valid_0's l1: 1015.96
[34]	valid_0's l1: 1002.3
[35]	valid_0's l1: 989.012
[36]	va

[300]	valid_0's l1: 616.784
[301]	valid_0's l1: 616.107
[302]	valid_0's l1: 615.812
[303]	valid_0's l1: 615.432
[304]	valid_0's l1: 614.84
[305]	valid_0's l1: 614.723
[306]	valid_0's l1: 614.438
[307]	valid_0's l1: 614.187
[308]	valid_0's l1: 614.086
[309]	valid_0's l1: 614.023
[310]	valid_0's l1: 614.002
[311]	valid_0's l1: 613.49
[312]	valid_0's l1: 613.344
[313]	valid_0's l1: 612.88
[314]	valid_0's l1: 612.827
[315]	valid_0's l1: 612.696
[316]	valid_0's l1: 612.418
[317]	valid_0's l1: 612.061
[318]	valid_0's l1: 611.664
[319]	valid_0's l1: 611.339
[320]	valid_0's l1: 611.232
[321]	valid_0's l1: 611.046
[322]	valid_0's l1: 610.959
[323]	valid_0's l1: 610.846
[324]	valid_0's l1: 610.568
[325]	valid_0's l1: 610.371
[326]	valid_0's l1: 609.955
[327]	valid_0's l1: 609.666
[328]	valid_0's l1: 609.281
[329]	valid_0's l1: 609.187
[330]	valid_0's l1: 608.931
[331]	valid_0's l1: 608.643
[332]	valid_0's l1: 608.412
[333]	valid_0's l1: 608.275
[334]	valid_0's l1: 608.112
[335]	valid_0's l1: 607

[594]	valid_0's l1: 574.031
[595]	valid_0's l1: 573.988
[596]	valid_0's l1: 573.953
[597]	valid_0's l1: 573.93
[598]	valid_0's l1: 573.746
[599]	valid_0's l1: 573.693
[600]	valid_0's l1: 573.558
[601]	valid_0's l1: 573.476
[602]	valid_0's l1: 573.425
[603]	valid_0's l1: 573.289
[604]	valid_0's l1: 573.272
[605]	valid_0's l1: 573.263
[606]	valid_0's l1: 573.232
[607]	valid_0's l1: 573.185
[608]	valid_0's l1: 573.157
[609]	valid_0's l1: 573.12
[610]	valid_0's l1: 573.115
[611]	valid_0's l1: 573.099
[612]	valid_0's l1: 572.837
[613]	valid_0's l1: 572.787
[614]	valid_0's l1: 572.752
[615]	valid_0's l1: 572.73
[616]	valid_0's l1: 572.683
[617]	valid_0's l1: 572.417
[618]	valid_0's l1: 572.265
[619]	valid_0's l1: 572.236
[620]	valid_0's l1: 572.214
[621]	valid_0's l1: 572.123
[622]	valid_0's l1: 571.941
[623]	valid_0's l1: 571.824
[624]	valid_0's l1: 571.759
[625]	valid_0's l1: 571.743
[626]	valid_0's l1: 571.652
[627]	valid_0's l1: 571.528
[628]	valid_0's l1: 571.235
[629]	valid_0's l1: 571

[900]	valid_0's l1: 559.323
[901]	valid_0's l1: 559.251
[902]	valid_0's l1: 559.182
[903]	valid_0's l1: 559.196
[904]	valid_0's l1: 559.079
[905]	valid_0's l1: 559.056
[906]	valid_0's l1: 558.964
[907]	valid_0's l1: 558.927
[908]	valid_0's l1: 558.85
[909]	valid_0's l1: 558.86
[910]	valid_0's l1: 558.872
[911]	valid_0's l1: 558.856
[912]	valid_0's l1: 558.858
[913]	valid_0's l1: 558.849
[914]	valid_0's l1: 558.843
[915]	valid_0's l1: 558.768
[916]	valid_0's l1: 558.739
[917]	valid_0's l1: 558.711
[918]	valid_0's l1: 558.694
[919]	valid_0's l1: 558.691
[920]	valid_0's l1: 558.643
[921]	valid_0's l1: 558.646
[922]	valid_0's l1: 558.639
[923]	valid_0's l1: 558.638
[924]	valid_0's l1: 558.538
[925]	valid_0's l1: 558.507
[926]	valid_0's l1: 558.436
[927]	valid_0's l1: 558.263
[928]	valid_0's l1: 558.22
[929]	valid_0's l1: 558.213
[930]	valid_0's l1: 558.212
[931]	valid_0's l1: 558.199
[932]	valid_0's l1: 558.156
[933]	valid_0's l1: 558.145
[934]	valid_0's l1: 557.916
[935]	valid_0's l1: 557

[1203]	valid_0's l1: 549.809
[1204]	valid_0's l1: 549.8
[1205]	valid_0's l1: 549.778
[1206]	valid_0's l1: 549.648
[1207]	valid_0's l1: 549.645
[1208]	valid_0's l1: 549.618
[1209]	valid_0's l1: 549.606
[1210]	valid_0's l1: 549.554
[1211]	valid_0's l1: 549.543
[1212]	valid_0's l1: 549.539
[1213]	valid_0's l1: 549.529
[1214]	valid_0's l1: 549.516
[1215]	valid_0's l1: 549.423
[1216]	valid_0's l1: 549.404
[1217]	valid_0's l1: 549.417
[1218]	valid_0's l1: 549.328
[1219]	valid_0's l1: 549.328
[1220]	valid_0's l1: 549.306
[1221]	valid_0's l1: 549.287
[1222]	valid_0's l1: 549.249
[1223]	valid_0's l1: 549.238
[1224]	valid_0's l1: 549.231
[1225]	valid_0's l1: 549.243
[1226]	valid_0's l1: 549.188
[1227]	valid_0's l1: 549.189
[1228]	valid_0's l1: 549.194
[1229]	valid_0's l1: 549.189
[1230]	valid_0's l1: 549.178
[1231]	valid_0's l1: 549.176
[1232]	valid_0's l1: 549.05
[1233]	valid_0's l1: 549.041
[1234]	valid_0's l1: 549.045
[1235]	valid_0's l1: 549.017
[1236]	valid_0's l1: 548.986
[1237]	valid_0's 

[1502]	valid_0's l1: 544.357
[1503]	valid_0's l1: 544.263
[1504]	valid_0's l1: 544.233
[1505]	valid_0's l1: 544.162
[1506]	valid_0's l1: 544.154
[1507]	valid_0's l1: 544.08
[1508]	valid_0's l1: 544.075
[1509]	valid_0's l1: 544.073
[1510]	valid_0's l1: 544.034
[1511]	valid_0's l1: 543.889
[1512]	valid_0's l1: 543.883
[1513]	valid_0's l1: 543.845
[1514]	valid_0's l1: 543.84
[1515]	valid_0's l1: 543.798
[1516]	valid_0's l1: 543.774
[1517]	valid_0's l1: 543.765
[1518]	valid_0's l1: 543.744
[1519]	valid_0's l1: 543.73
[1520]	valid_0's l1: 543.74
[1521]	valid_0's l1: 543.747
[1522]	valid_0's l1: 543.748
[1523]	valid_0's l1: 543.749
[1524]	valid_0's l1: 543.729
[1525]	valid_0's l1: 543.64
[1526]	valid_0's l1: 543.625
[1527]	valid_0's l1: 543.611
[1528]	valid_0's l1: 543.601
[1529]	valid_0's l1: 543.601
[1530]	valid_0's l1: 543.597
[1531]	valid_0's l1: 543.597
[1532]	valid_0's l1: 543.643
[1533]	valid_0's l1: 543.63
[1534]	valid_0's l1: 543.619
[1535]	valid_0's l1: 543.589
[1536]	valid_0's l1:

[1787]	valid_0's l1: 539.527
[1788]	valid_0's l1: 539.476
[1789]	valid_0's l1: 539.475
[1790]	valid_0's l1: 539.475
[1791]	valid_0's l1: 539.472
[1792]	valid_0's l1: 539.458
[1793]	valid_0's l1: 539.458
[1794]	valid_0's l1: 539.477
[1795]	valid_0's l1: 539.477
[1796]	valid_0's l1: 539.497
[1797]	valid_0's l1: 539.439
[1798]	valid_0's l1: 539.373
[1799]	valid_0's l1: 539.355
[1800]	valid_0's l1: 539.325
[1801]	valid_0's l1: 539.312
[1802]	valid_0's l1: 539.35
[1803]	valid_0's l1: 539.272
[1804]	valid_0's l1: 539.247
[1805]	valid_0's l1: 539.245
[1806]	valid_0's l1: 539.192
[1807]	valid_0's l1: 539.167
[1808]	valid_0's l1: 539.175
[1809]	valid_0's l1: 539.162
[1810]	valid_0's l1: 539.133
[1811]	valid_0's l1: 539.079
[1812]	valid_0's l1: 539.056
[1813]	valid_0's l1: 539.022
[1814]	valid_0's l1: 539.011
[1815]	valid_0's l1: 538.965
[1816]	valid_0's l1: 538.974
[1817]	valid_0's l1: 538.971
[1818]	valid_0's l1: 538.969
[1819]	valid_0's l1: 538.935
[1820]	valid_0's l1: 538.941
[1821]	valid_0'

[2075]	valid_0's l1: 537.216
[2076]	valid_0's l1: 537.209
[2077]	valid_0's l1: 537.206
[2078]	valid_0's l1: 537.186
[2079]	valid_0's l1: 537.133
[2080]	valid_0's l1: 537.132
[2081]	valid_0's l1: 537.128
[2082]	valid_0's l1: 537.128
[2083]	valid_0's l1: 537.1
[2084]	valid_0's l1: 537.075
[2085]	valid_0's l1: 537.069
[2086]	valid_0's l1: 537.082
[2087]	valid_0's l1: 537.073
[2088]	valid_0's l1: 537.07
[2089]	valid_0's l1: 537.007
[2090]	valid_0's l1: 537.007
[2091]	valid_0's l1: 537.006
[2092]	valid_0's l1: 536.984
[2093]	valid_0's l1: 536.989
[2094]	valid_0's l1: 536.998
[2095]	valid_0's l1: 536.97
[2096]	valid_0's l1: 536.961
[2097]	valid_0's l1: 536.958
[2098]	valid_0's l1: 536.949
[2099]	valid_0's l1: 536.953
[2100]	valid_0's l1: 536.929
[2101]	valid_0's l1: 536.93
[2102]	valid_0's l1: 536.932
[2103]	valid_0's l1: 536.932
[2104]	valid_0's l1: 536.932
[2105]	valid_0's l1: 536.852
[2106]	valid_0's l1: 536.834
[2107]	valid_0's l1: 536.815
[2108]	valid_0's l1: 536.803
[2109]	valid_0's l1

[2373]	valid_0's l1: 534.418
[2374]	valid_0's l1: 534.4
[2375]	valid_0's l1: 534.396
[2376]	valid_0's l1: 534.381
[2377]	valid_0's l1: 534.369
[2378]	valid_0's l1: 534.369
[2379]	valid_0's l1: 534.4
[2380]	valid_0's l1: 534.388
[2381]	valid_0's l1: 534.384
[2382]	valid_0's l1: 534.386
[2383]	valid_0's l1: 534.381
[2384]	valid_0's l1: 534.366
[2385]	valid_0's l1: 534.363
[2386]	valid_0's l1: 534.361
[2387]	valid_0's l1: 534.369
[2388]	valid_0's l1: 534.371
[2389]	valid_0's l1: 534.371
[2390]	valid_0's l1: 534.355
[2391]	valid_0's l1: 534.313
[2392]	valid_0's l1: 534.305
[2393]	valid_0's l1: 534.304
[2394]	valid_0's l1: 534.284
[2395]	valid_0's l1: 534.286
[2396]	valid_0's l1: 534.281
[2397]	valid_0's l1: 534.281
[2398]	valid_0's l1: 534.282
[2399]	valid_0's l1: 534.268
[2400]	valid_0's l1: 534.263
[2401]	valid_0's l1: 534.248
[2402]	valid_0's l1: 534.254
[2403]	valid_0's l1: 534.242
[2404]	valid_0's l1: 534.24
[2405]	valid_0's l1: 534.24
[2406]	valid_0's l1: 534.266
[2407]	valid_0's l1:

[2672]	valid_0's l1: 532.188
[2673]	valid_0's l1: 532.193
[2674]	valid_0's l1: 532.199
[2675]	valid_0's l1: 532.198
[2676]	valid_0's l1: 532.198
[2677]	valid_0's l1: 532.197
[2678]	valid_0's l1: 532.196
[2679]	valid_0's l1: 532.193
[2680]	valid_0's l1: 532.167
[2681]	valid_0's l1: 532.142
[2682]	valid_0's l1: 532.129
[2683]	valid_0's l1: 532.119
[2684]	valid_0's l1: 532.131
[2685]	valid_0's l1: 532.132
[2686]	valid_0's l1: 532.123
[2687]	valid_0's l1: 532.116
[2688]	valid_0's l1: 532.111
[2689]	valid_0's l1: 532.022
[2690]	valid_0's l1: 532.03
[2691]	valid_0's l1: 532.019
[2692]	valid_0's l1: 532.015
[2693]	valid_0's l1: 532.01
[2694]	valid_0's l1: 531.961
[2695]	valid_0's l1: 531.963
[2696]	valid_0's l1: 531.963
[2697]	valid_0's l1: 531.963
[2698]	valid_0's l1: 531.94
[2699]	valid_0's l1: 531.918
[2700]	valid_0's l1: 531.918
[2701]	valid_0's l1: 531.911
[2702]	valid_0's l1: 531.912
[2703]	valid_0's l1: 531.908
[2704]	valid_0's l1: 531.903
[2705]	valid_0's l1: 531.895
[2706]	valid_0's 

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=10000, n_jobs=4, num_leaves=31, objective='mae',
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [62]:
# from category_encoders import CountEncoder, TargetEncoder, CatBoostEncoder
# te = TargetEncoder(cat_cols)
# te.fit(X_train[cat_cols], np.log1p(y_train))
# X_train = X_train.join(te.transform(X_train[cat_cols]).add_suffix('_target'))
# X_valid = X_valid.join(te.transform(X_valid[cat_cols]).add_suffix('_target'))
# X_train.shape

(135000, 37)

In [95]:
cb = CatBoostEncoder(cols=cat_cols)
cb.fit(X_train[cat_cols], np.log1p(y_train))
X_train = X_train.join(cb.transform(X_train[cat_cols]).add_suffix('_cb'))
X_valid = X_valid.join(cb.transform(X_valid[cat_cols]).add_suffix('_cb'))
X_train.shape

(135000, 37)

In [74]:
%%time
# 1. Dealing with missing values ALL by imputation:
# - Early stopping, best iteration is:
# - [4876]	valid_0's l1: 525.418

# AND 

# 2. Transform y	logy:
# - (no cat encoder) Validation result:
# - train set mae on training set is 371.64473550917234
# - valid set mae on validation set is 514.9241025521088

# - (target encoder) Early stopping, best iteration is:
# - [3555]	valid_0's l1: 0.118454
# - Validation result:
# - train set mae on training set is 389.8088560680232
# - valid set mae on validation set is 519.382108074699

# - (target AND catboost encoder) Early stopping, best iteration is:
# - [3569]	valid_0's l1: 0.117099
# - Validation result:
# - train set mae on training set is 382.02831982713275
# - valid set mae on validation set is 514.0273956213576

# - (catboost encoder)Early stopping, best iteration is:
# - [3480]	valid_0's l1: 0.117716
# - Validation result:
# - train set mae on training set is 393.25601736194045
# - valid set mae on validation set is 502.1343201900951

# AND 

# 3.Remove features with least importance (eli5 perm importance, next block)

# - (last 4) Early stopping, best iteration is:
# - [3907]	valid_0's l1: 0.117952
# - Validation result:
# - train set mae on training set is 384.368669315069
# - valid set mae on validation set is 513.7451488319449

# - (last 7) Early stopping, best iteration is:
# - [3611]	valid_0's l1: 0.118393
# - Validation result:
# - train set mae on training set is 390.1341714368112
# - valid set mae on validation set is 505.62113055930496

# - (last 9) Early stopping, best iteration is:
# - [3285]	valid_0's l1: 0.116864
# - Validation result:
# - train set mae on training set is 396.629580703306
# - valid set mae on validation set is 499.4044183471947

from sklearn.impute import SimpleImputer

imputer = SimpleImputer()
cols = X_train.columns
X_train = pd.DataFrame(imputer.fit_transform(X_train))
X_train.columns = cols
X_valid = pd.DataFrame(imputer.transform(X_valid))
X_valid.columns = cols

imputed_lgb_model = LGBMRegressor(n_estimators=10000, 
                          n_jobs=4, 
                          random_state=42,
                          objective='mae')

imputed_lgb_model.fit(X_train, np.log1p(y_train),
                      eval_set=[(X_valid, np.log1p(y_valid))], 
                      eval_metric='mae', 
                      early_stopping_rounds=42)

validate(imputed_lgb_model, X_train, X_valid, y_train, y_valid)

[1]	valid_0's l1: 0.90799
Training until validation scores don't improve for 42 rounds
[2]	valid_0's l1: 0.832562
[3]	valid_0's l1: 0.76514
[4]	valid_0's l1: 0.704765
[5]	valid_0's l1: 0.650436
[6]	valid_0's l1: 0.601615
[7]	valid_0's l1: 0.558715
[8]	valid_0's l1: 0.520473
[9]	valid_0's l1: 0.486164
[10]	valid_0's l1: 0.456297
[11]	valid_0's l1: 0.429394
[12]	valid_0's l1: 0.405132
[13]	valid_0's l1: 0.383403
[14]	valid_0's l1: 0.362917
[15]	valid_0's l1: 0.344231
[16]	valid_0's l1: 0.328219
[17]	valid_0's l1: 0.313602
[18]	valid_0's l1: 0.300288
[19]	valid_0's l1: 0.287984
[20]	valid_0's l1: 0.277239
[21]	valid_0's l1: 0.267307
[22]	valid_0's l1: 0.257616
[23]	valid_0's l1: 0.249634
[24]	valid_0's l1: 0.243303
[25]	valid_0's l1: 0.23588
[26]	valid_0's l1: 0.229826
[27]	valid_0's l1: 0.223848
[28]	valid_0's l1: 0.218692
[29]	valid_0's l1: 0.213919
[30]	valid_0's l1: 0.209619
[31]	valid_0's l1: 0.205515
[32]	valid_0's l1: 0.201575
[33]	valid_0's l1: 0.198348
[34]	valid_0's l1: 0.195059

[288]	valid_0's l1: 0.131376
[289]	valid_0's l1: 0.131345
[290]	valid_0's l1: 0.131314
[291]	valid_0's l1: 0.131303
[292]	valid_0's l1: 0.131241
[293]	valid_0's l1: 0.131175
[294]	valid_0's l1: 0.131119
[295]	valid_0's l1: 0.131091
[296]	valid_0's l1: 0.131045
[297]	valid_0's l1: 0.130995
[298]	valid_0's l1: 0.130927
[299]	valid_0's l1: 0.13089
[300]	valid_0's l1: 0.130854
[301]	valid_0's l1: 0.130826
[302]	valid_0's l1: 0.130755
[303]	valid_0's l1: 0.130737
[304]	valid_0's l1: 0.13068
[305]	valid_0's l1: 0.130642
[306]	valid_0's l1: 0.130633
[307]	valid_0's l1: 0.130617
[308]	valid_0's l1: 0.130542
[309]	valid_0's l1: 0.130516
[310]	valid_0's l1: 0.130494
[311]	valid_0's l1: 0.130458
[312]	valid_0's l1: 0.130436
[313]	valid_0's l1: 0.130415
[314]	valid_0's l1: 0.13037
[315]	valid_0's l1: 0.130346
[316]	valid_0's l1: 0.130317
[317]	valid_0's l1: 0.130273
[318]	valid_0's l1: 0.130239
[319]	valid_0's l1: 0.130209
[320]	valid_0's l1: 0.130186
[321]	valid_0's l1: 0.130147
[322]	valid_0's l

[579]	valid_0's l1: 0.124996
[580]	valid_0's l1: 0.124979
[581]	valid_0's l1: 0.124966
[582]	valid_0's l1: 0.124938
[583]	valid_0's l1: 0.12494
[584]	valid_0's l1: 0.124931
[585]	valid_0's l1: 0.124927
[586]	valid_0's l1: 0.124898
[587]	valid_0's l1: 0.124885
[588]	valid_0's l1: 0.124885
[589]	valid_0's l1: 0.124876
[590]	valid_0's l1: 0.124871
[591]	valid_0's l1: 0.124871
[592]	valid_0's l1: 0.124853
[593]	valid_0's l1: 0.124832
[594]	valid_0's l1: 0.124826
[595]	valid_0's l1: 0.124815
[596]	valid_0's l1: 0.124795
[597]	valid_0's l1: 0.124786
[598]	valid_0's l1: 0.124759
[599]	valid_0's l1: 0.12475
[600]	valid_0's l1: 0.124737
[601]	valid_0's l1: 0.124729
[602]	valid_0's l1: 0.124718
[603]	valid_0's l1: 0.124706
[604]	valid_0's l1: 0.12471
[605]	valid_0's l1: 0.124703
[606]	valid_0's l1: 0.124699
[607]	valid_0's l1: 0.124698
[608]	valid_0's l1: 0.124692
[609]	valid_0's l1: 0.124678
[610]	valid_0's l1: 0.124661
[611]	valid_0's l1: 0.124656
[612]	valid_0's l1: 0.124636
[613]	valid_0's l

[867]	valid_0's l1: 0.122454
[868]	valid_0's l1: 0.12245
[869]	valid_0's l1: 0.122449
[870]	valid_0's l1: 0.122448
[871]	valid_0's l1: 0.122446
[872]	valid_0's l1: 0.122435
[873]	valid_0's l1: 0.122432
[874]	valid_0's l1: 0.122431
[875]	valid_0's l1: 0.122429
[876]	valid_0's l1: 0.122427
[877]	valid_0's l1: 0.122421
[878]	valid_0's l1: 0.122415
[879]	valid_0's l1: 0.122412
[880]	valid_0's l1: 0.122411
[881]	valid_0's l1: 0.122394
[882]	valid_0's l1: 0.122388
[883]	valid_0's l1: 0.122376
[884]	valid_0's l1: 0.122375
[885]	valid_0's l1: 0.122375
[886]	valid_0's l1: 0.122375
[887]	valid_0's l1: 0.122374
[888]	valid_0's l1: 0.122375
[889]	valid_0's l1: 0.122372
[890]	valid_0's l1: 0.122374
[891]	valid_0's l1: 0.122373
[892]	valid_0's l1: 0.122366
[893]	valid_0's l1: 0.122365
[894]	valid_0's l1: 0.122365
[895]	valid_0's l1: 0.122343
[896]	valid_0's l1: 0.122347
[897]	valid_0's l1: 0.122341
[898]	valid_0's l1: 0.122318
[899]	valid_0's l1: 0.122315
[900]	valid_0's l1: 0.122315
[901]	valid_0's

[1160]	valid_0's l1: 0.121101
[1161]	valid_0's l1: 0.121091
[1162]	valid_0's l1: 0.121075
[1163]	valid_0's l1: 0.121068
[1164]	valid_0's l1: 0.121058
[1165]	valid_0's l1: 0.121056
[1166]	valid_0's l1: 0.121048
[1167]	valid_0's l1: 0.121046
[1168]	valid_0's l1: 0.121038
[1169]	valid_0's l1: 0.121038
[1170]	valid_0's l1: 0.121029
[1171]	valid_0's l1: 0.121026
[1172]	valid_0's l1: 0.12102
[1173]	valid_0's l1: 0.121019
[1174]	valid_0's l1: 0.121008
[1175]	valid_0's l1: 0.121007
[1176]	valid_0's l1: 0.121004
[1177]	valid_0's l1: 0.121005
[1178]	valid_0's l1: 0.121007
[1179]	valid_0's l1: 0.121003
[1180]	valid_0's l1: 0.121001
[1181]	valid_0's l1: 0.120993
[1182]	valid_0's l1: 0.120991
[1183]	valid_0's l1: 0.120992
[1184]	valid_0's l1: 0.120988
[1185]	valid_0's l1: 0.120988
[1186]	valid_0's l1: 0.120989
[1187]	valid_0's l1: 0.12099
[1188]	valid_0's l1: 0.12099
[1189]	valid_0's l1: 0.12099
[1190]	valid_0's l1: 0.12099
[1191]	valid_0's l1: 0.12099
[1192]	valid_0's l1: 0.120987
[1193]	valid_0's

[1445]	valid_0's l1: 0.120289
[1446]	valid_0's l1: 0.120286
[1447]	valid_0's l1: 0.120288
[1448]	valid_0's l1: 0.120288
[1449]	valid_0's l1: 0.120288
[1450]	valid_0's l1: 0.120286
[1451]	valid_0's l1: 0.120285
[1452]	valid_0's l1: 0.120285
[1453]	valid_0's l1: 0.120284
[1454]	valid_0's l1: 0.120282
[1455]	valid_0's l1: 0.120276
[1456]	valid_0's l1: 0.120273
[1457]	valid_0's l1: 0.120267
[1458]	valid_0's l1: 0.120267
[1459]	valid_0's l1: 0.12027
[1460]	valid_0's l1: 0.120263
[1461]	valid_0's l1: 0.120261
[1462]	valid_0's l1: 0.120259
[1463]	valid_0's l1: 0.120257
[1464]	valid_0's l1: 0.120251
[1465]	valid_0's l1: 0.120251
[1466]	valid_0's l1: 0.12025
[1467]	valid_0's l1: 0.120247
[1468]	valid_0's l1: 0.120247
[1469]	valid_0's l1: 0.120248
[1470]	valid_0's l1: 0.120248
[1471]	valid_0's l1: 0.120245
[1472]	valid_0's l1: 0.120239
[1473]	valid_0's l1: 0.120236
[1474]	valid_0's l1: 0.120235
[1475]	valid_0's l1: 0.120231
[1476]	valid_0's l1: 0.120228
[1477]	valid_0's l1: 0.120228
[1478]	valid

[1724]	valid_0's l1: 0.119652
[1725]	valid_0's l1: 0.119664
[1726]	valid_0's l1: 0.119665
[1727]	valid_0's l1: 0.119663
[1728]	valid_0's l1: 0.119663
[1729]	valid_0's l1: 0.119663
[1730]	valid_0's l1: 0.119663
[1731]	valid_0's l1: 0.119663
[1732]	valid_0's l1: 0.119664
[1733]	valid_0's l1: 0.119652
[1734]	valid_0's l1: 0.119652
[1735]	valid_0's l1: 0.119648
[1736]	valid_0's l1: 0.119645
[1737]	valid_0's l1: 0.119631
[1738]	valid_0's l1: 0.119626
[1739]	valid_0's l1: 0.119624
[1740]	valid_0's l1: 0.119624
[1741]	valid_0's l1: 0.119621
[1742]	valid_0's l1: 0.119622
[1743]	valid_0's l1: 0.119623
[1744]	valid_0's l1: 0.119617
[1745]	valid_0's l1: 0.119616
[1746]	valid_0's l1: 0.119617
[1747]	valid_0's l1: 0.119616
[1748]	valid_0's l1: 0.119613
[1749]	valid_0's l1: 0.119607
[1750]	valid_0's l1: 0.119606
[1751]	valid_0's l1: 0.119607
[1752]	valid_0's l1: 0.119591
[1753]	valid_0's l1: 0.11959
[1754]	valid_0's l1: 0.119589
[1755]	valid_0's l1: 0.119587
[1756]	valid_0's l1: 0.119582
[1757]	vali

[1999]	valid_0's l1: 0.119152
[2000]	valid_0's l1: 0.119152
[2001]	valid_0's l1: 0.119152
[2002]	valid_0's l1: 0.119154
[2003]	valid_0's l1: 0.119149
[2004]	valid_0's l1: 0.119149
[2005]	valid_0's l1: 0.119149
[2006]	valid_0's l1: 0.119138
[2007]	valid_0's l1: 0.119123
[2008]	valid_0's l1: 0.119121
[2009]	valid_0's l1: 0.119122
[2010]	valid_0's l1: 0.119121
[2011]	valid_0's l1: 0.11912
[2012]	valid_0's l1: 0.119119
[2013]	valid_0's l1: 0.11912
[2014]	valid_0's l1: 0.119119
[2015]	valid_0's l1: 0.119121
[2016]	valid_0's l1: 0.119117
[2017]	valid_0's l1: 0.119117
[2018]	valid_0's l1: 0.119118
[2019]	valid_0's l1: 0.119119
[2020]	valid_0's l1: 0.11912
[2021]	valid_0's l1: 0.119118
[2022]	valid_0's l1: 0.119117
[2023]	valid_0's l1: 0.119117
[2024]	valid_0's l1: 0.119116
[2025]	valid_0's l1: 0.119113
[2026]	valid_0's l1: 0.119113
[2027]	valid_0's l1: 0.119109
[2028]	valid_0's l1: 0.119108
[2029]	valid_0's l1: 0.119107
[2030]	valid_0's l1: 0.119105
[2031]	valid_0's l1: 0.119105
[2032]	valid_

[2276]	valid_0's l1: 0.118701
[2277]	valid_0's l1: 0.118702
[2278]	valid_0's l1: 0.118703
[2279]	valid_0's l1: 0.118702
[2280]	valid_0's l1: 0.118704
[2281]	valid_0's l1: 0.118704
[2282]	valid_0's l1: 0.118702
[2283]	valid_0's l1: 0.1187
[2284]	valid_0's l1: 0.1187
[2285]	valid_0's l1: 0.118696
[2286]	valid_0's l1: 0.118694
[2287]	valid_0's l1: 0.118693
[2288]	valid_0's l1: 0.11869
[2289]	valid_0's l1: 0.118689
[2290]	valid_0's l1: 0.118691
[2291]	valid_0's l1: 0.118691
[2292]	valid_0's l1: 0.11869
[2293]	valid_0's l1: 0.118689
[2294]	valid_0's l1: 0.118692
[2295]	valid_0's l1: 0.118692
[2296]	valid_0's l1: 0.118693
[2297]	valid_0's l1: 0.118692
[2298]	valid_0's l1: 0.118695
[2299]	valid_0's l1: 0.118692
[2300]	valid_0's l1: 0.118692
[2301]	valid_0's l1: 0.118685
[2302]	valid_0's l1: 0.118687
[2303]	valid_0's l1: 0.118685
[2304]	valid_0's l1: 0.118685
[2305]	valid_0's l1: 0.118686
[2306]	valid_0's l1: 0.118683
[2307]	valid_0's l1: 0.118681
[2308]	valid_0's l1: 0.118681
[2309]	valid_0's

[2560]	valid_0's l1: 0.118332
[2561]	valid_0's l1: 0.11833
[2562]	valid_0's l1: 0.118333
[2563]	valid_0's l1: 0.118331
[2564]	valid_0's l1: 0.11833
[2565]	valid_0's l1: 0.11833
[2566]	valid_0's l1: 0.118329
[2567]	valid_0's l1: 0.118329
[2568]	valid_0's l1: 0.118327
[2569]	valid_0's l1: 0.118326
[2570]	valid_0's l1: 0.118324
[2571]	valid_0's l1: 0.118324
[2572]	valid_0's l1: 0.118323
[2573]	valid_0's l1: 0.118321
[2574]	valid_0's l1: 0.118319
[2575]	valid_0's l1: 0.118317
[2576]	valid_0's l1: 0.118316
[2577]	valid_0's l1: 0.118314
[2578]	valid_0's l1: 0.118314
[2579]	valid_0's l1: 0.118313
[2580]	valid_0's l1: 0.118312
[2581]	valid_0's l1: 0.118307
[2582]	valid_0's l1: 0.118305
[2583]	valid_0's l1: 0.118305
[2584]	valid_0's l1: 0.118306
[2585]	valid_0's l1: 0.118305
[2586]	valid_0's l1: 0.118304
[2587]	valid_0's l1: 0.118303
[2588]	valid_0's l1: 0.1183
[2589]	valid_0's l1: 0.118299
[2590]	valid_0's l1: 0.118298
[2591]	valid_0's l1: 0.118298
[2592]	valid_0's l1: 0.118298
[2593]	valid_0'

[2842]	valid_0's l1: 0.118103
[2843]	valid_0's l1: 0.118099
[2844]	valid_0's l1: 0.118106
[2845]	valid_0's l1: 0.118105
[2846]	valid_0's l1: 0.118104
[2847]	valid_0's l1: 0.118105
[2848]	valid_0's l1: 0.118105
[2849]	valid_0's l1: 0.118104
[2850]	valid_0's l1: 0.118105
[2851]	valid_0's l1: 0.118105
[2852]	valid_0's l1: 0.118109
[2853]	valid_0's l1: 0.118109
[2854]	valid_0's l1: 0.118108
[2855]	valid_0's l1: 0.118108
[2856]	valid_0's l1: 0.118108
[2857]	valid_0's l1: 0.118109
[2858]	valid_0's l1: 0.118109
[2859]	valid_0's l1: 0.118109
[2860]	valid_0's l1: 0.118109
[2861]	valid_0's l1: 0.11811
[2862]	valid_0's l1: 0.11811
[2863]	valid_0's l1: 0.11811
[2864]	valid_0's l1: 0.11811
[2865]	valid_0's l1: 0.11811
[2866]	valid_0's l1: 0.11811
[2867]	valid_0's l1: 0.11811
[2868]	valid_0's l1: 0.11811
[2869]	valid_0's l1: 0.118107
[2870]	valid_0's l1: 0.118107
[2871]	valid_0's l1: 0.118108
[2872]	valid_0's l1: 0.118107
[2873]	valid_0's l1: 0.118108
[2874]	valid_0's l1: 0.118107
[2875]	valid_0's l

[3124]	valid_0's l1: 0.117939
[3125]	valid_0's l1: 0.117939
[3126]	valid_0's l1: 0.117938
[3127]	valid_0's l1: 0.117939
[3128]	valid_0's l1: 0.117946
[3129]	valid_0's l1: 0.117946
[3130]	valid_0's l1: 0.117945
[3131]	valid_0's l1: 0.117944
[3132]	valid_0's l1: 0.117944
[3133]	valid_0's l1: 0.117943
[3134]	valid_0's l1: 0.117942
[3135]	valid_0's l1: 0.117942
[3136]	valid_0's l1: 0.11794
[3137]	valid_0's l1: 0.117939
[3138]	valid_0's l1: 0.117938
[3139]	valid_0's l1: 0.11794
[3140]	valid_0's l1: 0.11794
[3141]	valid_0's l1: 0.11794
[3142]	valid_0's l1: 0.11794
[3143]	valid_0's l1: 0.117937
[3144]	valid_0's l1: 0.117936
[3145]	valid_0's l1: 0.117937
[3146]	valid_0's l1: 0.117937
[3147]	valid_0's l1: 0.117936
[3148]	valid_0's l1: 0.117936
[3149]	valid_0's l1: 0.117936
[3150]	valid_0's l1: 0.117935
[3151]	valid_0's l1: 0.117935
[3152]	valid_0's l1: 0.117934
[3153]	valid_0's l1: 0.117937
[3154]	valid_0's l1: 0.117937
[3155]	valid_0's l1: 0.117937
[3156]	valid_0's l1: 0.117936
[3157]	valid_0'

[3400]	valid_0's l1: 0.117767
[3401]	valid_0's l1: 0.117771
[3402]	valid_0's l1: 0.11777
[3403]	valid_0's l1: 0.11777
[3404]	valid_0's l1: 0.11777
[3405]	valid_0's l1: 0.117762
[3406]	valid_0's l1: 0.117761
[3407]	valid_0's l1: 0.117758
[3408]	valid_0's l1: 0.117757
[3409]	valid_0's l1: 0.117758
[3410]	valid_0's l1: 0.117756
[3411]	valid_0's l1: 0.117755
[3412]	valid_0's l1: 0.117752
[3413]	valid_0's l1: 0.11775
[3414]	valid_0's l1: 0.117751
[3415]	valid_0's l1: 0.117748
[3416]	valid_0's l1: 0.117748
[3417]	valid_0's l1: 0.117746
[3418]	valid_0's l1: 0.117746
[3419]	valid_0's l1: 0.117745
[3420]	valid_0's l1: 0.117745
[3421]	valid_0's l1: 0.117744
[3422]	valid_0's l1: 0.117744
[3423]	valid_0's l1: 0.117744
[3424]	valid_0's l1: 0.117742
[3425]	valid_0's l1: 0.117741
[3426]	valid_0's l1: 0.117744
[3427]	valid_0's l1: 0.117743
[3428]	valid_0's l1: 0.117743
[3429]	valid_0's l1: 0.117743
[3430]	valid_0's l1: 0.117743
[3431]	valid_0's l1: 0.117743
[3432]	valid_0's l1: 0.117743
[3433]	valid_0

In [75]:
%%time
import eli5
from eli5.sklearn import PermutationImportance
y_valid_log = np.log1p(y_valid)
perm = PermutationImportance(imputed_lgb_model, random_state=42).fit(X_valid, y_valid_log)
eli5.show_weights(perm, feature_names=X_valid.columns.tolist(), top=40)



CPU times: user 29min 28s, sys: 6.86 s, total: 29min 35s
Wall time: 8min 12s


Weight,Feature
0.4060  ± 0.0122,v_3
0.2170  ± 0.0043,v_12
0.0807  ± 0.0023,v_0
0.0785  ± 0.0022,regDate_year
0.0506  ± 0.0004,v_10
0.0125  ± 0.0007,v_8
0.0116  ± 0.0010,kilometer
0.0093  ± 0.0010,power
0.0088  ± 0.0010,notRepairedDamage
0.0084  ± 0.0007,v_6


In [101]:
X_train.columns

Index(['name', 'model', 'brand', 'bodyType', 'fuelType', 'gearbox', 'power',
       'kilometer', 'notRepairedDamage', 'regionCode', 'v_0', 'v_1', 'v_2',
       'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12',
       'v_13', 'v_14', 'regDate_month', 'regDate_day', 'regDate_year',
       'creatDate_month', 'creatDate_day', 'creatDate_year', 'model_cb',
       'brand_cb', 'bodyType_cb', 'fuelType_cb', 'gearbox_cb',
       'notRepairedDamage_cb'],
      dtype='object')

In [103]:
#remove outliers (which are not shown in test data analized from my another EDA task)
X_train = X_train.loc[(X_train.v_12<=13)&(X_train.v_13<=6)&(X_train.v_14<=4)]
y_train = y_train[X_train.index]
X_valid = X_valid.loc[(X_valid.v_12<=13)&(X_valid.v_13<=6)&(X_valid.v_14<=4)]
y_valid = y_valid[X_valid.index]
low_imps_cols = ['bodyType_cb','gearbox','bodyType','fuelType','regDate_month',
                 'fuelType_cb','regionCode','gearbox_cb','creatDate_day',
                 'creatDate_month', 'creatDate_year','notRepairedDamage_cb', 
                 'regDate_day']
# low_imps_cols = ['fuelType','name','regDate_day', 'regDate_month', 'regionCode', 'gearbox', 'creatDate_month','creatDate_year','creatDate_day']
X_train = X_train.drop(low_imps_cols, axis=1)
X_valid = X_valid.drop(low_imps_cols, axis=1)
cols = X_train.columns

imputer_selected = SimpleImputer()
X_train = pd.DataFrame(imputer_selected.fit_transform(X_train))
X_train.columns = cols
X_valid = pd.DataFrame(imputer_selected.transform(X_valid))
X_valid.columns = cols


In [None]:

imputed_selected_lgb_model = LGBMRegressor(n_estimators=10000, 
                          n_jobs=4, 
                          random_state=42,
                          objective='mae')

imputed_selected_lgb_model.fit(X_train, np.log1p(y_train),
                      eval_set=[(X_valid, np.log1p(y_valid))], 
                      eval_metric='mae', 
                      early_stopping_rounds=42)

validate(imputed_selected_lgb_model, X_train, X_valid, y_train, y_valid)

[1]	valid_0's l1: 0.909089
Training until validation scores don't improve for 42 rounds
[2]	valid_0's l1: 0.833597
[3]	valid_0's l1: 0.765122
[4]	valid_0's l1: 0.704179
[5]	valid_0's l1: 0.649217
[6]	valid_0's l1: 0.599862
[7]	valid_0's l1: 0.557838
[8]	valid_0's l1: 0.519135
[9]	valid_0's l1: 0.485176
[10]	valid_0's l1: 0.455091
[11]	valid_0's l1: 0.427644
[12]	valid_0's l1: 0.403063
[13]	valid_0's l1: 0.38069
[14]	valid_0's l1: 0.360512
[15]	valid_0's l1: 0.343042
[16]	valid_0's l1: 0.326631
[17]	valid_0's l1: 0.311996
[18]	valid_0's l1: 0.299363
[19]	valid_0's l1: 0.287594
[20]	valid_0's l1: 0.276012
[21]	valid_0's l1: 0.267144
[22]	valid_0's l1: 0.257512
[23]	valid_0's l1: 0.249581
[24]	valid_0's l1: 0.241996
[25]	valid_0's l1: 0.234811
[26]	valid_0's l1: 0.228036
[27]	valid_0's l1: 0.222876
[28]	valid_0's l1: 0.217931
[29]	valid_0's l1: 0.212898
[30]	valid_0's l1: 0.208619
[31]	valid_0's l1: 0.204704
[32]	valid_0's l1: 0.201174
[33]	valid_0's l1: 0.197959
[34]	valid_0's l1: 0.1953

[287]	valid_0's l1: 0.131894
[288]	valid_0's l1: 0.13186
[289]	valid_0's l1: 0.131846
[290]	valid_0's l1: 0.131828
[291]	valid_0's l1: 0.131784
[292]	valid_0's l1: 0.131728
[293]	valid_0's l1: 0.13169
[294]	valid_0's l1: 0.131643
[295]	valid_0's l1: 0.131612
[296]	valid_0's l1: 0.131575
[297]	valid_0's l1: 0.131561
[298]	valid_0's l1: 0.131534
[299]	valid_0's l1: 0.131493
[300]	valid_0's l1: 0.131465
[301]	valid_0's l1: 0.131449
[302]	valid_0's l1: 0.131416
[303]	valid_0's l1: 0.131388
[304]	valid_0's l1: 0.131367
[305]	valid_0's l1: 0.131356
[306]	valid_0's l1: 0.131339
[307]	valid_0's l1: 0.131282
[308]	valid_0's l1: 0.131227
[309]	valid_0's l1: 0.131194
[310]	valid_0's l1: 0.131143
[311]	valid_0's l1: 0.131108
[312]	valid_0's l1: 0.131046
[313]	valid_0's l1: 0.131022
[314]	valid_0's l1: 0.131016
[315]	valid_0's l1: 0.131002
[316]	valid_0's l1: 0.130977
[317]	valid_0's l1: 0.130922
[318]	valid_0's l1: 0.1309
[319]	valid_0's l1: 0.130844
[320]	valid_0's l1: 0.130803
[321]	valid_0's l1

[580]	valid_0's l1: 0.125853
[581]	valid_0's l1: 0.125852
[582]	valid_0's l1: 0.125846
[583]	valid_0's l1: 0.125841
[584]	valid_0's l1: 0.125831
[585]	valid_0's l1: 0.125825
[586]	valid_0's l1: 0.125823
[587]	valid_0's l1: 0.125813
[588]	valid_0's l1: 0.125807
[589]	valid_0's l1: 0.125775
[590]	valid_0's l1: 0.125755
[591]	valid_0's l1: 0.125753
[592]	valid_0's l1: 0.125751
[593]	valid_0's l1: 0.125716
[594]	valid_0's l1: 0.125714
[595]	valid_0's l1: 0.12571
[596]	valid_0's l1: 0.125707
[597]	valid_0's l1: 0.125675
[598]	valid_0's l1: 0.125664
[599]	valid_0's l1: 0.125656
[600]	valid_0's l1: 0.125646
[601]	valid_0's l1: 0.125635
[602]	valid_0's l1: 0.125622
[603]	valid_0's l1: 0.125619
[604]	valid_0's l1: 0.125615
[605]	valid_0's l1: 0.125579
[606]	valid_0's l1: 0.12557
[607]	valid_0's l1: 0.125559
[608]	valid_0's l1: 0.125533
[609]	valid_0's l1: 0.125519
[610]	valid_0's l1: 0.125511
[611]	valid_0's l1: 0.12549
[612]	valid_0's l1: 0.125484
[613]	valid_0's l1: 0.125462
[614]	valid_0's l

In [28]:
# final build
data_test = pd.read_csv('./data/used_car_testA_20200313_revised.csv', index_col='SaleID')
X_full = data.copy()
X_test = data_test.copy()
for col in date_cols:
    X_full[col] = pd.to_datetime(X_full[col], format='%Y%m%d',errors='coerce')
    X_full[col+'_month'] = X_full[col].dt.month
    X_full[col+'_day'] = X_full[col].dt.day
    X_full[col+'_year'] = X_full[col].dt.year
    
    X_test[col] = pd.to_datetime(X_test[col], format='%Y%m%d',errors='coerce')
    X_test[col+'_month'] = X_test[col].dt.month
    X_test[col+'_day'] = X_test[col].dt.day
    X_test[col+'_year'] = X_test[col].dt.year
    
X_full['notRepairedDamage'] = X_full['notRepairedDamage'].replace('-','NaN').astype('float64')
X_full.drop(bad_cols,inplace=True, axis=1)
X_full.drop(date_cols, inplace=True, axis=1)

X_test['notRepairedDamage'] = X_test['notRepairedDamage'].replace('-','NaN').astype('float64')
X_test.drop(bad_cols,inplace=True, axis=1)
X_test.drop(date_cols, inplace=True, axis=1)

In [29]:
X_full = X_full.drop(low_imps_cols, axis=1)
X_test = X_test.drop(low_imps_cols, axis=1)
cols = X_test.columns
y_full = X_full['price']
X_full.drop('price', inplace=True, axis=1)

final_imputer = SimpleImputer()
X_full = pd.DataFrame(final_imputer.fit_transform(X_full))
X_full.columns = cols
X_test = pd.DataFrame(final_imputer.transform(X_test))
X_test.columns = cols

baseline_model = LGBMRegressor(n_estimators=4450,
                              random_state=42,
                              n_jobs=4,
                              objective='mae')
baseline_model.fit(X_full, np.log1p(y_full))

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=4450, n_jobs=4, num_leaves=31, objective='mae',
              random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [31]:
preds = baseline_model.predict(X_test)
res = pd.DataFrame({'SaleID':X_test.index, 'price':np.exp(preds)})
res.to_csv('lgb_perm_log_baseline_submission.csv', index=False)