In [23]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from typing import Callable

In [24]:
data = pd.read_csv('data/train.csv', parse_dates=['Date'])
stores = pd.read_csv('data/store.csv')
store_features = stores[['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'Promo2']]

data = data.merge(store_features.set_index('Store'), on='Store', how='left')
data = data.loc[data.Open == 1].drop(columns=['Open'])
data.head(5)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2
0,1,5,2015-07-31,5263,555,1,0,1,c,a,1270.0,0
1,2,5,2015-07-31,6064,625,1,0,1,a,a,570.0,1
2,3,5,2015-07-31,8314,821,1,0,1,a,a,14130.0,1
3,4,5,2015-07-31,13995,1498,1,0,1,c,c,620.0,0
4,5,5,2015-07-31,4822,559,1,0,1,a,a,29910.0,0


In [25]:
test_border = '2015-06-17'
val_border = '2015-05-01'

test = data.loc[data.Date >= test_border].drop(columns=['Customers'])

In [26]:
data['mean_sales'] = data.groupby('Store').Sales.transform('mean')
data['std_sales'] = data.groupby('Store').Sales.transform('std')
data['max_sales'] = data.groupby('Store').Sales.transform('max')
data['min_sales'] = data.groupby('Store').Sales.transform('min')

data['mean_customers'] = data.groupby('Store').Customers.transform('mean')
data['std_customers'] = data.groupby('Store').Customers.transform('std')
data['max_customers'] = data.groupby('Store').Customers.transform('max')
data['min_customers'] = data.groupby('Store').Customers.transform('min')

data['month'] = data.Date.dt.month
data['months_from_start'] = 12 * (data.Date.dt.year - data.Date.min().year) +\
                            (data.Date.dt.month - data.Date.min().month)

data = data.drop(columns=['Customers'])

data[['DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'Promo2', 'month']] =\
data[['DayOfWeek', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'Promo2', 'month']].astype('category')

In [27]:
data = data.loc[data.Sales != 0]
validation = data.loc[(data.Date > val_border) & (data.Date < test_border)]
train = data.loc[data.Date <= val_border]

In [28]:
validation

Unnamed: 0,Store,DayOfWeek,Date,Sales,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,...,mean_sales,std_sales,max_sales,min_sales,mean_customers,std_customers,max_customers,min_customers,month,months_from_start
50175,1,2,2015-06-16,4852,1,0,0,c,a,1270.0,...,4759.096031,1012.106393,9528,2362,564.049936,93.707476,1130,298,6,29
50176,2,2,2015-06-16,6243,1,0,0,a,a,570.0,...,4953.900510,1610.149102,10682,1919,583.998724,155.374483,1164,230,6,29
50177,3,2,2015-06-16,9780,1,0,0,a,a,14130.0,...,6942.568678,2193.383804,15689,2936,750.077022,170.280290,1579,381,6,29
50178,4,2,2015-06-16,10671,1,0,0,c,c,620.0,...,9638.401786,1936.031881,17412,5869,1321.752551,198.347844,2216,856,6,29
50179,5,2,2015-06-16,6354,1,0,0,a,a,29910.0,...,4676.274711,1765.745628,11692,1423,537.340180,165.604598,1081,180,6,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101460,1111,6,2015-05-02,3528,0,0,0,a,a,1900.0,...,5251.702182,1667.623037,12492,1773,451.711168,117.418078,893,157,5,28
101461,1112,6,2015-05-02,9803,0,0,0,c,c,1880.0,...,10236.577664,3334.921787,25165,4895,838.608472,191.043702,1661,465,5,28
101462,1113,6,2015-05-02,8037,0,0,0,a,c,9260.0,...,6627.859694,1456.489302,16115,4050,717.029337,124.544792,1642,474,5,28
101463,1114,6,2015-05-02,31445,0,0,0,a,c,870.0,...,20666.562500,3452.938601,35697,8880,3200.946429,441.865226,4911,1160,5,28


## Custom evaluation metric for LightGBM
Competition uses RMSPE (Root Mean Squared Percentage Error) as a metric to evaluate submissions. Thus it makes sense to use it to train our model as well. Here's a simple implementation of custom LightGBM evaluation metric:

In [36]:
def make_feval(f: Callable, name: str = None, higher_better: bool = False) -> Callable:
    """
    Function factory to transform @f to @feval required by LightGBM
    Args:
        f: function of 2 arguments (predictions, true_values) -> score
        name: name of function (f.name will be used if None)
        higher_better: True if higher score is better, otherwise False
    Returns:
        feval: function of 2 arguments (predictions, Dataset with true labels) -> (name, score, higher_better)
    """

    def feval(X: np.ndarray, Y: lgb.Dataset):
        return name if name is not None else f.__name__,\
               f(X, Y.get_label()),\
               higher_better

    return feval

def RMSPE(X: np.ndarray, Y: np.ndarray):
    return np.sqrt(np.mean(np.square(((Y - X) / Y))))

In [37]:
eval_set = lgb.Dataset(validation.drop(columns=['Sales', 'Date']), validation['Sales'])
train_set = lgb.Dataset(train.drop(columns=['Sales', 'Date']), train['Sales'])

params = {
    'objective': 'poisson',
    'num_iterations': 5000,
    'learning_rate': 0.05,
    'early_stopping_rounds': 30
}

model = lgb.train(
    params,
    train_set=train_set,
    valid_sets=[eval_set],
    feval=make_feval(RMSPE)
)


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2617
[LightGBM] [Info] Number of data points in the train set: 759868, number of used features: 19
[LightGBM] [Info] Start training from score 8.843374
[1]	valid_0's poisson: -58522.8	valid_0's RMSPE: 0.511084
Training until validation scores don't improve for 30 rounds
[2]	valid_0's poisson: -58547.5	valid_0's RMSPE: 0.501647
[3]	valid_0's poisson: -58570.8	valid_0's RMSPE: 0.492626
[4]	valid_0's poisson: -58592.4	valid_0's RMSPE: 0.483729
[5]	valid_0's poisson: -58612.9	valid_0's RMSPE: 0.475074
[6]	valid_0's poisson: -58632	valid_0's RMSPE: 0.466697
[7]	valid_0's poisson: -58650	valid_0's RMSPE: 0.458571
[8]	valid_0's poisson: -58667.2	valid_0's RMSPE: 0.450569
[9]	valid_0's poisson: -58683.2	valid_0's RMSPE: 0.442849
[10]	valid_0's poisson: -58698.5	valid_0's RMSPE: 0.435343
[11]	valid_0's poisson: -58712.9	valid_0's RMSPE: 0.428069


[139]	valid_0's poisson: -59005	valid_0's RMSPE: 0.190353
[140]	valid_0's poisson: -59005.3	valid_0's RMSPE: 0.190232
[141]	valid_0's poisson: -59005.4	valid_0's RMSPE: 0.190131
[142]	valid_0's poisson: -59005.6	valid_0's RMSPE: 0.189983
[143]	valid_0's poisson: -59006.1	valid_0's RMSPE: 0.189097
[144]	valid_0's poisson: -59006.3	valid_0's RMSPE: 0.188842
[145]	valid_0's poisson: -59006.6	valid_0's RMSPE: 0.188256
[146]	valid_0's poisson: -59006.9	valid_0's RMSPE: 0.187904
[147]	valid_0's poisson: -59007.2	valid_0's RMSPE: 0.187397
[148]	valid_0's poisson: -59007.4	valid_0's RMSPE: 0.187072
[149]	valid_0's poisson: -59007.7	valid_0's RMSPE: 0.18661
[150]	valid_0's poisson: -59007.9	valid_0's RMSPE: 0.186376
[151]	valid_0's poisson: -59008.1	valid_0's RMSPE: 0.1863
[152]	valid_0's poisson: -59008.3	valid_0's RMSPE: 0.185878
[153]	valid_0's poisson: -59008.7	valid_0's RMSPE: 0.185253
[154]	valid_0's poisson: -59008.9	valid_0's RMSPE: 0.18495
[155]	valid_0's poisson: -59009.3	valid_0's RM

[286]	valid_0's poisson: -59020.6	valid_0's RMSPE: 0.172186
[287]	valid_0's poisson: -59020.6	valid_0's RMSPE: 0.172177
[288]	valid_0's poisson: -59020.6	valid_0's RMSPE: 0.17223
[289]	valid_0's poisson: -59020.6	valid_0's RMSPE: 0.172211
[290]	valid_0's poisson: -59020.6	valid_0's RMSPE: 0.17202
[291]	valid_0's poisson: -59020.6	valid_0's RMSPE: 0.172074
[292]	valid_0's poisson: -59020.7	valid_0's RMSPE: 0.172076
[293]	valid_0's poisson: -59020.7	valid_0's RMSPE: 0.172073
[294]	valid_0's poisson: -59020.7	valid_0's RMSPE: 0.171992
[295]	valid_0's poisson: -59020.7	valid_0's RMSPE: 0.171977
[296]	valid_0's poisson: -59020.8	valid_0's RMSPE: 0.171997
[297]	valid_0's poisson: -59020.8	valid_0's RMSPE: 0.171958
[298]	valid_0's poisson: -59020.8	valid_0's RMSPE: 0.171893
[299]	valid_0's poisson: -59020.9	valid_0's RMSPE: 0.171935
[300]	valid_0's poisson: -59020.9	valid_0's RMSPE: 0.171944
[301]	valid_0's poisson: -59020.9	valid_0's RMSPE: 0.172009
[302]	valid_0's poisson: -59021.1	valid_0'

[428]	valid_0's poisson: -59026.3	valid_0's RMSPE: 0.165478
[429]	valid_0's poisson: -59026.3	valid_0's RMSPE: 0.165479
[430]	valid_0's poisson: -59026.3	valid_0's RMSPE: 0.165461
[431]	valid_0's poisson: -59026.3	valid_0's RMSPE: 0.165473
[432]	valid_0's poisson: -59026.4	valid_0's RMSPE: 0.165328
[433]	valid_0's poisson: -59026.5	valid_0's RMSPE: 0.165313
[434]	valid_0's poisson: -59026.5	valid_0's RMSPE: 0.165305
[435]	valid_0's poisson: -59026.6	valid_0's RMSPE: 0.165079
[436]	valid_0's poisson: -59026.6	valid_0's RMSPE: 0.165054
[437]	valid_0's poisson: -59026.7	valid_0's RMSPE: 0.164922
[438]	valid_0's poisson: -59026.7	valid_0's RMSPE: 0.164893
[439]	valid_0's poisson: -59026.7	valid_0's RMSPE: 0.164874
[440]	valid_0's poisson: -59026.8	valid_0's RMSPE: 0.164757
[441]	valid_0's poisson: -59026.9	valid_0's RMSPE: 0.164717
[442]	valid_0's poisson: -59027	valid_0's RMSPE: 0.164595
[443]	valid_0's poisson: -59027	valid_0's RMSPE: 0.164475
[444]	valid_0's poisson: -59027.1	valid_0's 

[569]	valid_0's poisson: -59030.8	valid_0's RMSPE: 0.156325
[570]	valid_0's poisson: -59030.8	valid_0's RMSPE: 0.156315
[571]	valid_0's poisson: -59030.8	valid_0's RMSPE: 0.156303
[572]	valid_0's poisson: -59030.8	valid_0's RMSPE: 0.156292
[573]	valid_0's poisson: -59030.9	valid_0's RMSPE: 0.156181
[574]	valid_0's poisson: -59030.9	valid_0's RMSPE: 0.156009
[575]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155978
[576]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155979
[577]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155772
[578]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155773
[579]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155759
[580]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155755
[581]	valid_0's poisson: -59031	valid_0's RMSPE: 0.155728
[582]	valid_0's poisson: -59031.1	valid_0's RMSPE: 0.15573
[583]	valid_0's poisson: -59031.1	valid_0's RMSPE: 0.155731
[584]	valid_0's poisson: -59031.1	valid_0's RMSPE: 0.155712
[585]	valid_0's poisson: -59031.2	valid_0's RMSPE: 0.15

[713]	valid_0's poisson: -59033.5	valid_0's RMSPE: 0.151585
[714]	valid_0's poisson: -59033.5	valid_0's RMSPE: 0.151577
[715]	valid_0's poisson: -59033.5	valid_0's RMSPE: 0.151517
[716]	valid_0's poisson: -59033.5	valid_0's RMSPE: 0.151514
[717]	valid_0's poisson: -59033.5	valid_0's RMSPE: 0.151507
[718]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151464
[719]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151443
[720]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151435
[721]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151425
[722]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151426
[723]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151425
[724]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151423
[725]	valid_0's poisson: -59033.6	valid_0's RMSPE: 0.151366
[726]	valid_0's poisson: -59033.7	valid_0's RMSPE: 0.151212
[727]	valid_0's poisson: -59033.7	valid_0's RMSPE: 0.151155
[728]	valid_0's poisson: -59033.7	valid_0's RMSPE: 0.151145
[729]	valid_0's poisson: -59033.7	valid_

[866]	valid_0's poisson: -59035.4	valid_0's RMSPE: 0.144796
[867]	valid_0's poisson: -59035.4	valid_0's RMSPE: 0.144786
[868]	valid_0's poisson: -59035.4	valid_0's RMSPE: 0.144813
[869]	valid_0's poisson: -59035.4	valid_0's RMSPE: 0.144806
[870]	valid_0's poisson: -59035.4	valid_0's RMSPE: 0.144748
[871]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144675
[872]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144661
[873]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.14466
[874]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144632
[875]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144625
[876]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144615
[877]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144608
[878]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144592
[879]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144586
[880]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144574
[881]	valid_0's poisson: -59035.5	valid_0's RMSPE: 0.144529
[882]	valid_0's poisson: -59035.5	valid_0

[1014]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.14171
[1015]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141687
[1016]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141681
[1017]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141679
[1018]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141656
[1019]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141649
[1020]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141641
[1021]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141627
[1022]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141627
[1023]	valid_0's poisson: -59037.1	valid_0's RMSPE: 0.141627
[1024]	valid_0's poisson: -59037.2	valid_0's RMSPE: 0.141625
[1025]	valid_0's poisson: -59037.2	valid_0's RMSPE: 0.141626
[1026]	valid_0's poisson: -59037.2	valid_0's RMSPE: 0.14162
[1027]	valid_0's poisson: -59037.2	valid_0's RMSPE: 0.141618
[1028]	valid_0's poisson: -59037.2	valid_0's RMSPE: 0.141609
[1029]	valid_0's poisson: -59037.2	valid_0's RMSPE: 0.141612
[1030]	valid_0's poisson: 

[1164]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139913
[1165]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139908
[1166]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139908
[1167]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139887
[1168]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139884
[1169]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139875
[1170]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139875
[1171]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139865
[1172]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139857
[1173]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139804
[1174]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139804
[1175]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139796
[1176]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139798
[1177]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139794
[1178]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139789
[1179]	valid_0's poisson: -59038.3	valid_0's RMSPE: 0.139776
[1180]	valid_0's poisson

[1303]	valid_0's poisson: -59039.6	valid_0's RMSPE: 0.137721
[1304]	valid_0's poisson: -59039.6	valid_0's RMSPE: 0.137718
[1305]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137696
[1306]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137688
[1307]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137686
[1308]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137684
[1309]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137683
[1310]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137675
[1311]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137673
[1312]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137667
[1313]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137615
[1314]	valid_0's poisson: -59039.7	valid_0's RMSPE: 0.137609
[1315]	valid_0's poisson: -59039.6	valid_0's RMSPE: 0.137645
[1316]	valid_0's poisson: -59039.6	valid_0's RMSPE: 0.137644
[1317]	valid_0's poisson: -59039.6	valid_0's RMSPE: 0.137643
[1318]	valid_0's poisson: -59039.6	valid_0's RMSPE: 0.13764
[1319]	valid_0's poisson: