In [1]:
import sys
sys.path.append('..')
from src.utilities import *
from src.models.train_model import train_model
from src.models.param_opt import bayes_parameter_opt_lgb

## 3. Modelling

### 3.1 Load master table

In [2]:
master = pd.read_csv(os.path.join(processed_path, 'master.csv'))

### 3.2 Train / val / test split

In [3]:
x_train = master[(master.sales.isna()==False)].drop(columns = ['region', 'brand']).copy()
x_train = x_train[x_train.month >= '2020-06']
x_train['train'] =  (x_train.month <= '2021-06').astype(float)
x_train.drop(columns = 'month', inplace = True)

### 3.3 Parameter search

In [None]:
best_params = bayes_parameter_opt_lgb(X = x_train.drop(columns = ['sales', 'train']), y = x_train.sales,
                                      init_round=30, opt_round=30, n_folds=6, random_seed=6, n_estimators=10000, 
                                      learning_rate=0.01, save_path = '')
best_params = best_params.max['params']
best_params

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 | max_depth | min_ch... | min_sp... | num_le... |
-------------------------------------------------------------------------------------------------------------------------
[200]	cv_agg's rmse: 1392.59 + 137.435
[400]	cv_agg's rmse: 1146.75 + 111.98
[600]	cv_agg's rmse: 1079.98 + 100.309
[800]	cv_agg's rmse: 1057.77 + 90.0014
[1000]	cv_agg's rmse: 1049.37 + 85.6933
[1200]	cv_agg's rmse: 1045.32 + 80.7004
[1400]	cv_agg's rmse: 1043.47 + 77.9744
[1600]	cv_agg's rmse: 1040.66 + 76.6596
[1800]	cv_agg's rmse: 1037.68 + 73.1575
[2000]	cv_agg's rmse: 1035.72 + 71.6243
[2200]	cv_agg's rmse: 1035.01 + 69.9853
[2400]	cv_agg's rmse: 1034.19 + 68.4196
[2600]	cv_agg's rmse: 1033.72 + 67.6738
[2800]	cv_agg's rmse: 1032.85 + 67.6785
| [0m 1       [0m | [0m-1.033e+0[0m | [0m 0.8227  [0m | [0m 0.8796  [0m | [0m 3.644   [0m | [0m 1.054   [0m | [0m 14.2    [0m | [0m 40.98   [0m | [0m 0.06491 [0m | [0m 27.44   [0m |

[1600]	cv_agg's rmse: 1076.96 + 86.3981
[1800]	cv_agg's rmse: 1076.9 + 83.2295
| [0m 14      [0m | [0m-1.076e+0[0m | [0m 0.9617  [0m | [0m 0.7368  [0m | [0m 1.4     [0m | [0m 1.366   [0m | [0m 10.17   [0m | [0m 47.9    [0m | [0m 0.03562 [0m | [0m 16.78   [0m |
[200]	cv_agg's rmse: 1242.02 + 121.97
[400]	cv_agg's rmse: 1029.04 + 98.4327
[600]	cv_agg's rmse: 988.999 + 88.9103
[800]	cv_agg's rmse: 980.561 + 85.0841
[1000]	cv_agg's rmse: 977.995 + 81.9543
[1200]	cv_agg's rmse: 977.481 + 78.6087
| [95m 15      [0m | [95m-977.4   [0m | [95m 0.8007  [0m | [95m 0.5893  [0m | [95m 1.727   [0m | [95m 1.436   [0m | [95m 17.78   [0m | [95m 13.77   [0m | [95m 0.0331  [0m | [95m 44.3    [0m |
[200]	cv_agg's rmse: 1268.25 + 115.724
[400]	cv_agg's rmse: 1056.6 + 88.6757
[600]	cv_agg's rmse: 1010.52 + 82.2249
[800]	cv_agg's rmse: 996.15 + 79.2465
[1000]	cv_agg's rmse: 990.55 + 75.3792
[1200]	cv_agg's rmse: 987.972 + 71.7532
[1400]	cv_agg's rmse: 987.19 + 70.1825


[200]	cv_agg's rmse: 1457.68 + 152.949
[400]	cv_agg's rmse: 1215.18 + 131.259
[600]	cv_agg's rmse: 1123.78 + 113.757
[800]	cv_agg's rmse: 1087.19 + 106.931
[1000]	cv_agg's rmse: 1071.98 + 101.03
[1200]	cv_agg's rmse: 1062.35 + 95.5769
[1400]	cv_agg's rmse: 1055.97 + 91.8281
[1600]	cv_agg's rmse: 1052.07 + 87.9847
[1800]	cv_agg's rmse: 1049.44 + 85.2246
[2000]	cv_agg's rmse: 1048.01 + 81.9686
[2200]	cv_agg's rmse: 1046.06 + 78.2565
[2400]	cv_agg's rmse: 1045.06 + 74.882
[2600]	cv_agg's rmse: 1042.52 + 71.6362
[2800]	cv_agg's rmse: 1041.23 + 69.5271
[3000]	cv_agg's rmse: 1040.94 + 68.1356
| [0m 29      [0m | [0m-1.041e+0[0m | [0m 0.9838  [0m | [0m 0.6513  [0m | [0m 1.408   [0m | [0m 2.742   [0m | [0m 7.491   [0m | [0m 48.95   [0m | [0m 0.0524  [0m | [0m 36.96   [0m |
[200]	cv_agg's rmse: 1266.21 + 127.321
[400]	cv_agg's rmse: 1035.72 + 104.549
[600]	cv_agg's rmse: 987.328 + 93.7545
[800]	cv_agg's rmse: 977.126 + 89.4761
[1000]	cv_agg's rmse: 973.897 + 85.8642
[1200]	c

### 3.4 Model training

In [4]:
##### Define best parameters found earlier
best_params = {'metric': 'rmse',
               'bagging_fraction': 1,
               'feature_fraction': 0.9,
               'lambda_l1': 0,
               'lambda_l2': 0,
               'max_depth': 13,
               'min_child_weight': 7.720200312255985,
               'min_split_gain': 0.1,
               'num_leaves': 45}

##### Train a quantile regression lgb at different alpha levels
quantile_alphas = [0.2, 0.5, 0.8]
lgb_quantiles = {}

for alpha in quantile_alphas:
    current_model = train_model(x_train, target_name = 'sales', 
                                model_type = 'lgb-quantile', 
                                quantile_alpha = alpha,
                                params = best_params, 
                                metric = 'rmse', 
                                split = 'in_sample',
                                save_path = os.path.join(models_path, 
                                                         'model2_quantile_'+ str(alpha)+'.pkl'))
    lgb_quantiles[alpha] = current_model
    
##### Visualize feature contributions
feature_contributions = pd.DataFrame({'feature': x_train.drop(columns = ['sales', 'train']).columns, 
                                      'gain': lgb_quantiles[0.5].feature_importance(importance_type = 'gain'),
                                      'split': lgb_quantiles[0.5].feature_importance(importance_type = 'split')
                                     }).sort_values('gain', ascending = False)

feature_contributions[feature_contributions.gain>0][:40]

# 0.2 - 1775.72 1706.28
# 0.5 -  1018.65 1163.06
# 0.8 -  826.974  1127.6

####################     training with  4077      ####################
####################     validating with  453      ####################
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15356
[LightGBM] [Info] Number of data points in the train set: 4077, number of used features: 183
[LightGBM] [Info] Start training from score 78.755775
[1]	training's rmse: 3182.37	valid_1's rmse: 2974.98
Training until validation scores don't improve for 20 rounds
[2]	training's rmse: 3158.44	valid_1's rmse: 2950.44
[3]	training's rmse: 3131.95	valid_1's rmse: 2923.85
[4]	training's rmse: 3094.73	valid_1's rmse: 2886.73
[5]	training's rmse: 3054.75	valid_1's rmse: 2847.36
[6]	training's rmse: 3006.41	valid_1's rmse: 2799.04
[7]	training's rmse: 2973.01	valid_1's rmse: 2765.49
[8]	training's rmse: 2947.92	valid_1's rmse: 2740.58
[9]	training's rmse: 2923.02	valid_1's rmse: 2715.33
[10]	training's rmse: 2896.37	valid_1's rmse: 2688.94
[11]	training's rmse: 286

[142]	training's rmse: 1964.16	valid_1's rmse: 1840.94
[143]	training's rmse: 1961.58	valid_1's rmse: 1840.35
[144]	training's rmse: 1961.31	valid_1's rmse: 1840.28
[145]	training's rmse: 1960.42	valid_1's rmse: 1840.42
[146]	training's rmse: 1959.44	valid_1's rmse: 1839.35
[147]	training's rmse: 1959.36	valid_1's rmse: 1839.29
[148]	training's rmse: 1959.03	valid_1's rmse: 1839.08
[149]	training's rmse: 1958.88	valid_1's rmse: 1838.96
[150]	training's rmse: 1956.88	valid_1's rmse: 1838.24
[151]	training's rmse: 1955.84	valid_1's rmse: 1837.74
[152]	training's rmse: 1955.13	valid_1's rmse: 1837.57
[153]	training's rmse: 1955.08	valid_1's rmse: 1837.59
[154]	training's rmse: 1954.03	valid_1's rmse: 1835.68
[155]	training's rmse: 1953.02	valid_1's rmse: 1834.22
[156]	training's rmse: 1952.88	valid_1's rmse: 1834.05
[157]	training's rmse: 1951.54	valid_1's rmse: 1832.64
[158]	training's rmse: 1951.07	valid_1's rmse: 1831.99
[159]	training's rmse: 1950.97	valid_1's rmse: 1832.06
[160]	trai

[302]	training's rmse: 1881.09	valid_1's rmse: 1783.67
[303]	training's rmse: 1881.09	valid_1's rmse: 1783.67
[304]	training's rmse: 1881.1	valid_1's rmse: 1783.69
[305]	training's rmse: 1881	valid_1's rmse: 1783.61
[306]	training's rmse: 1881	valid_1's rmse: 1783.58
[307]	training's rmse: 1880.97	valid_1's rmse: 1783.55
[308]	training's rmse: 1880.93	valid_1's rmse: 1783.52
[309]	training's rmse: 1880.82	valid_1's rmse: 1783.51
[310]	training's rmse: 1880.48	valid_1's rmse: 1782.92
[311]	training's rmse: 1880.29	valid_1's rmse: 1782.68
[312]	training's rmse: 1879.68	valid_1's rmse: 1782.05
[313]	training's rmse: 1879.41	valid_1's rmse: 1781.84
[314]	training's rmse: 1879.39	valid_1's rmse: 1781.86
[315]	training's rmse: 1879.4	valid_1's rmse: 1781.88
[316]	training's rmse: 1879.04	valid_1's rmse: 1781.48
[317]	training's rmse: 1878.66	valid_1's rmse: 1781.06
[318]	training's rmse: 1878.51	valid_1's rmse: 1780.91
[319]	training's rmse: 1878.21	valid_1's rmse: 1780.73
[320]	training's r

[470]	training's rmse: 1833.99	valid_1's rmse: 1748.62
[471]	training's rmse: 1833.95	valid_1's rmse: 1748.59
[472]	training's rmse: 1833.91	valid_1's rmse: 1748.6
[473]	training's rmse: 1833.59	valid_1's rmse: 1748.42
[474]	training's rmse: 1833.24	valid_1's rmse: 1748.29
[475]	training's rmse: 1833.18	valid_1's rmse: 1748.32
[476]	training's rmse: 1833.18	valid_1's rmse: 1748.42
[477]	training's rmse: 1832.36	valid_1's rmse: 1748.16
[478]	training's rmse: 1832.22	valid_1's rmse: 1747.96
[479]	training's rmse: 1832.21	valid_1's rmse: 1747.96
[480]	training's rmse: 1831.55	valid_1's rmse: 1747.99
[481]	training's rmse: 1831.34	valid_1's rmse: 1747.97
[482]	training's rmse: 1831.31	valid_1's rmse: 1747.93
[483]	training's rmse: 1831.3	valid_1's rmse: 1747.89
[484]	training's rmse: 1831.2	valid_1's rmse: 1747.76
[485]	training's rmse: 1831.03	valid_1's rmse: 1747.52
[486]	training's rmse: 1831.02	valid_1's rmse: 1747.52
[487]	training's rmse: 1831.02	valid_1's rmse: 1747.51
[488]	trainin

[623]	training's rmse: 1811.52	valid_1's rmse: 1734.05
[624]	training's rmse: 1811.49	valid_1's rmse: 1734.06
[625]	training's rmse: 1811.5	valid_1's rmse: 1734.08
[626]	training's rmse: 1810.12	valid_1's rmse: 1733.09
[627]	training's rmse: 1810.08	valid_1's rmse: 1733.06
[628]	training's rmse: 1810.02	valid_1's rmse: 1732.95
[629]	training's rmse: 1810.02	valid_1's rmse: 1732.93
[630]	training's rmse: 1810	valid_1's rmse: 1732.91
[631]	training's rmse: 1809.98	valid_1's rmse: 1732.93
[632]	training's rmse: 1809.97	valid_1's rmse: 1732.93
[633]	training's rmse: 1809.6	valid_1's rmse: 1732.62
[634]	training's rmse: 1809.25	valid_1's rmse: 1732.28
[635]	training's rmse: 1809.2	valid_1's rmse: 1732.25
[636]	training's rmse: 1809.21	valid_1's rmse: 1732.25
[637]	training's rmse: 1809.2	valid_1's rmse: 1732.29
[638]	training's rmse: 1809.06	valid_1's rmse: 1731.96
[639]	training's rmse: 1808.5	valid_1's rmse: 1731.63
[640]	training's rmse: 1807.75	valid_1's rmse: 1731.08
[641]	training's r

[781]	training's rmse: 1795.62	valid_1's rmse: 1717.86
[782]	training's rmse: 1794.82	valid_1's rmse: 1717.87
[783]	training's rmse: 1794.81	valid_1's rmse: 1717.87
[784]	training's rmse: 1794.75	valid_1's rmse: 1717.84
[785]	training's rmse: 1794.76	valid_1's rmse: 1717.9
[786]	training's rmse: 1794.75	valid_1's rmse: 1717.9
[787]	training's rmse: 1794.7	valid_1's rmse: 1717.85
[788]	training's rmse: 1794.69	valid_1's rmse: 1717.85
[789]	training's rmse: 1794.66	valid_1's rmse: 1717.83
[790]	training's rmse: 1794.42	valid_1's rmse: 1717.46
[791]	training's rmse: 1794.41	valid_1's rmse: 1717.45
[792]	training's rmse: 1794.28	valid_1's rmse: 1717.25
[793]	training's rmse: 1794.2	valid_1's rmse: 1717.28
[794]	training's rmse: 1794.23	valid_1's rmse: 1717.32
[795]	training's rmse: 1794.18	valid_1's rmse: 1717.34
[796]	training's rmse: 1794.17	valid_1's rmse: 1717.32
[797]	training's rmse: 1793.22	valid_1's rmse: 1716.68
[798]	training's rmse: 1793.19	valid_1's rmse: 1716.54
[799]	training

[928]	training's rmse: 1782.61	valid_1's rmse: 1710.36
[929]	training's rmse: 1782.55	valid_1's rmse: 1710.43
[930]	training's rmse: 1782.55	valid_1's rmse: 1710.44
[931]	training's rmse: 1782.55	valid_1's rmse: 1710.46
[932]	training's rmse: 1782.4	valid_1's rmse: 1710.4
[933]	training's rmse: 1781.9	valid_1's rmse: 1710.26
[934]	training's rmse: 1781.89	valid_1's rmse: 1710.26
[935]	training's rmse: 1781.85	valid_1's rmse: 1710.26
[936]	training's rmse: 1781.85	valid_1's rmse: 1710.26
[937]	training's rmse: 1781.66	valid_1's rmse: 1710.26
[938]	training's rmse: 1781.61	valid_1's rmse: 1710.24
[939]	training's rmse: 1780.59	valid_1's rmse: 1710.08
[940]	training's rmse: 1780.4	valid_1's rmse: 1710.03
[941]	training's rmse: 1780.33	valid_1's rmse: 1709.98
[942]	training's rmse: 1780.33	valid_1's rmse: 1709.98
[943]	training's rmse: 1780.25	valid_1's rmse: 1709.99
[944]	training's rmse: 1779.47	valid_1's rmse: 1709.38
[945]	training's rmse: 1779	valid_1's rmse: 1708.59
[946]	training's 

[77]	training's rmse: 1332.31	valid_1's rmse: 1323.22
[78]	training's rmse: 1326.98	valid_1's rmse: 1319.83
[79]	training's rmse: 1321.27	valid_1's rmse: 1319.32
[80]	training's rmse: 1319.69	valid_1's rmse: 1318.75
[81]	training's rmse: 1319.16	valid_1's rmse: 1318.97
[82]	training's rmse: 1317	valid_1's rmse: 1317.51
[83]	training's rmse: 1315.81	valid_1's rmse: 1317.32
[84]	training's rmse: 1314.76	valid_1's rmse: 1317.61
[85]	training's rmse: 1314.26	valid_1's rmse: 1317.34
[86]	training's rmse: 1312.24	valid_1's rmse: 1318.36
[87]	training's rmse: 1307.17	valid_1's rmse: 1319.25
[88]	training's rmse: 1305.45	valid_1's rmse: 1318.46
[89]	training's rmse: 1303.19	valid_1's rmse: 1318.37
[90]	training's rmse: 1303.01	valid_1's rmse: 1318.48
[91]	training's rmse: 1302.85	valid_1's rmse: 1318.68
[92]	training's rmse: 1302.75	valid_1's rmse: 1318.8
[93]	training's rmse: 1302.35	valid_1's rmse: 1319.03
[94]	training's rmse: 1301.39	valid_1's rmse: 1319.3
[95]	training's rmse: 1300.71	val

[32]	training's rmse: 975.287	valid_1's rmse: 1181.13
[33]	training's rmse: 967.214	valid_1's rmse: 1176.52
[34]	training's rmse: 961.102	valid_1's rmse: 1173.04
[35]	training's rmse: 955.095	valid_1's rmse: 1169.69
[36]	training's rmse: 949.259	valid_1's rmse: 1167.05
[37]	training's rmse: 944.255	valid_1's rmse: 1164.14
[38]	training's rmse: 939.406	valid_1's rmse: 1162
[39]	training's rmse: 934.017	valid_1's rmse: 1161.05
[40]	training's rmse: 928.906	valid_1's rmse: 1161.5
[41]	training's rmse: 924.604	valid_1's rmse: 1158.94
[42]	training's rmse: 921.227	valid_1's rmse: 1156.53
[43]	training's rmse: 918.099	valid_1's rmse: 1155.19
[44]	training's rmse: 914.578	valid_1's rmse: 1155.14
[45]	training's rmse: 911.486	valid_1's rmse: 1153.73
[46]	training's rmse: 908.758	valid_1's rmse: 1152.31
[47]	training's rmse: 906.448	valid_1's rmse: 1150.31
[48]	training's rmse: 903.792	valid_1's rmse: 1149.06
[49]	training's rmse: 901.064	valid_1's rmse: 1147.6
[50]	training's rmse: 899.037	val

Unnamed: 0,feature,gain,split
128,sales_region_b3_6mo,5569.320738,193
0,month_indicator,3305.678841,79
8,sales_univ_brand_1mo,2060.45468,170
123,sales_univ_brand_cumstd,924.506836,124
180,region_n_genpract_perperson,451.002684,153
181,region_n_intmedandgen_perperson,450.139364,143
178,region_n_intmedicine_perperson,433.722148,158
166,region_area,418.859455,148
167,region_pci16,399.778019,149
182,region_n_pediat_perperson,382.927233,153


### 3.5 Prediction storage

In [5]:
submission = master[(master.sales.isna()) & (master.month >= '2020-07')].copy()
submission['sales'] = lgb_quantiles[0.5].predict(submission.drop(columns = ['month', 'region', 'brand', 'sales']))
submission['lower'] = lgb_quantiles[0.2].predict(submission.drop(columns = ['month', 'region', 'brand', 'sales']))
submission['upper'] = lgb_quantiles[0.8].predict(submission.drop(columns = ['month', 'region', 'brand', 'sales', 'lower']))
submission = submission[['month', 'region', 'brand', 'sales', 'lower', 'upper']]

submission.loc[submission.sales < 0, 'sales'] = 0
submission.loc[submission.lower < 0, 'lower'] = 0
submission.loc[submission.upper < 0, 'upper'] = 0
submission.head()

Unnamed: 0,month,region,brand,sales,lower,upper
2714,2020-07,region_151,brand_1,0.0,9.476595e-35,186.045739
2715,2020-07,region_151,brand_2,0.0,9.476595e-35,186.045739
2716,2020-07,region_152,brand_1,34.962339,9.476595e-35,483.00679
2717,2020-07,region_152,brand_2,34.962339,9.476595e-35,483.00679
2718,2020-07,region_153,brand_1,72.211768,9.476595e-35,213.207448


In [6]:
submission.to_csv(os.path.join(results_path, 'submission5_team46.csv'), index = False)

In [7]:
(submission['upper'] - submission['lower']).mean()

# Submission 2 difference - 1815.437323083508
# Submission 3 difference - 1613.6602449420075
# Submission 4 difference - 1145.0976140382802
# Submission 5 difference - 1132.1785576554485

1132.1785576554485