In [1]:
import sys
sys.path.append('..')
from src.utilities import *
from src.models.train_model import train_model
from src.models.param_opt import bayes_parameter_opt_lgb

## 3. Modelling

### 3.1 Load master table

In [2]:
# Read the processed master table that the rest of the notebook works from.
master_csv = os.path.join(processed_path, 'master.csv')
master = pd.read_csv(master_csv)

### 3.2 Train / val / test split

In [3]:
# Modelling frame: rows with a known `sales` value from 2020-06 onwards,
# without the `region` / `brand` columns.
# Filtering first and copying afterwards keeps `x_train` an independent frame,
# so the column assignment below cannot trigger a chained-assignment warning.
labelled_recent = master.sales.notna() & (master.month >= '2020-06')
x_train = master.loc[labelled_recent].drop(columns=['region', 'brand']).copy()

# `train` is 1.0 for months up to and including 2021-06 and 0.0 afterwards.
# NOTE(review): presumably `train_model` uses this flag to separate training
# rows from validation rows - confirm against its implementation.
x_train['train'] = (x_train.month <= '2021-06').astype(float)

# `month` was only needed for the filter and the flag above.
x_train = x_train.drop(columns='month')

### 3.3 Parameter search

In [4]:
# Disabled hyper-parameter search using `bayes_parameter_opt_lgb` on the
# training features and the `sales` target.
# NOTE(review): presumably this search produced the hard-coded `best_params`
# defined in the model-training cell - confirm before re-enabling.
# best_params = bayes_parameter_opt_lgb(X = x_train.drop(columns = ['sales', 'train']), y = x_train.sales,
#                                       init_round=20, opt_round=10, n_folds=5, random_seed=6, n_estimators=10000, 
#                                       learning_rate=0.01, save_path = '')
# best_params = best_params.max['params']
# best_params

### 3.4 Model training

In [4]:
##### Hyper-parameters found earlier, kept hard-coded
best_params = dict(
    metric='rmse',
    bagging_fraction=0.8006698513805078,
    feature_fraction=0.5893069765996153,
    lambda_l1=1.7269089775048563,
    lambda_l2=1.436385188986018,
    max_depth=17,
    min_child_weight=13.774836222637536,
    min_split_gain=0.03310192478518545,
    num_leaves=44,
)

##### Fit one quantile-regression lgb model per alpha level
quantile_alphas = [0.2, 0.5, 0.8]
lgb_quantiles = {}

for alpha in quantile_alphas:
    # Each model is saved under its own file name, keyed by the alpha level.
    model_file = os.path.join(models_path, f'model2_quantile_{alpha}.pkl')
    lgb_quantiles[alpha] = train_model(
        x_train,
        target_name='sales',
        model_type='lgb-quantile',
        quantile_alpha=alpha,
        params=best_params,
        metric='rmse',
        save_path=model_file,
    )

# Recorded rmse per alpha (training, validation) - presumably at the best iteration:
# 0.2 - 1663.31 2678.46
# 0.5 -  1069.4 1926.61
# 0.8 - 912.067 1618.18

##### Feature contributions of the alpha = 0.5 model, ranked by gain
median_model = lgb_quantiles[0.5]
feature_columns = x_train.drop(columns=['sales', 'train']).columns
importance = pd.DataFrame({
    'feature': feature_columns,
    'gain': median_model.feature_importance(importance_type='gain'),
    'split': median_model.feature_importance(importance_type='split'),
})
importance.sort_values('gain', ascending=False).head()

####################     training with  3926      ####################
####################     validating with  604      ####################
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7091
[LightGBM] [Info] Number of data points in the train set: 3926, number of used features: 87
[LightGBM] [Info] Start training from score 32.740002
[1]	training's rmse: 2948.64	valid_1's rmse: 4475.68
Training until validation scores don't improve for 20 rounds
[2]	training's rmse: 2924.32	valid_1's rmse: 4445
[3]	training's rmse: 2897.12	valid_1's rmse: 4410.75
[4]	training's rmse: 2870.73	valid_1's rmse: 4374.15
[5]	training's rmse: 2832.08	valid_1's rmse: 4320.89
[6]	training's rmse: 2793.75	valid_1's rmse: 4269.69
[7]	training's rmse: 2766.68	valid_1's rmse: 4226.36
[8]	training's rmse: 2729.15	valid_1's rmse: 4164.36
[9]	training's rmse: 2693.32	valid_1's rmse: 4116.19
[10]	training's rmse: 2667.96	valid_1's rmse: 4075.49
[11]	training's rmse: 2650.27	

[142]	training's rmse: 1874.79	valid_1's rmse: 2972.43
[143]	training's rmse: 1874.6	valid_1's rmse: 2972.43
[144]	training's rmse: 1873.65	valid_1's rmse: 2971.43
[145]	training's rmse: 1873.3	valid_1's rmse: 2972.2
[146]	training's rmse: 1872.65	valid_1's rmse: 2971.95
[147]	training's rmse: 1872.12	valid_1's rmse: 2971.34
[148]	training's rmse: 1870.65	valid_1's rmse: 2970.48
[149]	training's rmse: 1870.69	valid_1's rmse: 2970.61
[150]	training's rmse: 1870.37	valid_1's rmse: 2970.94
[151]	training's rmse: 1868.44	valid_1's rmse: 2970.18
[152]	training's rmse: 1868.22	valid_1's rmse: 2970.03
[153]	training's rmse: 1868.13	valid_1's rmse: 2969.91
[154]	training's rmse: 1867.96	valid_1's rmse: 2969.84
[155]	training's rmse: 1867.66	valid_1's rmse: 2970
[156]	training's rmse: 1867.58	valid_1's rmse: 2970
[157]	training's rmse: 1867.28	valid_1's rmse: 2969.26
[158]	training's rmse: 1867.2	valid_1's rmse: 2969.32
[159]	training's rmse: 1863.92	valid_1's rmse: 2965.84
[160]	training's rms

[301]	training's rmse: 1793.1	valid_1's rmse: 2882.14
[302]	training's rmse: 1792.89	valid_1's rmse: 2882.08
[303]	training's rmse: 1792.44	valid_1's rmse: 2880.14
[304]	training's rmse: 1792.29	valid_1's rmse: 2880.26
[305]	training's rmse: 1787.42	valid_1's rmse: 2872.31
[306]	training's rmse: 1784.84	valid_1's rmse: 2867.28
[307]	training's rmse: 1782.25	valid_1's rmse: 2860.83
[308]	training's rmse: 1781.23	valid_1's rmse: 2859.62
[309]	training's rmse: 1781.22	valid_1's rmse: 2859.51
[310]	training's rmse: 1779.62	valid_1's rmse: 2859.5
[311]	training's rmse: 1776.18	valid_1's rmse: 2856.28
[312]	training's rmse: 1772.76	valid_1's rmse: 2852
[313]	training's rmse: 1771.98	valid_1's rmse: 2850.62
[314]	training's rmse: 1769.04	valid_1's rmse: 2843.88
[315]	training's rmse: 1767.82	valid_1's rmse: 2835.07
[316]	training's rmse: 1766.6	valid_1's rmse: 2826.52
[317]	training's rmse: 1766.16	valid_1's rmse: 2826.43
[318]	training's rmse: 1765.99	valid_1's rmse: 2826.21
[319]	training's

[459]	training's rmse: 1698.17	valid_1's rmse: 2726.1
[460]	training's rmse: 1698.13	valid_1's rmse: 2726.08
[461]	training's rmse: 1698.11	valid_1's rmse: 2726.01
[462]	training's rmse: 1698.05	valid_1's rmse: 2725.99
[463]	training's rmse: 1697.91	valid_1's rmse: 2725.99
[464]	training's rmse: 1697.73	valid_1's rmse: 2726.04
[465]	training's rmse: 1697.73	valid_1's rmse: 2726.02
[466]	training's rmse: 1697.61	valid_1's rmse: 2725.87
[467]	training's rmse: 1697.31	valid_1's rmse: 2725.46
[468]	training's rmse: 1697.28	valid_1's rmse: 2725.45
[469]	training's rmse: 1697.14	valid_1's rmse: 2725.25
[470]	training's rmse: 1696.75	valid_1's rmse: 2724.92
[471]	training's rmse: 1696.76	valid_1's rmse: 2724.92
[472]	training's rmse: 1696.74	valid_1's rmse: 2724.9
[473]	training's rmse: 1696.74	valid_1's rmse: 2724.91
[474]	training's rmse: 1696.32	valid_1's rmse: 2724.78
[475]	training's rmse: 1695.85	valid_1's rmse: 2724.04
[476]	training's rmse: 1690.54	valid_1's rmse: 2715.64
[477]	traini

[43]	training's rmse: 1401.16	valid_1's rmse: 2320.2
[44]	training's rmse: 1398.37	valid_1's rmse: 2317.9
[45]	training's rmse: 1392.91	valid_1's rmse: 2309.78
[46]	training's rmse: 1390.14	valid_1's rmse: 2308.57
[47]	training's rmse: 1386.7	valid_1's rmse: 2304.78
[48]	training's rmse: 1384.72	valid_1's rmse: 2304.29
[49]	training's rmse: 1383.24	valid_1's rmse: 2301.82
[50]	training's rmse: 1382.59	valid_1's rmse: 2302.26
[51]	training's rmse: 1380.67	valid_1's rmse: 2298.12
[52]	training's rmse: 1380.06	valid_1's rmse: 2297.56
[53]	training's rmse: 1379.72	valid_1's rmse: 2297.64
[54]	training's rmse: 1373.02	valid_1's rmse: 2282.74
[55]	training's rmse: 1367.47	valid_1's rmse: 2277.49
[56]	training's rmse: 1365.64	valid_1's rmse: 2277.25
[57]	training's rmse: 1364.29	valid_1's rmse: 2275.67
[58]	training's rmse: 1362.8	valid_1's rmse: 2275.51
[59]	training's rmse: 1361.03	valid_1's rmse: 2273.59
[60]	training's rmse: 1358.44	valid_1's rmse: 2269.32
[61]	training's rmse: 1357.47	va

[202]	training's rmse: 1102.69	valid_1's rmse: 1961.17
[203]	training's rmse: 1102.17	valid_1's rmse: 1961.13
[204]	training's rmse: 1102.09	valid_1's rmse: 1961.14
[205]	training's rmse: 1102.01	valid_1's rmse: 1961.12
[206]	training's rmse: 1101.91	valid_1's rmse: 1961.07
[207]	training's rmse: 1101.65	valid_1's rmse: 1960.95
[208]	training's rmse: 1101.61	valid_1's rmse: 1961.19
[209]	training's rmse: 1100.89	valid_1's rmse: 1961.27
[210]	training's rmse: 1100.7	valid_1's rmse: 1960.73
[211]	training's rmse: 1100.17	valid_1's rmse: 1960.6
[212]	training's rmse: 1099.87	valid_1's rmse: 1960.26
[213]	training's rmse: 1099.68	valid_1's rmse: 1960.19
[214]	training's rmse: 1099.62	valid_1's rmse: 1960.36
[215]	training's rmse: 1099.6	valid_1's rmse: 1960.36
[216]	training's rmse: 1099.15	valid_1's rmse: 1960.19
[217]	training's rmse: 1099.05	valid_1's rmse: 1960.19
[218]	training's rmse: 1098.35	valid_1's rmse: 1959.63
[219]	training's rmse: 1097.74	valid_1's rmse: 1959.54
[220]	trainin

[77]	training's rmse: 971.348	valid_1's rmse: 1652.11
[78]	training's rmse: 970.739	valid_1's rmse: 1651.96
[79]	training's rmse: 969.24	valid_1's rmse: 1651.81
[80]	training's rmse: 967.802	valid_1's rmse: 1651.06
[81]	training's rmse: 966.885	valid_1's rmse: 1650.71
[82]	training's rmse: 966.405	valid_1's rmse: 1650.07
[83]	training's rmse: 966.098	valid_1's rmse: 1650.02
[84]	training's rmse: 965.571	valid_1's rmse: 1649.5
[85]	training's rmse: 965.22	valid_1's rmse: 1649.24
[86]	training's rmse: 964.684	valid_1's rmse: 1648.69
[87]	training's rmse: 964.624	valid_1's rmse: 1648.66
[88]	training's rmse: 964.191	valid_1's rmse: 1648.57
[89]	training's rmse: 963.652	valid_1's rmse: 1648.21
[90]	training's rmse: 963.119	valid_1's rmse: 1647.74
[91]	training's rmse: 962.542	valid_1's rmse: 1647.61
[92]	training's rmse: 961.671	valid_1's rmse: 1647.29
[93]	training's rmse: 960.839	valid_1's rmse: 1646.76
[94]	training's rmse: 960.67	valid_1's rmse: 1646.6
[95]	training's rmse: 960.054	val

[268]	training's rmse: 913.562	valid_1's rmse: 1622.05
[269]	training's rmse: 913.413	valid_1's rmse: 1622.02
[270]	training's rmse: 913.356	valid_1's rmse: 1621.92
[271]	training's rmse: 912.847	valid_1's rmse: 1621.53
[272]	training's rmse: 912.79	valid_1's rmse: 1621.59
[273]	training's rmse: 912.603	valid_1's rmse: 1621.71
[274]	training's rmse: 912.392	valid_1's rmse: 1621.64
[275]	training's rmse: 912.267	valid_1's rmse: 1621.59
[276]	training's rmse: 912.058	valid_1's rmse: 1621.49
Early stopping, best iteration is:
[256]	training's rmse: 912.067	valid_1's rmse: 1618.18


Unnamed: 0,feature,gain,split
0,month_indicator,10952.906425,223
19,sales_univ_b12_market_3mo,3684.414754,75
8,sales_univ_brand_1mo,1985.127547,411
22,sales_univ_b12_market_trend_3mo,1612.203864,40
31,sales_univ_b3_3mo,978.02335,36


In [None]:
# Disabled plotting scaffold: a helper that scatters train/test points over a
# ground-truth curve, followed by a loop drawing one prediction line per
# quantile model.
# NOTE(review): it refers to `lgb_quantile_alphas`, while the training cell
# stores the models in `lgb_quantiles`; `ground_truth`, `x_plot`, `low` and
# `high` are not defined in the visible cells. Adapt before re-enabling.
# def plot_data(x_plot, X_train, X_test, y_train, y_test, low, high):
#     """plot training and testing data"""
#     s = 15
#     plt.plot(x_plot, ground_truth(x_plot), alpha=0.5, label='ground truth')
#     plt.scatter(X_train, y_train, s=s, alpha=0.2)
#     plt.scatter(X_test, y_test, s=s, alpha=0.2, color='red')
#     plt.xlim((low, high))
#     plt.ylabel('y')
#     plt.xlabel('x')
#     plt.legend(loc='upper left')
#     plt.show()
    
    
    
# for quantile_alpha, lgb in lgb_quantile_alphas.items():
#     plt.plot(x_plot, lgb.predict(x_plot[:, np.newaxis]),
#              label='LGB quantile alpha: {}'.format(quantile_alpha),
#              alpha=0.9, linewidth=2)

# plot_data(x_plot, X_train, X_test, y_train, y_test, low, high)

### 3.5 Prediction storage

In [5]:
# Rows to predict: `sales` is missing and the month is 2020-07 or later.
submission = master[(master.sales.isna()) & (master.month >= '2020-07')].copy()

# Build the model inputs once, before any prediction column is added, so the
# list of dropped columns does not have to track the columns created below.
features = submission.drop(columns=['month', 'region', 'brand', 'sales'])

# Point forecast from the 0.5 model, interval bounds from the 0.2 / 0.8 models.
for column, level in (('sales', 0.5), ('lower', 0.2), ('upper', 0.8)):
    submission[column] = lgb_quantiles[level].predict(features)

# Keep only the submission columns; copy so the in-place clipping below works
# on an independent frame and does not raise a chained-assignment warning.
submission = submission[['month', 'region', 'brand', 'sales', 'lower', 'upper']].copy()

# Negative predictions are replaced by zero.
# NOTE(review): the three models are fitted separately and nothing here
# enforces lower <= sales <= upper; after clipping, a `sales` of 0 can sit
# below a tiny positive `lower`. Decide whether the bounds should be reordered.
for column in ('sales', 'lower', 'upper'):
    submission.loc[submission[column] < 0, column] = 0
submission.head()

Unnamed: 0,month,region,brand,sales,lower,upper
2714,2020-07,region_151,brand_1,8.886604,4.66403e-24,211.424671
2715,2020-07,region_151,brand_2,8.886604,4.66403e-24,211.424671
2716,2020-07,region_152,brand_1,0.0,4.66403e-24,463.115304
2717,2020-07,region_152,brand_2,0.0,4.66403e-24,463.115304
2718,2020-07,region_153,brand_1,83.770384,4.66403e-24,505.151868


In [6]:
# Write the submission table to the results folder, without the index column.
submission_file = os.path.join(results_path, 'submission4_team46.csv')
submission.to_csv(submission_file, index=False)

In [7]:
# Mean width of the prediction interval (upper minus lower) over all rows.
# Values recorded for previous runs:
# Submission 2 difference - 1815.437323083508
# Submission 3 difference - 1613.6602449420075
# Submission 4 difference - 1145.0976140382802
submission['upper'].sub(submission['lower']).mean()

1145.0976140382802