In [1]:
import sys
sys.path.append('..')
from src.utilities import *
from src.models.train_model import train_model
from src.models.param_opt import bayes_parameter_opt_lgb

## 3. Modelling

### 3.1 Load master table

In [2]:
master = pd.read_csv(os.path.join(processed_path, 'master.csv'))

### 3.2 Train / val / test split

In [3]:
x_train = master[(master.sales.isna()==False)].drop(columns = ['region', 'brand']).copy()
x_train = x_train[x_train.month >= '2020-06']
x_train['train'] =  (x_train.month <= '2021-06').astype(float)
x_train.drop(columns = 'month', inplace = True)

### 3.3 Parameter search

In [4]:
# best_params = bayes_parameter_opt_lgb(X = x_train.drop(columns = ['sales', 'train']), y = x_train.sales,
#                                       init_round=30, opt_round=30, n_folds=6, random_seed=6, n_estimators=10000, 
#                                       learning_rate=0.01, save_path = '')
# best_params = best_params.max['params']
# best_params

### 3.4 Model training

In [4]:
##### Define best parameters found earlier
best_params = {'metric': 'rmse',
               'bagging_fraction': 1,
               'feature_fraction': 0.9,
               'lambda_l1': 0,
               'lambda_l2': 0,
               'max_depth': 13,
               'min_child_weight': 7.720200312255985,
               'min_split_gain': 0.1,
               'num_leaves': 45}

##### Train a quantile regression lgb at different alpha levels
quantile_alphas = [0.2, 0.5, 0.8]
lgb_quantiles = {}

for alpha in quantile_alphas:
    current_model = train_model(x_train, target_name = 'sales', 
                                model_type = 'lgb-quantile', 
                                quantile_alpha = alpha,
                                params = best_params, 
                                metric = 'rmse', 
                                split = 'in_sample',
                                save_path = os.path.join(models_path, 
                                                         'model2_quantile_'+ str(alpha)+'.pkl'))
    lgb_quantiles[alpha] = current_model
    
##### Visualize feature contributions
feature_contributions = pd.DataFrame({'feature': x_train.drop(columns = ['sales', 'train']).columns, 
                                      'gain': lgb_quantiles[0.5].feature_importance(importance_type = 'gain'),
                                      'split': lgb_quantiles[0.5].feature_importance(importance_type = 'split')
                                     }).sort_values('gain', ascending = False)

feature_contributions[feature_contributions.gain>0][:40]

# 0.2 - 1663.31 2678.46
# 0.5 -  1069.4 1926.61
# 0.8 - 912.067 1618.18

TypeError: train_model() got an unexpected keyword argument 'split'

### 3.5 Prediction storage

In [7]:
submission = master[(master.sales.isna()) & (master.month >= '2020-07')].copy()
submission['sales'] = lgb_quantiles[0.5].predict(submission.drop(columns = ['month', 'region', 'brand', 'sales']))
submission['lower'] = lgb_quantiles[0.2].predict(submission.drop(columns = ['month', 'region', 'brand', 'sales']))
submission['upper'] = lgb_quantiles[0.8].predict(submission.drop(columns = ['month', 'region', 'brand', 'sales', 'lower']))
submission = submission[['month', 'region', 'brand', 'sales', 'lower', 'upper']]

submission.loc[submission.sales < 0, 'sales'] = 0
submission.loc[submission.lower < 0, 'lower'] = 0
submission.loc[submission.upper < 0, 'upper'] = 0
submission.head()

Unnamed: 0,month,region,brand,sales,lower,upper
2714,2020-07,region_151,brand_1,9.42772,1.759659e-28,0.0
2715,2020-07,region_151,brand_2,9.42772,1.759659e-28,0.0
2716,2020-07,region_152,brand_1,35.910898,1.759659e-28,580.265151
2717,2020-07,region_152,brand_2,35.910898,1.759659e-28,580.265151
2718,2020-07,region_153,brand_1,57.125015,1.759659e-28,92.882347


In [8]:
submission.to_csv(os.path.join(results_path, 'submission5_team46.csv'), index = False)

In [9]:
(submission['upper'] - submission['lower']).mean()

# Submission 2 difference - 1815.437323083508
# Submission 3 difference - 1613.6602449420075
# Submission 4 difference - 1145.0976140382802
# Submission 5 difference - 1017.6503796655638

1017.6503796655638