In [1]:
import os, sys, math, gc
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
import seaborn as sns
import lightgbm as lgb
import pickle as pkl
from utils.utils import merge_eval_sold_on_df, sort_df_on_d, WRMSSE, RMSSE, _down_cast, data_preprocessing, diff_lists, log_status
from utils.utils import customIter, cross_validation_on_validation_set, ensemble_submissions, ensemble_submissions_uncertainty

from utils.configure_logger import configure_logger
configure_logger()
from logging import getLogger
logger = getLogger(__name__)

import warnings
warnings.simplefilter("ignore")

In [45]:
# --- raw competition input files ---
DATA_BASE_PATH = "../data/m5-forecasting-accuracy/"
SALES_EVALUATION = "sales_train_evaluation.csv"
SALES_VALIDATION = "sales_train_validation.csv"
CALENDAR = "calendar.csv"
SAMPLE_SUBMISSION = "sample_submission.csv"
SELL_PRICES = "sell_prices.csv"

# --- precomputed feature files ---
PRECOMPUTED_BASE_PATH = "../data/uncertainty/features/"
TEST_PATH = 'test/'


def precomputed_name(store, eval_val):
    """Return the pickle file name used for a (store, eval/val) feature set."""
    return f"processed_{store}_{eval_val}.pkl"


# --- output locations ---
PREDICTION_BASE_PATH = '../data/uncertainty/temp_submissions/'
SUBMISSION_BASE_PATH = '../data/uncertainty/final_submissions/'

# number of days in one forecast window
DAYS: int = 28

# first day number (d_<n>) of the validation / evaluation forecast window
SUB_D_START_VAL: int = 1914
SUB_D_START_EVAL: int = 1914 + 28

# identifier / target columns that are always present after feature processing
# (they are needed for the training target and the submission format);
# they are dropped from the frame before it is handed to a model
DROP_FEATURE_COLUMNS = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'sold']

# quantile levels for which a separate model is trained (a list of floats, not an int)
QUANTILES: list = [0.005, 0.025, 0.165, 0.25, 0.50, 0.75, 0.835, 0.975, 0.995]

# grouping columns that define each aggregation level
AGG_LEVEL_COLUMNS = {
    "Level1": [], # no grouping, sum of all
    "Level2": ['state_id'],
    "Level3": ['store_id'],
    "Level4": ['cat_id'],
    "Level5": ['dept_id'],
    "Level6": ['state_id', 'cat_id'],
    "Level7": ['state_id', 'dept_id'],
    "Level8": ['store_id', 'cat_id'],
    "Level9": ['store_id', 'dept_id'],
    "Level10": ['item_id'],
    "Level11": ['state_id', 'item_id'],
    "Level12": ['item_id','store_id'],
}

### Define GridSearch functions

###

In [4]:
@log_status
def grid_search(params: dict, param_grid: dict, train_data: lgb.Dataset, validation_data: lgb.Dataset):
    """
    Train one LightGBM model per combination of the values in ``param_grid``
    and report the combination with the lowest validation RMSE.

    Parameters
    ----------
    params : dict
        Base LightGBM parameters shared by every candidate. The validation
        score is read from the "rmse" entry of the recorded evaluation
        history, so "rmse" must be among the evaluated metrics.
    param_grid : dict
        Maps a parameter name to the list of values to try. When a name also
        appears in ``params``, the grid value takes precedence.
    train_data : lgb.Dataset
        Data used to fit each candidate.
    validation_data : lgb.Dataset
        Data used to score each candidate.

    Returns
    -------
    tuple
        ``(best, results)``: ``best`` is the result entry (keys "params",
        "validation_score", "model") with the lowest validation RMSE, and
        ``results`` maps "combination_<i>" to the entry of every candidate.

    Raises
    ------
    ValueError
        If ``param_grid`` yields no combination (one of its value lists is empty).
    """
    import itertools

    grid_keys = list(param_grid.keys())
    results = {}
    for i, grid_values in enumerate(itertools.product(*param_grid.values()), 1):

        # base parameters first, grid values last: the value under test must
        # not be overwritten by a same-named entry of the base parameters
        param_combination = {**params, **dict(zip(grid_keys, grid_values))}

        # train lgb model; the evaluation history is collected through the
        # record_evaluation callback
        eval_history = {}
        mod: lgb.Booster = lgb.train(
            param_combination,
            train_set=train_data,
            valid_sets=[validation_data],
            callbacks=[lgb.record_evaluation(eval_history)],
        )

        # store results
        results[f"combination_{i}"] = {
            "params": param_combination,
            "validation_score": eval_history["valid_0"]["rmse"][-1],
            "model": mod,
        }

    if not results:
        raise ValueError("param_grid produced no parameter combinations")

    # RMSE is an error measure, so the best combination is the one with the smallest score
    best_result = min(results.values(), key=lambda entry: entry["validation_score"])
    return best_result, results

### Quick Run for Testing (Cross Validation)

In [None]:
# Read the precomputed feature table for the chosen split and pass it
# through _down_cast (presumably dtype down-casting to save memory).
type_of = 'val'
features = _down_cast(
    pd.read_parquet(f'{PRECOMPUTED_BASE_PATH}features_{type_of}.parquet')
)

In [13]:
# Base LightGBM settings shared by every grid-search candidate.
# (timing note kept from the original cell: total ~280 seconds)
params = dict(
    objective='tweedie',
    metric='rmse',  # evaluate with Root Mean Squared Error (RMSE)
    boosting_type='gbdt',
    random_state=43,
    verbose=1,
    n_jobs=4,
    tweedie_variance_power=1.1,  # Tweedie variance power (1 <= p <= 2)
    eval_at=10,
    # 'verbose_eval': 0
    subsample=0.5,
    subsample_freq=1,
    feature_fraction=0.5,
    boost_from_average=False,
)

# Values to combine in the grid search; the trailing comments keep the
# wider candidate lists from the original cell for reference.
param_grid = dict(
    num_leaves=[255],  # [int(2**i) for i in [5, 6, 8]]
    min_data_in_leaf=[255],  # [int(2**i -1) for i in [5, 6, 8]]
    learning_rate=[0.05],  # [0.04, 0.02, 0.01, 0.005]
    n_estimators=[45],  # [5000, 500, 100]
    tweedie_variance_power=[1.1],  # Tweedie variance power (1 <= p <= 2)
    # 'max_bin': [100]
)

In [4]:

# perform grid search for best parameters, one independent search per store
group_columns: list = ['store_id']
# the loop variables deliberately avoid the name `id`, which would shadow the
# builtin for every cell executed afterwards
for group_key, group_features in features.groupby(group_columns):
    # split data to training and testing; shuffle=False keeps the row order, so
    # the last 20% of the rows form the validation part (random_state has no
    # effect without shuffling and is therefore not passed)
    x_train, x_test, y_train, y_test = train_test_split(
        group_features.drop(DROP_FEATURE_COLUMNS, axis=1),
        group_features['sold'],
        train_size=.8,
        shuffle=False,
    )
    del group_features
    # wrap each part in a LightGBM dataset and drop the pandas objects right away
    train_data = lgb.Dataset(x_train, y_train)
    del x_train, y_train
    validation_data = lgb.Dataset(x_test, y_test)
    del x_test, y_test

    best_combination, results = grid_search(params, param_grid, train_data = train_data, validation_data = validation_data)
    print(best_combination)

# # train_best_model
# mod = lgb.train(best_combination["params"],
#     train_set = train_data
# )
# predictions = mod.predict(features_predict)
# df_pred = pd.DataFrame({
#     "prod_id": id_predict,
#     "d": d_list_predict,
#     "sold": predictions
# })

# plot_best_model: bool = True
# if plot_best_model:
#     fig, axs = plt.subplots(3,1, figsize = (25,10))
    
#     # first figure (feature importance)
#     lgb.plot_importance(best_combination["model"], ax = axs[0])
    
#     # second figure (resid)
#     pred = best_combination["model"].predict(x_test)
#     resid = y_test - pred
#     axs[1].plot(resid)
    
#     # third figure (resid)
#     axs[2].hist(resid, bins = 400)
#     axs[2].set_xlim(-25, 25)
    
#     plt.show()

# Train + Predict submission

In [55]:
def train_level_all_quantiles(agg_level: str, type_of: str, exclude_columns: list = None, test: bool = False):
    """
    Train one LightGBM quantile model per entry of QUANTILES for a single
    aggregation level and write the forecasts to a CSV file.

    Parameters
    ----------
    agg_level : str
        Key of AGG_LEVEL_COLUMNS (e.g. 'Level1'); selects the feature file and
        the grouping columns copied into the output.
    type_of : str
        'val' forecasts the window starting at SUB_D_START_VAL, any other value
        the window starting at SUB_D_START_EVAL. The value is also part of the
        feature file name and of the output file name.
    exclude_columns : list, optional
        Feature-name prefixes (the text before the first '_') to leave out of
        the model input. None or an empty list keeps every column.
    test : bool
        If True, the feature file is read from the TEST_PATH sub-folder.

    Returns
    -------
    None
        The predictions are written to PREDICTION_BASE_PATH as a long-format
        CSV with the columns Level, agg_column1, agg_column2, d, quantile,
        pred and type_of.
    """
    # None replaces a mutable `[]` default; callers that pass a list are unaffected
    exclude_prefix_list = list(exclude_columns) if exclude_columns else []  # unconditional, auto, momentum, seasonal

    agg_columns = AGG_LEVEL_COLUMNS[agg_level]
    if len(agg_columns) == 0:
        agg_str: str = 'Total_X'
    elif len(agg_columns) == 1:
        agg_str: str = f'{agg_columns[0]}_X'
    else:
        agg_str: str = '_'.join(agg_columns)

    # The features are always read from disk. The earlier
    # `try: features = pd.DataFrame(features)` looked up a local name before it
    # was assigned, so it always raised and fell through to this reload.
    logger.info('(re)loading features')
    features = pd.read_parquet(PRECOMPUTED_BASE_PATH + (TEST_PATH if test else '') + f'features_{type_of}_{agg_str}.parquet')
    features = _down_cast(features)

    # drop every column whose name prefix is excluded; the column selection
    # already yields a new frame, so no extra copy of the full table is needed
    features_gr = features[[c for c in features if c.split('_')[0] not in exclude_prefix_list]]
    del features

    group_columns = agg_columns
    do_grid_search = False
    res: list = []

    # preparations: rows of the 1460 days before the forecast window with a
    # known target are used for training, the DAYS days of the window for prediction
    sub_d_start = SUB_D_START_VAL if type_of == 'val' else SUB_D_START_EVAL
    train_days = [f'd_{sub_d_start - 1 - offset}' for offset in range(1460)]
    pred_days = [f'd_{sub_d_start + offset}' for offset in range(DAYS)]
    train_idx = features_gr['sold'].notna() & features_gr['d'].isin(train_days)
    pred_idx = features_gr['d'].isin(pred_days)
    df_train = features_gr[train_idx]
    df_pred = features_gr[pred_idx]
    features_train: pd.DataFrame = df_train.drop(DROP_FEATURE_COLUMNS, axis = 1, errors = 'ignore')
    targets_train: pd.Series = df_train['sold']
    features_predict: pd.DataFrame = df_pred.drop(DROP_FEATURE_COLUMNS, axis = 1, errors = 'ignore')

    # train model for all quantiles
    for quantile in QUANTILES:

        if do_grid_search:
            # NOTE(review): this branch uses the notebook-level `params` and
            # `param_grid` and does not set the quantile objective or `alpha`;
            # confirm that this is intended before enabling it.
            logger.info('divide for cross validation')
            x_train, x_test, y_train, y_test = train_test_split(features_train, targets_train, train_size=.8, shuffle=False, random_state=42)
            train_data = lgb.Dataset(x_train, y_train)
            validation_data = lgb.Dataset(x_test, y_test)
            logger.info('perform gridsearch')
            best_combination, results = grid_search(params, param_grid, train_data = train_data, validation_data = validation_data)
            del train_data, validation_data
            params_grid_train = best_combination["params"]
        else:
            params_grid_train = {
                'objective': 'quantile',
                'metric': 'quantile', # evaluate with the quantile (pinball) loss
                'boosting_type': 'gbdt',
                'random_state': 43,
                'verbose': -1,
                'n_jobs': 4,
                # NOTE(review): 'subsample' is documented as an alias of
                # 'bagging_fraction'; both are set here with different values —
                # confirm which one is intended.
                'subsample': 0.5,
                'subsample_freq': 1,
                "num_leaves": 7,#2**8-1,
                'min_data_in_leaf': 15,#2**8-1,
                'feature_fraction': 0.5, #.5
                'bagging_fraction': .8,
                "learning_rate": 0.2,
                "n_estimators": 100,
                # "max_bin": 100,
                'boost_from_average': False,
                # "tweedie_variance_power": 1.1, # Set the Tweedie variance power (1 <= p <= 2)

                'reg_sqrt': True,
                'alpha': quantile, # quantile level this model is fitted for
            }

        # train the model for this quantile and predict the forecast window
        mod = lgb.train(params_grid_train,
            train_set = lgb.Dataset(features_train, targets_train)
        )
        predictions = mod.predict(features_predict)

        # store predictions in long format, one row per (group, day)
        df_p = pd.DataFrame(
            {
                'pred': predictions,
                'd': df_pred['d'],
            }
        )
        df_p['quantile'] = quantile
        df_p['Level'] = agg_level
        df_p['type_of'] = 'validation' if type_of == 'val' else 'evaluation'
        if len(agg_columns) == 0:
            df_p['agg_column1'] = 'Total'
            df_p['agg_column2'] = 'X'
        elif len(agg_columns) == 1:
            df_p['agg_column1'] = df_pred[agg_columns[0]].values
            df_p['agg_column2'] = 'X'
        else:
            df_p['agg_column1'] = df_pred[agg_columns[0]].values
            df_p['agg_column2'] = df_pred[agg_columns[1]].values

        df_p = df_p[['Level', 'agg_column1', 'agg_column2', 'd', 'quantile', 'pred', 'type_of']]

        res.append(_down_cast(df_p))

    # storing predictions
    df_sub_val = pd.concat(res)
    group_names = '_'.join(group_columns)
    if group_names == '':
        group_names = 'Total_X'
    exclude_names = 'None' if len(exclude_prefix_list) == 0 else '_'.join(exclude_prefix_list)
    # build the output path once so the file written and the path logged cannot diverge
    output_path = PREDICTION_BASE_PATH + f'lgb_multivariate_{type_of}_non_transposed_{group_names}_exclude_{exclude_names}.csv'
    df_sub_val.to_csv(output_path, index = False)
    logger.info('saved under: ' + output_path)

In [56]:
# Train and store the quantile forecasts of every aggregation level for the
# validation window. NOTE(review): the merge step further down also reads
# 'eval' prediction files, which this loop does not produce.
for level_name in AGG_LEVEL_COLUMNS.keys():
    train_level_all_quantiles(level_name, type_of='val', exclude_columns=[])

2023-08-17 12:38:20 - __main__ - INFO - (re)loading features




2023-08-17 12:38:20 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_Total_X_exclude_None.csv
2023-08-17 12:38:20 - __main__ - INFO - (re)loading features




2023-08-17 12:38:21 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_state_id_exclude_None.csv
2023-08-17 12:38:21 - __main__ - INFO - (re)loading features




2023-08-17 12:38:22 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_store_id_exclude_None.csv
2023-08-17 12:38:22 - __main__ - INFO - (re)loading features




2023-08-17 12:38:23 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_cat_id_exclude_None.csv
2023-08-17 12:38:23 - __main__ - INFO - (re)loading features




2023-08-17 12:38:24 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_dept_id_exclude_None.csv
2023-08-17 12:38:24 - __main__ - INFO - (re)loading features




2023-08-17 12:38:25 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_state_id_cat_id_exclude_None.csv
2023-08-17 12:38:25 - __main__ - INFO - (re)loading features




2023-08-17 12:38:27 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_state_id_dept_id_exclude_None.csv
2023-08-17 12:38:27 - __main__ - INFO - (re)loading features




2023-08-17 12:38:29 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_store_id_cat_id_exclude_None.csv
2023-08-17 12:38:29 - __main__ - INFO - (re)loading features




2023-08-17 12:38:34 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_store_id_dept_id_exclude_None.csv
2023-08-17 12:38:34 - __main__ - INFO - (re)loading features




2023-08-17 12:40:30 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_item_id_exclude_None.csv
2023-08-17 12:40:30 - __main__ - INFO - (re)loading features




2023-08-17 12:49:15 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_state_id_item_id_exclude_None.csv
2023-08-17 12:49:15 - __main__ - INFO - (re)loading features




2023-08-17 13:21:42 - __main__ - INFO - saved under: ../data/uncertainty/temp_submissions/lgb_multivariate_val_non_transposed_item_id_store_id_exclude_None.csv


### Load val + eval prediction files and merge to one submission file

In [3]:
# Name part describing the excluded feature prefixes ('None' when nothing is excluded).
exclude_columns = '_'.join([]) or 'None'

# One validation prediction file per aggregation level; the level without
# grouping columns is stored under the name 'Total_X'.
dfs: list = [
    PREDICTION_BASE_PATH
    + 'lgb_multivariate_val_non_transposed_'
    + ('_'.join(level_columns) or 'Total_X')
    + f'_exclude_{exclude_columns}.csv'
    for level_columns in AGG_LEVEL_COLUMNS.values()
]

df_sub_val = ensemble_submissions_uncertainty(dfs)

# Reshape to the wide layout: one row per id, one F-column per forecast day.
transpose = True
if transpose:
    sub_validation = (
        df_sub_val
        .pivot(index='id', columns='d', values='pred')
        .reset_index(drop=False)
    )
    sub_validation.columns = ["id", *(f"F{day}" for day in range(1, DAYS + 1))]

In [56]:
# Name part describing the excluded feature prefixes ('None' when nothing is excluded).
exclude_columns = '_'.join([]) or 'None'

# One evaluation prediction file per aggregation level; the level without
# grouping columns is stored under the name 'Total_X'.
dfs: list = [
    PREDICTION_BASE_PATH
    + 'lgb_multivariate_eval_non_transposed_'
    + ('_'.join(level_columns) or 'Total_X')
    + f'_exclude_{exclude_columns}.csv'
    for level_columns in AGG_LEVEL_COLUMNS.values()
]

df_sub_eval = ensemble_submissions_uncertainty(dfs)

# Reshape to the wide layout: one row per id, one F-column per forecast day.
transpose = True
if transpose:
    sub_evaluation = (
        df_sub_eval
        .pivot(index='id', columns='d', values='pred')
        .reset_index(drop=False)
    )
    sub_evaluation.columns = ["id", *(f"F{day}" for day in range(1, DAYS + 1))]

In [57]:
# sub_evaluation = pd.read_csv('../submissions/submission_baseline_evaluation.csv').drop(['Unnamed: 0'], axis=1)
# Stack the validation and evaluation rows into a single submission file,
# then release both wide frames.
pd.concat([sub_validation, sub_evaluation]).to_csv(
    f'{SUBMISSION_BASE_PATH}submission_lgb_ensemble{exclude_columns}.csv',
    index=False,
)
del sub_validation, sub_evaluation

### For the validation predictions, we can compute the WSPL (the score evaluated in the cells below) locally

In [21]:
# Load the precomputed template with the realised sales.
d = pd.read_parquet('../data/uncertainty/cv_template/temp.parquet')

# Split the submission id on '.', e.g. 'Total_X.0.005_validation':
# the first part is the merge key, the last two parts carry the quantile.
id_parts = df_sub_val['id'].str.split('.')
df_sub_val['id_merge'] = id_parts.apply(lambda parts: parts[0])
df_sub_val['quantile'] = id_parts.apply(
    lambda parts: float(f"{parts[-2]}.{parts[-1].split('_')[0]}")
)
del id_parts

In [22]:
# Pass the template through _down_cast, then left-join the predictions onto
# it by merge key and day, so every template row is kept.
d = _down_cast(d)
p = d.merge(df_sub_val, how='left', on=['id_merge', 'd'])
p['id_merge'] = p['id_merge'].astype(str)

In [23]:
# Preview the merged rows of the first validation forecast day.
p.loc[p['d'] == 'd_1914'].head(5)

Unnamed: 0,Level,agg_column1,agg_column2,d,sold,revenue,id_merge,id,pred,quantile
1016,Level1,Total,X,d_1914,38784.0,123361.96875,Total_X,Total_X.0.005_validation,27088.0,0.005
1017,Level1,Total,X,d_1914,38784.0,123361.96875,Total_X,Total_X.0.025_validation,29776.0,0.025
1018,Level1,Total,X,d_1914,38784.0,123361.96875,Total_X,Total_X.0.175_validation,33184.0,0.175
1019,Level1,Total,X,d_1914,38784.0,123361.96875,Total_X,Total_X.0.25_validation,0.0,0.25
1020,Level1,Total,X,d_1914,38784.0,123361.96875,Total_X,Total_X.0.5_validation,0.0,0.5


In [35]:
# Score the merged template/prediction frame.
# NOTE(review): WSPL is not among the names imported in the notebook's import
# cell — confirm where it is defined, otherwise this cell fails on a fresh kernel.
WSPL(p)

Unnamed: 0,Level_id,agg_column1,agg_column2,Weight
0,Level1,Total,X,1.0
1,Level2,CA,X,0.442371
2,Level2,TX,X,0.269297
3,Level2,WI,X,0.288332
4,Level3,CA_1,X,0.110888


### The code below can be used to create the submission template
The submission template can be used to quickly insert your predictions.
It also contains all other (historical) sales, so that the WRMSSE can be computed.

In [4]:
# Read the raw competition tables.
sales_validation = pd.read_csv(f'{DATA_BASE_PATH}{SALES_VALIDATION}')
sales_evaluation = pd.read_csv(f'{DATA_BASE_PATH}{SALES_EVALUATION}')
calendar = pd.read_csv(f'{DATA_BASE_PATH}{CALENDAR}')
sell_prices = pd.read_csv(f'{DATA_BASE_PATH}{SELL_PRICES}')

# Preprocess each sales table and drop the raw frame as soon as it has been consumed.
df_val, submission_idx_val = data_preprocessing(sales_validation, calendar, sell_prices)
del sales_validation
df_eval, submission_idx_eval = data_preprocessing(sales_evaluation, calendar, sell_prices)
del sales_evaluation

# Keep only the rows whose wm_yr_wk is later than the release value
# (an additional filter on non-missing 'sold' was left disabled in the original cell).
df_val_after_release = df_val.loc[df_val['wm_yr_wk'] > df_val['release']]
del df_val
df_eval_after_release = df_eval.loc[df_eval['wm_yr_wk'] > df_eval['release']]
del df_eval

In [7]:
# Build the template: units sold and revenue summed per aggregation level and
# day, with an id_merge key made of the two aggregation column values.
dfs = []
df_eval_after_release['revenue'] = df_eval_after_release['sold'] * df_eval_after_release['sell_price']
for level, level_columns in AGG_LEVEL_COLUMNS.items():
    logger.info(level)
    level_totals = (
        df_eval_after_release
        .groupby(level_columns + ['d'])
        .agg({'sold': 'sum', 'revenue': 'sum'})
        .reset_index(drop=False)
    )

    # levels with fewer than two grouping columns are padded with 'Total' / 'X'
    if len(level_columns) == 0:
        agg_column1, agg_column2 = 'Total', 'X'
    elif len(level_columns) == 1:
        agg_column1, agg_column2 = level_totals[level_columns[0]], 'X'
    else:
        agg_column1, agg_column2 = level_totals[level_columns[0]], level_totals[level_columns[1]]

    level_frame = pd.DataFrame({
        'Level': level,
        'agg_column1': agg_column1,
        'agg_column2': agg_column2,
        'd': level_totals['d'],
        'sold': level_totals['sold'],
        'revenue': level_totals['revenue'],
    })
    # cast to str before concatenating so the key can also be built when the
    # grouping columns are not plain object-dtype strings
    level_frame['id_merge'] = level_frame['agg_column1'].astype(str) + '_' + level_frame['agg_column2'].astype(str)
    dfs.append(level_frame[['Level', 'agg_column1', 'agg_column2', 'd', 'sold', 'revenue', 'id_merge']])

d = pd.concat(dfs)
# NOTE(review): the scoring cell reads '../data/uncertainty/cv_template/temp.parquet',
# while this writes 'temp.parquet' into the working directory — confirm the intended location.
d.to_parquet('temp.parquet', index=False)

# the preview has to be the last expression of the cell, otherwise it is not displayed
d.head(50)

2023-08-17 17:27:39 - __main__ - INFO - Level1
2023-08-17 17:27:40 - __main__ - INFO - Level2
2023-08-17 17:27:44 - __main__ - INFO - Level3
2023-08-17 17:27:48 - __main__ - INFO - Level4
2023-08-17 17:27:51 - __main__ - INFO - Level5
2023-08-17 17:27:55 - __main__ - INFO - Level6
2023-08-17 17:28:00 - __main__ - INFO - Level7
2023-08-17 17:28:05 - __main__ - INFO - Level8
2023-08-17 17:28:10 - __main__ - INFO - Level9
2023-08-17 17:28:15 - __main__ - INFO - Level10
2023-08-17 17:28:21 - __main__ - INFO - Level11
2023-08-17 17:28:33 - __main__ - INFO - Level12


Unnamed: 0,Level,agg_column1,agg_column2,d,sold,revenue,id_merge
0,Level1,Total,X,d_10,24858.0,63029.78,Total_X
1,Level1,Total,X,d_100,23653.0,65665.71,Total_X
2,Level1,Total,X,d_1000,29241.0,82351.45,Total_X
3,Level1,Total,X,d_1001,33804.0,93975.55,Total_X
4,Level1,Total,X,d_1002,42447.0,118961.96,Total_X
5,Level1,Total,X,d_1003,40647.0,116052.48,Total_X
6,Level1,Total,X,d_1004,32039.0,89314.17,Total_X
7,Level1,Total,X,d_1005,29501.0,81688.96,Total_X
8,Level1,Total,X,d_1006,31117.0,85754.15,Total_X
9,Level1,Total,X,d_1007,27018.0,74244.86,Total_X
