In [1]:
import os, sys, math, gc
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
import seaborn as sns
import lightgbm as lgb
import pickle as pkl
from utils.utils import merge_eval_sold_on_df, sort_df_on_d, WRMSSE, RMSSE, _down_cast, data_preprocessing, diff_lists, log_status
from utils.utils import customIter, cross_validation_on_validation_set, ensemble_submissions

from utils.configure_logger import configure_logger
configure_logger()
from logging import getLogger
logger = getLogger(__name__)

import warnings
warnings.simplefilter("ignore")

In [11]:
# --- raw competition files ---
DATA_BASE_PATH = "../data/m5-forecasting-accuracy/"
SALES_EVALUATION = "sales_train_evaluation.csv"
SALES_VALIDATION = "sales_train_validation.csv"
CALENDAR = "calendar.csv"
SAMPLE_SUBMISSION = "sample_submission.csv"
SELL_PRICES = "sell_prices.csv"

# --- precomputed features ---
PRECOMPUTED_BASE_PATH = "../data/accuracy/features/"


def precomputed_name(store, eval_val):
    """Build the file name ``processed_<store>_<eval_val>.pkl``."""
    return f"processed_{store}_{eval_val}.pkl"


# --- outputs ---
PREDICTION_BASE_PATH = '../data/accuracy/temp_submissions/'
SUBMISSION_BASE_PATH = '../data/accuracy/submissions/'

# length of the forecast window, in days
DAYS: int = 28

# first day index (`d_<n>`) of the forecast window for each split;
# the evaluation window starts right after the validation window
SUB_D_START_VAL: int = 1914
SUB_D_START_EVAL: int = SUB_D_START_VAL + DAYS

# These columns are always present after feature processing because the
# training and submission formats require them; they are dropped before the
# remaining columns are handed to the model.
DROP_FEATURE_COLUMNS = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'sold']

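### Define grid-search functions

`grid_search` trains one LightGBM model per parameter combination and scores each one by its validation RMSE.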

In [12]:
@log_status
def grid_search(params: dict, param_grid: dict, train_data: lgb.Dataset, validation_data: lgb.Dataset):
    """
    Train one LightGBM model per combination in ``param_grid`` and pick the best.

    Args:
        params: base LightGBM parameters shared by every run.
        param_grid: maps a parameter name to the list of values to try; the
            cartesian product of all lists is evaluated. A grid value takes
            precedence over an entry with the same key in ``params``.
        train_data: dataset the models are fitted on.
        validation_data: dataset every combination is scored on.

    Returns:
        Tuple ``(best, results)``. ``results`` maps ``"combination_<i>"``
        (1-based) to a dict with the keys ``"params"``, ``"validation_score"``
        (validation RMSE after the last boosting round) and ``"model"``.
        ``best`` is the entry of ``results`` with the lowest validation score.
    """
    import itertools

    grid_keys = list(param_grid.keys())
    results = {}
    for i, grid_values in enumerate(itertools.product(*param_grid.values()), 1):

        # Base parameters first, grid values last: the value under search must
        # win, otherwise a key present in both dicts would never be varied.
        run_params = {**params, **dict(zip(grid_keys, grid_values))}

        # Collect the per-iteration metrics through the callback API; the
        # `evals_result` keyword of `lgb.train` is not available in LightGBM >= 4.
        eval_history = {}
        mod: lgb.Booster = lgb.train(
            run_params,
            train_set=train_data,
            valid_sets=[validation_data],
            callbacks=[lgb.record_evaluation(eval_history)],
        )

        # "valid_0" is the name LightGBM gives the first unnamed validation set
        results[f"combination_{i}"] = {
            "params": run_params,
            "validation_score": eval_history["valid_0"]["rmse"][-1],
            "model": mod,
        }

    # RMSE: lower is better
    best = min(results.values(), key=lambda entry: entry["validation_score"])
    return best, results

### Quick Run for Testing (Cross Validation)

In [7]:
# Select the split and read its precomputed feature table, passing it
# straight through the project's `_down_cast` helper.
type_of = 'val'
features = _down_cast(
    pd.read_parquet(PRECOMPUTED_BASE_PATH + f'features_{type_of}.parquet')
)

In [13]:
# total ~280 seconds

# Base LightGBM parameters shared by every grid-search run.
# Anything that should be searched belongs in `param_grid` only: a key that
# appears in both dicts is ambiguous (one silently overrides the other).
# `eval_at` is omitted because it only applies to the ndcg/map metrics.
params = {
    'objective': 'tweedie',
    'metric': 'rmse', # Use Root Mean Squared Error (RMSE) as the evaluation metric
    'boosting_type': 'gbdt',
    'random_state': 43,
    'verbose': 1,
    'n_jobs': 4,
    # 'verbose_eval': 0
    'subsample': 0.5,
    'subsample_freq': 1,
    'feature_fraction': 0.5,
    'boost_from_average': False,
}

# Candidate values per parameter; the commented lists are wider alternatives.
param_grid = {
    "num_leaves": [255], #[int(2**i) for i in [5, 6, 8]],
    'min_data_in_leaf': [255], #[int(2**i -1) for i in [5, 6, 8]]
    "learning_rate": [0.05],#[0.04, 0.02, 0.01, 0.005],
    "n_estimators": [45], # [5000, 500, 100]
    "tweedie_variance_power": [1.1], # Set the Tweedie variance power (1 <= p <= 2)
    # 'max_bin': [100]
}

In [4]:

# perform grid search for best parameters, one independent search per group
group_columns: list = ['store_id']
best_by_group: dict = {}  # group key -> best entry returned by grid_search
for group_key, group_features in features.groupby(group_columns):
    # rows without a target cannot be used for fitting or scoring
    group_features = group_features[group_features['sold'].notna()]

    # split data to training and testing: the last 20% of the rows are held
    # out; shuffle=False keeps the existing row order (random_state would be
    # ignored in that case, so it is not passed)
    x_train, x_test, y_train, y_test = train_test_split(
        group_features.drop(DROP_FEATURE_COLUMNS, axis=1),
        group_features['sold'],
        train_size=.8,
        shuffle=False,
    )
    # free the intermediate frames as early as possible
    del group_features
    train_data = lgb.Dataset(x_train, y_train)
    del x_train, y_train
    validation_data = lgb.Dataset(x_test, y_test)
    del x_test, y_test

    best_combination, results = grid_search(params, param_grid, train_data = train_data, validation_data = validation_data)
    best_by_group[group_key] = best_combination
    print(group_key, best_combination)

# # train_best_model
# mod = lgb.train(best_combination["params"],
#     train_set = train_data
# )
# predictions = mod.predict(features_predict)
# df_pred = pd.DataFrame({
#     "prod_id": id_predict,
#     "d": d_list_predict,
#     "sold": predictions
# })

# plot_best_model: bool = True
# if plot_best_model:
#     fig, axs = plt.subplots(3,1, figsize = (25,10))
    
#     # first figure (feature importance)
#     lgb.plot_importance(best_combination["model"], ax = axs[0])
    
#     # second figure (resid)
#     pred = best_combination["model"].predict(x_test)
#     resid = y_test - pred
#     axs[1].plot(resid)
    
#     # third figure (resid)
#     axs[2].hist(resid, bins = 400)
#     axs[2].set_xlim(-25, 25)
    
#     plt.show()

### Train + Predict Submission

In [None]:
# Re-read the feature table of the split currently selected in `type_of`.
features = _down_cast(
    pd.read_parquet(PRECOMPUTED_BASE_PATH + f'features_{type_of}.parquet')
)

In [45]:
type_of = 'val'

# (Re)load the feature table when it is not in memory, e.g. after the
# `del features` at the end of a previous run of this cell.
# NOTE(review): a table that is still in memory is reused as-is, even if it was
# loaded for a different `type_of` — reload manually after switching splits.
if not isinstance(globals().get('features'), pd.DataFrame):
    logger.info('(re)loading features')
    features = pd.read_parquet(PRECOMPUTED_BASE_PATH + f'features_{type_of}.parquet')
    features = _down_cast(features)

# group_columns = ['store_id', 'cat_id'] # 'store_id', 'cat_id', 'dept_id'
# group_columns = ['store_id', 'dept_id']
group_columns = ['store_id']
do_grid_search = False
res: list = []

# feature columns whose first '_'-separated token is listed here are dropped
exclude_prefix_list = ['auto'] # unconditional, auto, momentum, seasonal

# number of days directly before the forecast window that are used for training
TRAIN_DAYS: int = 1460

# Day labels of the training and forecast windows; they do not depend on the
# group, so they are built once instead of once per iteration.
sub_d_start = SUB_D_START_VAL if type_of == 'val' else SUB_D_START_EVAL
train_days = [f'd_{sub_d_start - 1 - i}' for i in range(TRAIN_DAYS)]
pred_days = [f'd_{sub_d_start + i}' for i in range(DAYS)]

# parameters used when no grid search is performed
default_train_params = {
    'objective': 'tweedie',
    'metric': 'rmse', # Use Root Mean Squared Error (RMSE) as the evaluation metric
    'boosting_type': 'gbdt',
    'random_state': 43,
    'verbose': -1,
    'n_jobs': 4,
    'subsample': 0.5,
    'subsample_freq': 1,
    "num_leaves": 2**8-1,
    'min_data_in_leaf': 2**8-1,
    'feature_fraction': 0.5,
    "learning_rate": 0.05,
    "n_estimators": 45,
    # "max_bin": 100,
    'boost_from_average': False,
    "tweedie_variance_power": 1.1, # Set the Tweedie variance power (1 <= p <= 2)
}

for group_key, features_gr in customIter(features.groupby(group_columns)):

    # prepare df: fresh index, excluded feature families removed
    features_gr = features_gr.reset_index(drop=True)
    kept_columns = [c for c in features_gr if c.split('_')[0] not in exclude_prefix_list]
    features_gr = features_gr[kept_columns]

    # training rows need a known target; prediction rows are the forecast window
    train_idx = features_gr['sold'].notna() & features_gr['d'].isin(train_days)
    pred_idx = features_gr['d'].isin(pred_days)
    df_train = features_gr[train_idx]
    df_pred = features_gr[pred_idx]
    features_train: pd.DataFrame = df_train.drop(DROP_FEATURE_COLUMNS, axis = 1)
    targets_train: pd.Series = df_train['sold']
    features_predict: pd.DataFrame = df_pred.drop(DROP_FEATURE_COLUMNS, axis = 1)

    # perform grid search for best parameters
    if do_grid_search:
        # split data to training and testing (last 20% of the rows held out, no shuffling)
        logger.info('divide for cross validation')
        x_train, x_test, y_train, y_test = train_test_split(features_train, targets_train, train_size=.8, shuffle=False)
        train_data = lgb.Dataset(x_train, y_train)
        validation_data = lgb.Dataset(x_test, y_test)
        logger.info('perform gridsearch')
        best_combination, results = grid_search(params, param_grid, train_data = train_data, validation_data = validation_data)
        del train_data, validation_data
        params_grid_train = best_combination["params"]
    else:
        # hand every group its own copy of the defaults
        params_grid_train = dict(default_train_params)

    # train the final model on all training rows of the group and forecast
    mod = lgb.train(params_grid_train,
        train_set = lgb.Dataset(features_train, targets_train)
    )
    predictions = mod.predict(features_predict)
    group_predictions = pd.DataFrame({
        "id": df_pred['id'],
        "d": df_pred['d'],
        "pred": predictions
    })
    res.append(_down_cast(group_predictions))

# release the feature table; the next run of this cell reloads it
del features

# storing predictions (long format: one row per id and day)
# the name `df_sub_val` is kept for both splits; it holds the predictions of `type_of`
df_sub_val = pd.concat(res)
group_names = '_'.join(group_columns)
exclude_names = '_'.join(exclude_prefix_list) if exclude_prefix_list else 'None'
df_sub_val.to_csv(PREDICTION_BASE_PATH + f'lgb_multivariate_{type_of}_non_transposed_{group_names}_exclude_{exclude_names}.csv', index = False)


2023-08-10 23:43:50 - __main__ - INFO - (re)loading features


10 / 10

### Load val + eval prediction files and merge to one submission file

In [55]:
# df_sub_val = pd.read_csv(PREDICTION_BASE_PATH + 'lgb_multivariate_val_non_transposed_temp.csv')
exclude_columns = '_exclude_auto'
# exclude_columns = ''

# Combine the validation predictions of the three grouping variants
# through `ensemble_submissions`.
df_sub_val = ensemble_submissions([
    PREDICTION_BASE_PATH + f'lgb_multivariate_val_non_transposed_store_id_cat_id{exclude_columns}.csv',
    PREDICTION_BASE_PATH + f'lgb_multivariate_val_non_transposed_store_id_dept_id{exclude_columns}.csv',
    PREDICTION_BASE_PATH + f'lgb_multivariate_val_non_transposed_store_id{exclude_columns}.csv',
])

transpose = True
if transpose:
    # long (id, d, pred) -> wide: one row per id, one column per day,
    # then relabel the day columns F1..F<DAYS>
    sub_validation = (
        df_sub_val
        .pivot(index='id', columns='d', values='pred')
        .reset_index(drop=False)
    )
    sub_validation.columns = ["id", *(f"F{i}" for i in range(1, DAYS + 1))]

In [56]:
# df_sub_eval = pd.read_csv(PREDICTION_BASE_PATH + 'lgb_multivariate_eval_non_transposed_temp.csv')
exclude_columns = '_exclude_auto'

# Combine the evaluation predictions of the three grouping variants
# through `ensemble_submissions`.
df_sub_eval = ensemble_submissions([
    PREDICTION_BASE_PATH + f'lgb_multivariate_eval_non_transposed_store_id_cat_id{exclude_columns}.csv',
    PREDICTION_BASE_PATH + f'lgb_multivariate_eval_non_transposed_store_id_dept_id{exclude_columns}.csv',
    PREDICTION_BASE_PATH + f'lgb_multivariate_eval_non_transposed_store_id{exclude_columns}.csv',
])

transpose = True
if transpose:
    # long (id, d, pred) -> wide: one row per id, one column per day,
    # then relabel the day columns F1..F<DAYS>
    sub_evaluation = (
        df_sub_eval
        .pivot(index='id', columns='d', values='pred')
        .reset_index(drop=False)
    )
    sub_evaluation.columns = ["id", *(f"F{i}" for i in range(1, DAYS + 1))]

In [57]:
# sub_evaluation = pd.read_csv('../submissions/submission_baseline_evaluation.csv').drop(['Unnamed: 0'], axis=1)

# Stack the validation rows on top of the evaluation rows, write the result
# as one submission file, then drop both wide frames.
submission_path = SUBMISSION_BASE_PATH + f'submission_lgb_ensemble{exclude_columns}.csv'
pd.concat([sub_validation, sub_evaluation]).to_csv(submission_path, index=False)
del sub_validation, sub_evaluation

### For the validation predictions, we can compute the WRMSSE locally

In [58]:
# Score the long-format validation predictions with the project helper;
# its log output lists one score per aggregation level.
cross_validation_on_validation_set(
    df_sub_val=df_sub_val,
    apply_max_zero=False,
)

2023-08-10 23:50:30 - utils - INFO - reading cross validation template
2023-08-10 23:50:43 - utils - INFO - reading prediction file
2023-08-10 23:50:43 - utils - INFO - merging both files
2023-08-10 23:51:12 - utils - INFO - reading weights file
2023-08-10 23:51:14 - utils - INFO - Level1 - 1.0359742905071732
2023-08-10 23:51:17 - utils - INFO - Level2 - 1.000830124399351
2023-08-10 23:51:20 - utils - INFO - Level3 - 0.9942979228305507
2023-08-10 23:51:23 - utils - INFO - Level4 - 0.999022628513321
2023-08-10 23:51:27 - utils - INFO - Level5 - 1.0373542381165712
2023-08-10 23:51:31 - utils - INFO - Level6 - 0.9802534582278319
2023-08-10 23:51:35 - utils - INFO - Level7 - 1.0044214089143895
2023-08-10 23:51:40 - utils - INFO - Level8 - 0.9717563379210999
2023-08-10 23:51:44 - utils - INFO - Level9 - 0.9787406701482261
2023-08-10 23:51:52 - utils - INFO - Level10 - 1.0464210558544118
2023-08-10 23:52:13 - utils - INFO - Level11 - 0.9649983926241542
2023-08-10 23:53:15 - utils - INFO - Le

### The code below can be used to create the submission template
The submission template lets you quickly insert your predictions.
It also contains all other (historical) sales, so that the WRMSSE can be computed.

In [None]:
# sales_validation = pd.read_csv(DATA_BASE_PATH + SALES_VALIDATION)
# sales_evaluation = pd.read_csv(DATA_BASE_PATH + SALES_EVALUATION)
# calendar = pd.read_csv(DATA_BASE_PATH + CALENDAR)
# sell_prices = pd.read_csv(DATA_BASE_PATH + SELL_PRICES)

# df_val, submission_idx_val = data_preprocessing(sales_validation, calendar, sell_prices)
# del sales_validation
# df_eval, submission_idx_eval = data_preprocessing(sales_evaluation, calendar, sell_prices)
# del sales_evaluation

# df_val_after_release = df_val[(df_val.wm_yr_wk > df_val.release)]# & (df_val["sold"].notna())]
# del df_val
# df_eval_after_release = df_eval[(df_eval.wm_yr_wk > df_eval.release)]# & (df_eval["sold"].notna())]

In [6]:
# # for validation, combine 'training' df with 'prediction' df
# df_sub_merged = pd.merge(
#     df_val_after_release,
#     df_sub_val,
#     how = 'outer',
#     on = ['id', 'd']
# ).reset_index(drop=True)
# df_sub_merged['item_id'] = df_sub_merged['item_id'].fillna(df_sub_merged['id'].apply(lambda x: '_'.join(x.split('_')[0:3])))
# df_sub_merged['dept_id'] = df_sub_merged['dept_id'].fillna(df_sub_merged['id'].apply(lambda x: '_'.join(x.split('_')[0:2])))
# df_sub_merged['cat_id'] = df_sub_merged['cat_id'].fillna(df_sub_merged['id'].apply(lambda x: x.split('_')[0]))
# df_sub_merged['store_id'] = df_sub_merged['store_id'].fillna(df_sub_merged['id'].apply(lambda x: '_'.join(x.split('_')[3:5])))
# df_sub_merged['state_id'] = df_sub_merged['state_id'].fillna(df_sub_merged['id'].apply(lambda x: x.split('_')[3]))

In [7]:
# # merge 'true' sold values on dataframe, even for the 'out of sample' ones and sort for safety again
# df_sub_merged = merge_eval_sold_on_df(df_sub_merged, df_eval = df_eval)
# df_sub_merged = sort_df_on_d(df_sub_merged)
# df_sub_merged.to_csv('../submissions/base_cross_validation_template.csv', index = False)