In [2]:

import os, sys, math, gc
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import seaborn as sns
import lightgbm as lgb
from utils.utils import merge_eval_sold_on_df, sort_df_on_d, WRMSSE, RMSSE, _down_cast, data_preprocessing #create_submission_df
from utils.utils import customIter
from utils.configure_logger import configure_logger
configure_logger()
from logging import getLogger
logger = getLogger(__name__)

import warnings
warnings.simplefilter("ignore")

In [38]:
DATA_BASE_PATH = "../data/m5-forecasting-accuracy/"
SUBMISSION_BASE_PATH = '../data/accuracy/submissions/'
SALES_EVALUATION = "sales_train_evaluation.csv"
SALES_VALIDATION = "sales_train_validation.csv"
CALENDAR = "calendar.csv"
SAMPLE_SUBMISSION = "sample_submission.csv"
SELL_PRICES = "sell_prices.csv"

DAYS: int = 28
D_START_VAL: int = 1914
D_START_EVAL: int = 1914 + 28

In [3]:
# read all data
sales_validation: pd.DataFrame = _down_cast(pd.read_csv(DATA_BASE_PATH + SALES_VALIDATION))
sales_evaluation: pd.DataFrame = _down_cast(pd.read_csv(DATA_BASE_PATH + SALES_EVALUATION))
calendar: pd.DataFrame = _down_cast(pd.read_csv(DATA_BASE_PATH + CALENDAR))
sample_submission: pd.DataFrame = _down_cast(pd.read_csv(DATA_BASE_PATH + SAMPLE_SUBMISSION))
sell_prices: pd.DataFrame = _down_cast(pd.read_csv(DATA_BASE_PATH + SELL_PRICES))

### Feature Engineering and Cross Validation

In [4]:
df_val, submission_idx_val = data_preprocessing(sales_validation, calendar, sell_prices)
del sales_validation
df_eval, submission_idx_eval = data_preprocessing(sales_evaluation, calendar, sell_prices)
del sales_evaluation

In [25]:
df_val_after_release = df_val[(df_val.wm_yr_wk > df_val.release) & (df_val["sold"].notna())]
del df_val
df_eval_after_release = df_eval[(df_eval.wm_yr_wk > df_eval.release) & (df_eval["sold"].notna())]

NameError: name 'df_val' is not defined

In [None]:
# for id in df_val["id"].unique():
#     d

i = 0

import time

m = 10
forecast_periods = 28  # Number of periods to forecast

res = pd.DataFrame(columns=["id", "var", "std", "sold_tot"])
for id, df in df_val_after_release.groupby("id"):

    if i == 0:
        s = time.time()
        
    # df = df.reset_index(drop=True)
    ts_data = df["sold"].values + 1e-2
    
    trend = 'add'  # 'add' or 'mul' for additive or multiplicative trend
    seasonal = 'add'  # 'add' or 'mul' for additive or multiplicative seasonality
    seasonal_periods = 7  # Number of seasonal periods (12 for monthly data)

    # TRAIN
    fitted_model = ExponentialSmoothing(ts_data, seasonal = "add", seasonal_periods = 7).fit()#trend = "add", damped_trend = True)
    
    # EVALUATE
    fitted_values = fitted_model.fittedvalues
    resid_var = np.mean((df["sold"].values - np.array([max(v,0) for v in fitted_values]))**2)
    res.loc[i] = (id, resid_var, np.sqrt(resid_var), df["sold"].sum())

    # FORECAST
    forecast = fitted_model.forecast(forecast_periods)
    
    # plt.figure(figsize=(10, 6))
    # plt.plot(ts_data, label='Original Data')
    # plt.plot(fitted_model.fittedvalues, label='Fitted Values', color='green')
    # plt.plot(range(len(fitted_model.fittedvalues), len(fitted_model.fittedvalues) + forecast_periods), 
    #          forecast, color='red'
    # )

    # plt.legend()
    # plt.xlabel('Date')
    # plt.ylabel('Value')
    # plt.title('Exponential Smoothing with Seasonality')
    # plt.show()
    
    i += 1
    if i == m:
        break
    
logger.info('model time per id: ' + str((time.time() - s) / m))
    
res["std"].mean(), np.sum((res["std"] * res["sold_tot"]) / res["sold_tot"].sum())

0.05427438020706177


(nan, 0)

In [30]:
forecast_periods = 28  # Number of periods to forecast
d_pred = [f'd_{i}' for i in range(D_START_VAL, D_START_VAL+DAYS)]
       
def fit_predict_model(r):
    id, df = r[0], r[1] 
    ts_data = df["sold"].values + 1e-2
    
    trend = 'add'  # 'add' or 'mul' for additive or multiplicative trend
    seasonal = 'add'  # 'add' or 'mul' for additive or multiplicative seasonality
    seasonal_periods = 7  # Number of seasonal periods (12 for monthly data)

    # TRAIN
    fitted_model = ExponentialSmoothing(ts_data, seasonal = "add", seasonal_periods = 7).fit()#trend = "add", damped_trend = True)
    
    # EVALUATE
    # fitted_values = fitted_model.fittedvalues
    # resid_var = np.mean((df["sold"].values - np.array([max(v,0) for v in fitted_values]))**2)
    # res.loc[i] = (id, resid_var, np.sqrt(resid_var), df["sold"].sum())

    # FORECAST
    forecast = fitted_model.forecast(forecast_periods)
    return pd.DataFrame({'id': [id]*DAYS, 'd': d_pred, 'pred': forecast})

In [None]:
t = "val"

if t == "val":
    d_pred = [f'd_{i}' for i in range(D_START_VAL, D_START_VAL+DAYS)]

    r = [fit_predict_model((id, df)) for id, df in customIter(df_val_after_release.groupby("id"))]
    df_sub_val = pd.concat(r)
    logger.info('done')
    # store prediction file without transposed to submission format
    df_sub_val.to_csv(SUBMISSION_BASE_PATH + "submission_baseline_validation_not_transposed.csv", index = False)

    # load untransformed prediction file for VALIDATION
    df_sub_val = pd.read_csv(SUBMISSION_BASE_PATH + "submission_baseline_validation_not_transposed.csv")
    sub_validation = df_sub_val.pivot(index="id", columns="d", values="pred").reset_index(drop=False)
    sub_validation.columns = ["id"] + [f"F{i}" for i,_ in enumerate(range(DAYS),1)]
    sub_validation.to_csv(SUBMISSION_BASE_PATH + 'submission_baseline_validation.csv')
    
elif t == "eval":
    d_pred = [f'd_{i}' for i in range(D_START_EVAL, D_START_EVAL+DAYS)]

    r = [fit_predict_model((id, df)) for id, df in customIter(df_eval_after_release.groupby("id"))]
    df_sub_eval = pd.concat(r)
    logger.info('done')
    df_sub_eval.to_csv(SUBMISSION_BASE_PATH + "submission_baseline_evaluation_not_transposed.csv", index = False)
    
    # load untransformed prediction file for EVALUATION
    df_sub_eval = pd.read_csv(SUBMISSION_BASE_PATH + "submission_baseline_evaluation_not_transposed.csv")
    sub_evaluation = df_sub_eval.pivot(index="id", columns="d", values="pred").reset_index(drop=False)
    sub_evaluation.columns = ["id"] + [f"F{i}" for i,_ in enumerate(range(DAYS),1)]
    sub_evaluation.to_csv(SUBMISSION_BASE_PATH + 'submission_baseline_evaluation.csv')

In [39]:
store_total_result = True
if store_total_result:
    s_v = pd.read_csv(SUBMISSION_BASE_PATH + 'submission_baseline_validation.csv')
    s_e = pd.read_csv(SUBMISSION_BASE_PATH + 'submission_baseline_evaluation.csv')
    pd.concat([sub_validation, sub_evaluation]) \
        .to_csv(SUBMISSION_BASE_PATH + 'submission_baseline_total.csv', index=False)

### Validating WRMSSE for Validation Set

In [46]:
# for validation, combine 'training' df with 'prediction' df
df_sub_merged = pd.merge(
    df_val_after_release,
    df_sub_val,
    how = 'outer',
    on = ['id', 'd']
).reset_index(drop=True)
df_sub_merged['item_id'] = df_sub_merged['item_id'].fillna(df_sub_merged['id'].apply(lambda x: '_'.join(x.split('_')[0:3])))
df_sub_merged['dept_id'] = df_sub_merged['dept_id'].fillna(df_sub_merged['id'].apply(lambda x: '_'.join(x.split('_')[0:2])))
df_sub_merged['cat_id'] = df_sub_merged['cat_id'].fillna(df_sub_merged['id'].apply(lambda x: x.split('_')[0]))
df_sub_merged['store_id'] = df_sub_merged['store_id'].fillna(df_sub_merged['id'].apply(lambda x: '_'.join(x.split('_')[3:5])))
df_sub_merged['state_id'] = df_sub_merged['state_id'].fillna(df_sub_merged['id'].apply(lambda x: x.split('_')[3]))

In [None]:
# merge 'true' sold values on dataframe, even for the 'out of sample' ones and sort for safety again
df_sub_merged = merge_eval_sold_on_df(df_sub_merged, df_eval = df_eval)
df_sub_merged = sort_df_on_d(df_sub_merged)

In [14]:
# compute WRMSSE
df_sub_merged['pred'] = df_sub_merged['pred'].apply(lambda x: max(x,0))
WRMSSE(df_sub_merged)

2023-07-31 19:12:04 - utils - INFO - level: Level1 - RMSSE list: [('Total', 'X', 0.6630178005035814)]
2023-07-31 19:12:04 - utils - INFO - Level1 - 0.6630178005035814
2023-07-31 19:12:06 - utils - INFO - level: Level2 - RMSSE list: [['CA', 'X', 0.4824056931072565], ['TX', 'X', 0.6998223583306447], ['WI', 'X', 0.957890724778171]]
2023-07-31 19:12:06 - utils - INFO - Level2 - 0.6780528415625248
2023-07-31 19:12:08 - utils - INFO - level: Level3 - RMSSE list: [['CA_1', 'X', 0.47438266430856135], ['CA_2', 'X', 0.6614199084301389], ['CA_3', 'X', 0.40983482603555027], ['CA_4', 'X', 0.7698914285999329], ['TX_1', 'X', 0.6254002765839848], ['TX_2', 'X', 0.5845279734108768], ['TX_3', 'X', 0.9450977345740841], ['WI_1', 'X', 0.4823076091795515], ['WI_2', 'X', 1.4590678011604579], ['WI_3', 'X', 0.7948865647705697]]
2023-07-31 19:12:08 - utils - INFO - Level3 - 0.7146229122776653
2023-07-31 19:12:10 - utils - INFO - level: Level4 - RMSSE list: [['FOODS', 'X', 0.7968775583873663], ['HOBBIES', 'X', 0.

0.7757841418071817