In [9]:
import os, sys, math, gc
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle as pkl
from utils.utils import merge_eval_sold_on_df, sort_df_on_d, WRMSSE, RMSSE, _down_cast, data_preprocessing, diff_lists, log_status #create_submission_df
from utils.utils import customIter, parse_columns_to_string
from utils import constants

from utils.configure_logger import configure_logger
configure_logger()
from logging import getLogger
logger = getLogger(__name__)

import warnings
warnings.simplefilter("ignore")

In [10]:
# Notebook configuration: every value is read from utils.constants so that all
# notebooks share the same settings. The trailing comments show the values the
# constants are expected to hold.

# --- input data locations / file names ---
DATA_BASE_PATH = constants.DATA_BASE_PATH #'../data/m5-forecasting-accuracy/'
DATA_BASE_PATH_UNCERTAINTY = constants.DATA_BASE_PATH_UNCERTAINTY #'../data/m5-forecasting-uncertainty/'
SALES_EVALUATION = constants.SALES_EVALUATION #'sales_train_evaluation.csv'
SALES_VALIDATION = constants.SALES_VALIDATION #'sales_train_validation.csv'
CALENDAR = constants.CALENDAR #'calendar.csv'
SAMPLE_SUBMISSION = constants.SAMPLE_SUBMISSION #'sample_submission.csv'
SELL_PRICES = constants.SELL_PRICES #'sell_prices.csv'

# --- location of precomputed features ---
PRECOMPUTED_BASE_PATH = constants.PRECOMPUTED_BASE_PATH #'../data/uncertainty/features/'

# --- forecasting setup ---
DAYS: int = constants.DAYS #28
# QUANTILES is a collection of quantile levels, not a single integer
QUANTILES: list = constants.QUANTILES #[0.005, 0.025, 0.165, 0.25, 0.50, 0.75, 0.835, 0.975, 0.995]

# mapping: aggregation level name -> list of grouping columns (iterated with .items() below)
AGG_LEVEL_COLUMNS = constants.AGG_LEVEL_COLUMNS
# first forecast day of each cross-validation fold
D_CROSS_VAL_START_LIST = constants.D_CROSS_VAL_START_LIST #[1802, 1830, 1858, 1886, 1914]

In [11]:
# Load the raw input tables; each frame is passed through _down_cast right after reading.
# Only the validation sales, the calendar and the sell prices are read here
# (SALES_EVALUATION and SAMPLE_SUBMISSION are not loaded in this cell).
sales_validation: pd.DataFrame = _down_cast(
    pd.read_csv(f"{DATA_BASE_PATH}{SALES_VALIDATION}")
)
calendar: pd.DataFrame = _down_cast(
    pd.read_csv(f"{DATA_BASE_PATH}{CALENDAR}")
)
sell_prices: pd.DataFrame = _down_cast(
    pd.read_csv(f"{DATA_BASE_PATH}{SELL_PRICES}")
)

In [12]:
# aaa = pd.merge(
#     aaaaa,
#     calendar.drop(['weekday', 'date', 'month', 'year', 'wm_yr_wk'],axis=1),
#     on = 'd',
#     how = 'left'
# )

# weekday_average = aaa.groupby('weekday')['sold'].mean()

# for event_name in calendar['event_name_1'].unique():
#     r = aaa[aaa['event_name_1'] == event_name]
#     if len(r)>0:
#         for i, row in r.iterrows():
#             if abs(row['sold'] - weekday_average[row['weekday']]) > 8000 and row['sold'] != 0:
#                 print(row['d'], row['event_name_1'], row['weekday'], row['sold'], weekday_average[row['weekday']])

### Feature Engineering

#### Test Runs

In [13]:
# ~25 seconds
# 1830, 1858, 1886, 1914
def drop_days_after(df, day_threshold):
    """
    Keep only the day columns that lie strictly before ``day_threshold``.

    A "day column" is a column named ``d_<number>`` (e.g. ``d_1913``). Every
    other column is kept unconditionally, including names that merely start
    with ``d`` such as ``d`` or ``d_int``, which previously raised an
    IndexError / ValueError.

    Parameters
    ----------
    df : pd.DataFrame
        Wide frame with one column per day.
    day_threshold : int
        Day columns with a number ``>= day_threshold`` are dropped.

    Returns
    -------
    pd.DataFrame
        ``df`` restricted to the non-day columns (original order) followed by
        the remaining day columns (original order).
    """
    def day_number(column):
        # returns the day number of a `d_<number>` column, None for anything else
        prefix, separator, suffix = str(column).partition('_')
        if prefix == 'd' and separator and suffix.isdecimal():
            return int(suffix)
        return None

    other_columns, day_columns = [], []
    # single pass over the columns instead of two nested scans
    for column in df.columns:
        number = day_number(column)
        if number is None:
            other_columns.append(column)
        elif number < day_threshold:
            day_columns.append(column)
    return df[other_columns + day_columns]

In [14]:
# Small test run: preprocess the first 1,000,000 rows of the validation sales,
# truncated to the days before D_START_VAL.
D_START_VAL = 1914
df, submission_idx = data_preprocessing(
    drop_days_after(sales_validation.iloc[:1_000_000], day_threshold=D_START_VAL),
    calendar,
    sell_prices,
)
# keep only rows after the release week and make sure `id` is a plain string column
df = (
    df.loc[df['wm_yr_wk'] > df['release']]
    .assign(id=lambda frame: frame['id'].astype(str))
)

In [17]:
@log_status
def compute_features(df: pd.DataFrame, group_columns, q: int = None, sparse_features: bool = False, agg_level: str = None, d_start_val: int = None, seed: int = None):
    """
    Compute the model features and the (shifted) target for one aggregation level.

    Features computed (column prefixes):
     - auto_                 - lagged sales, rolling mean/std/quantiles and ewm mean of sales
     - price_uncond_         - mean/std/median of the price per group and day
     - price_momentum_       - price relative to the previous row within the same week/month/year
     - price_auto_           - rolling std of the price
     - seasonal_             - weekday / month / day-of-month dummies
     - sold_pct_nonzero_     - rolling share of non-zero sales days (Level10-Level12 only)
     - state_ / store_       - state or store dummies, depending on `group_columns`
     - days_fwd              - forecasting horizon attached to each row

    Target engineering: each row keeps `sold`, `d`, the seasonal dummies and the
    price momentum/std columns of its own day, while all remaining features are
    taken from the day `days_fwd` days earlier. `days_fwd` is random in
    [0, DAYS) except for the days d_start_val .. d_start_val+DAYS-1, where it is
    fixed to 0 .. DAYS-1.

    Parameters
    ----------
    df : pd.DataFrame
        One row per group and day. The code reads the columns `date`, `d`,
        `sold`, `sell_price`, `weekday`, `month`, `year`, `wm_yr_wk` and all
        `group_columns`.
    group_columns : list
        Columns identifying one time series.
    q : int, optional
        Unused; kept for backward compatibility.
    sparse_features : bool
        If True, the direct lags are skipped and rolling features are only
        computed for the 28-day window.
    agg_level : str, optional
        Aggregation level name; 'Level10', 'Level11' and 'Level12' additionally
        get the `sold_pct_nonzero_` features.
    d_start_val : int, optional
        First day of the forecasting period. Defaults to the notebook-level
        `D_START_VAL` so existing calls behave as before.
    seed : int, optional
        Seed for the random forecasting horizons. If None, the global numpy
        random state is used (previous behaviour).

    Returns
    -------
    pd.DataFrame
        The identifier, target and feature columns, passed through `_down_cast`.
    """
    if d_start_val is None:
        # backward compatible: fall back on the notebook global
        d_start_val = D_START_VAL

    def added_columns(frame, known_columns):
        # columns created since the `known_columns` snapshot, in frame order
        # (a plain set difference would make the output column order vary between runs)
        return [c for c in frame.columns if c not in known_columns]

    # drop all NaT dates; copy so the column assignments below act on an own frame
    df = df[df['date'].notna()].copy()

    feature_columns = [
        c for c in ['id', 'state_id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'sold']
        if c in df
    ]

    # to be sure
    df['month'] = df['month'].astype(int)

    # precomputing: integer day number parsed from 'd_<number>'
    df["d_int"] = df["d"].apply(lambda x: int(x.split("_")[-1]))

    # to use groupby only once, saves time
    df_grouped = df.groupby(group_columns)

    ################################################
    ############### AUTOCORRELATION ################
    ################################################
    PREFIX = 'auto_'
    logger.info('Computing autocorrelation features')
    # all rolling statistics are computed on sales shifted by LAG_SHIFT days
    LAG_SHIFT: int = 1

    # DIRECT LAGGED VALUES (skipped entirely for sparse features)
    old_columns = set(df.columns)
    if not sparse_features:
        for lag in [1, 2, 7, 14, 28, 56]:
            df[PREFIX + f"sold_{lag}"] = df_grouped["sold"].shift(lag)
    feature_columns += added_columns(df, old_columns)

    # EWM/MA/STD/QUANTILE
    old_columns = set(df.columns)
    for i in [3,7,14,21,28,56,112,168]:
        if sparse_features and i != 28: continue
        min_periods = int(np.ceil(i ** 0.8))

        df[PREFIX + f'sold_ma_{i}'] = df_grouped['sold'].transform(lambda x: x.shift(LAG_SHIFT).rolling(i, min_periods).mean()).astype(np.float16)
        df[PREFIX + f'sold_std_{i}'] = df_grouped['sold'].transform(lambda x: x.shift(LAG_SHIFT).rolling(i, min_periods).std()).astype(np.float16)
        df[PREFIX + f'sold_ewm_{i}'] = df_grouped['sold']\
            .transform(lambda x: x.shift(LAG_SHIFT)
            .ewm(span=i, min_periods = min_periods)
            .mean())\
            .astype(np.float16)

        for quantile in [0.01, 0.1, 0.25, .5, 0.75, 0.9, 0.99]:#QUANTILES:
            # only compute a quantile if the window holds at least one observation in its tail
            if i * min(quantile, 1-quantile) >= 1:
                df[PREFIX + f'sold_qtile_{i}_{quantile}'] = df_grouped['sold'].transform(lambda x: x.shift(LAG_SHIFT).rolling(i, min_periods).quantile(quantile)).astype(np.float16)
    feature_columns += added_columns(df, old_columns)

    ###############################################
    ############ PRICE AUTOCORRELATION ############ ## ONLY FOR LOWEST AGGREGATION LEVEL USABLE?
    ###############################################
    logger.info('Computing price autocorrelation features')

    # SIMPLY THE CURRENT PRICE
    # Level1 is univariate, unconditional features would not make sense
    if 'temp_id' not in group_columns:
        old_columns = set(df.columns)
        df_grouped_d = df.groupby(group_columns + ['d'])
        df['price_uncond_avg'] = df_grouped_d['sell_price'].transform(lambda x: x.mean()).astype(np.float32)
        df['price_uncond_std'] = df_grouped_d['sell_price'].transform(lambda x: x.std()).astype(np.float32)
        df['price_uncond_median'] = df_grouped_d['sell_price'].transform(lambda x: x.median()).astype(np.float32)
        feature_columns += added_columns(df, old_columns)

    # type issues
    for c in ['wm_yr_wk', 'year', 'month']:
        df[c] = df[c].astype('int32')

    # PRICE DIFFERENCES BY WEEK/MONTH/YEAR (USUALLY ONE)
    PREFIX = 'price_momentum_'
    old_columns = set(df.columns)
    for suffix, period_columns in [('w', ['wm_yr_wk']), ('m', ['year', 'month']), ('y', ['year'])]:
        # price of the previous row within the same group and period
        previous_price = df.groupby(group_columns + period_columns)['sell_price'].shift(LAG_SHIFT).astype(np.float32)
        df[PREFIX + suffix] = df['sell_price'] / previous_price
    feature_columns += added_columns(df, old_columns)

    # VARIATION OF PRICES
    PREFIX = 'price_auto_'
    old_columns = set(df.columns)
    for i in [28, 56, 112]:
        df[PREFIX + f'std_{int(i)}'] = df_grouped['sell_price'].transform(lambda x: x.rolling(i).std()).astype(np.float16)
    feature_columns += added_columns(df, old_columns)

    ################################################
    ############## SEASONAL FEATURES ###############
    ################################################
    logger.info('Encoding date features to dummies')
    # WEEKDAY / MONTH / DAY-IN-MONTH DUMMIES
    df['monthday'] = df['date'].dt.day.astype(int)
    for column, prefix in [
        ('weekday', 'seasonal_weekday_'),
        ('month', 'seasonal_month_'),
        ('monthday', 'seasonal_monthday_'),
    ]:
        old_columns = set(df.columns)
        df = pd.get_dummies(df, columns = [column], prefix=prefix, prefix_sep='')
        feature_columns += added_columns(df, old_columns)

    ################################################
    ################ OTHER FEATURES ################
    ################################################
    if agg_level in ['Level10', 'Level11', 'Level12']:
        logger.info('Computing PCT of non-zero days')
        old_columns = set(df.columns)
        for i in [7, 14, 28, 28*2, 28*4]:
            if sparse_features and i != 28: continue
            # df_grouped was built before get_dummies; the result is aligned on the index
            df[f'sold_pct_nonzero_{i}'] = df_grouped['sold'].transform(lambda x: (x!=0).rolling(i).mean().shift(LAG_SHIFT)).astype(np.float16)
        feature_columns += added_columns(df, old_columns)

    ################################################
    ############### STATE/STORE/CAT ################
    ################################################
    df = _down_cast(df)

    logger.info('Computing state-id dummy')
    old_columns = set(df.columns)

    if 'state_id' in group_columns:
        # add state dummy, but keep the original column (it is a grouping key)
        state_ids = df['state_id']
        df = pd.get_dummies(df, columns = ['state_id'], prefix='state_', prefix_sep='', )
        df['state_id'] = state_ids
    if ('store_id' in group_columns and 'item_id' in group_columns):
        # derive the state from the first part of the store id and encode it
        df['state_id'] = df['store_id'].str.split('_').apply(lambda x: x[0])
        df = pd.get_dummies(df, columns = ['state_id'], prefix='state_', prefix_sep='', )
    elif 'store_id' in group_columns:
        # add store dummy, but keep the original column (it is a grouping key)
        store_ids = df['store_id']
        df = pd.get_dummies(df, columns = ['store_id'], prefix='store_', prefix_sep='', )
        df['store_id'] = store_ids

    feature_columns += added_columns(df, old_columns)

    ################################################
    ############## TARGET ENGINEERING ##############
    ################################################
    # the forecasting period is 28 days. We do not want to do recursive forecasting or something similar.
    # Instead, we pass the days of forecasting as a feature in the model
    # For the training data, this forecasting period is randomized
    # For the validation and evaluation period, these are of course fixed

    # day number -> fixed horizon for the forecasting period
    mapping_dict = {int(d_start_val+i):i for i in range(DAYS)}

    # add random forecasting periods
    if seed is None:
        days_forecast = np.random.randint(low = 0, high=DAYS, size = len(df))
    else:
        days_forecast = np.random.default_rng(seed).integers(low = 0, high=DAYS, size = len(df))

    # create df with solely the target, date and the columns known for the target day
    columns_temp = ['sold', 'd']\
        + ['seasonal_weekday_' + day for day in ['Monday', 'Tuesday', 'Thursday', 'Wednesday', 'Friday', 'Saturday', 'Sunday']]\
        + ['seasonal_month_' + str(m) for m in range(1,12+1)]\
        + ['seasonal_monthday_' + str(i) for i in range(1,31+1)]\
        + ['price_momentum_'+i for i in ['w', 'm', 'y']]\
        + ['price_auto_std_'+str(i) for i in [28, 56, 112]]

    df_temp: pd.DataFrame = df[columns_temp + group_columns + ['d_int']].copy()
    df_temp['d_int'] = df_temp['d_int'].astype(int)
    # for the validation set, we know the forecasting period: map these increasingly
    # from 0 to DAYS-1, all other days keep their random horizon (vectorised, positional)
    known_horizon = df_temp['d_int'].map(mapping_dict)
    df_temp['days_fwd'] = np.where(known_horizon.notna(), known_horizon, days_forecast).astype(int)
    # move each target row back to the day its features are taken from
    df_temp['d_int'] -= df_temp['days_fwd']
    df_temp['d_int'] = df_temp['d_int'].astype(int)
    df_temp['d'] = 'd_' + df_temp['d_int'].astype(str)
    feature_columns += ['days_fwd']

    # merge shifted sales back
    df = df.drop(columns_temp, axis=1)
    df = pd.merge(
        df,
        df_temp,
        on = ['d_int'] + group_columns,
        how = 'right'
    )
    # add DAYS back to d_int and d
    df['d_int'] += df['days_fwd']
    df['d'] = 'd_' + df['d_int'].astype(str)

    # drop invalid cases of forecasting windows (i.e. negative d)
    df = df[df['d_int'] > DAYS]

    ############# RETURN FINAL RESULTS ############
    df = _down_cast(df)
    return df[feature_columns]

### Simple test to check that feature engineering works

In [13]:
# total ~77 seconds
# Smoke test: aggregate the test-run frame for one level, build its features and display them.
D_START_VAL = 1914
level = 'Level1'
df['temp_id'] = 'temp_id'
# agg_columns = ['cat_id', 'state_id']
# Level1 is grouped on the constant `temp_id` column, every other level on its configured columns
if level == 'Level1':
    agg_columns = ['temp_id']
else:
    agg_columns = AGG_LEVEL_COLUMNS[level]
# how each column is combined when rows are aggregated per group and day
agg_dict = dict(
    sold=np.nansum,
    sell_price=np.nanmean,
    date='last',
    weekday='last',
    month='last',
    year='last',
    wm_yr_wk='last',
)
features = compute_features(
    df.groupby(agg_columns + ['d']).agg(agg_dict).reset_index(),
    agg_columns,
    agg_level=level,
    sparse_features=False,
)
# features[features['d'].isin([f'd_{int(i)}' for i in range(D_START_VAL, D_START_VAL+DAYS)])]
features

NameError: name 'df' is not defined

In [14]:
# Sanity check on the prediction period: every day D_START_VAL+offset must carry
# exactly one `days_fwd` value, and that value must equal the offset (0 .. DAYS-1).
for offset in range(DAYS):
    day_label = f'd_{int(D_START_VAL+offset)}'
    horizons = features.loc[features['d'] == day_label, 'days_fwd'].unique()
    assert len(horizons)==1
    assert horizons[0]==offset

NameError: name 'features' is not defined

### Final Runs Feature Engineering for Validation and Evaluation

In [15]:
@log_status
def groupby_agglevel(df: pd.DataFrame, agg_columns: list, agg_dict: dict):
    """
    Aggregate `df` per combination of `agg_columns`.

    `agg_dict` maps each column to its aggregation; the grouping keys are
    returned as regular columns instead of as the index.
    """
    aggregated = df.groupby(agg_columns).agg(agg_dict)
    return aggregated.reset_index(drop=False)

In [20]:
# params for this run
SPARSE_FEATURES = False
TEST_RUN = False
MAX_QUANTILE = 3
# MAX_QUANTILE = 12 - MAX_QUANTILE

# the MAX_QUANTILE most granular levels (Level12, Level11, ...) are not processed
# remove entries to compute all levels
SKIPPED_LEVELS = {f'Level{int(12 - i)}' for i in range(0, MAX_QUANTILE)}

# how each column is combined when the sales are aggregated to a level
agg_dict = {
    'sold': np.nansum,
    'sell_price': np.nanmean,
    'date': 'last',
    'weekday': 'last',
    'month': 'last',
    'year': 'last',
    'wm_yr_wk': 'last'
}

# for each fold
# 1830, 1858, 1886, 1914
for D_START_VAL in D_CROSS_VAL_START_LIST:
    # NOTE: D_START_VAL is deliberately a notebook-level name, compute_features reads it
    logger.info(f'D_START_VAL: {D_START_VAL}')
    fold_path = f'../data/uncertainty/fold_{int(D_START_VAL)}'

    # report which grouped dfs are still missing (cheap existence check instead of
    # reading every parquet file; only the levels that are actually processed matter)
    all_found = True
    for agg_level in AGG_LEVEL_COLUMNS:
        if agg_level in SKIPPED_LEVELS:
            continue
        if not os.path.exists(f'{fold_path}/grouped/grouped_{agg_level}.parquet'):
            all_found = False
            logger.info(f'grouped df not computed for level: {agg_level}')
    if all_found:
        logger.info('grouped df computed for all levels, original dataframe is not loaded')

    # the expensive pivoted frame is only built when a level really needs it
    df_val_after_release = None

    # for each level
    for agg_level, agg_columns in AGG_LEVEL_COLUMNS.items():
        if agg_level in SKIPPED_LEVELS:
            continue

        # a level without grouping columns is grouped on a constant helper column
        if len(agg_columns) == 0:
            agg_columns = ['temp_id']

        # load grouped df if already exists
        grouped_path = f'{fold_path}/grouped/grouped_{agg_level}.parquet'
        try:
            grouped_df = _down_cast(pd.read_parquet(grouped_path))
        except Exception:
            # missing or unreadable file: recompute below (Ctrl-C is no longer swallowed)
            grouped_df = None

        if grouped_df is None:
            logger.info(f'not existing yet: {grouped_path}')
            logger.info(f'computing and storing grouped dataframe for level: {agg_level}')

            if df_val_after_release is None:
                # pivot initial dataframe and compute features/targets
                df_val, submission_idx_validation = data_preprocessing(
                    drop_days_after(sales_validation,
                    day_threshold = D_START_VAL),
                    calendar,
                    sell_prices
                )
                # drop all leading rows with leading zeros for each product
                df_val_after_release = df_val[(df_val.wm_yr_wk > df_val.release)]
                del df_val

            # get data on aggregated level
            if 'temp_id' in agg_columns:
                df_val_after_release['temp_id'] = 'temp_id'

            grouped_df = groupby_agglevel(df_val_after_release, agg_columns + ['d'], agg_dict)
            grouped_df = grouped_df[grouped_df['date'].notna()].copy()
            # to suitable type for .parquet
            keep_dtype_columns = agg_columns + ['d','date','weekday']
            for c in grouped_df.columns:
                if c not in keep_dtype_columns:
                    grouped_df[c] = grouped_df[c].astype(np.float32)
            # make sure the target directory exists before writing
            os.makedirs(os.path.dirname(grouped_path), exist_ok=True)
            grouped_df.to_parquet(grouped_path, index=False)
            grouped_df = _down_cast(grouped_df)

        # compute all features
        features = compute_features(
            grouped_df,
            agg_columns,
            sparse_features=SPARSE_FEATURES,
            agg_level=agg_level
        )

        # to suitable type for .parquet
        for c in features.columns:
            if c not in ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'state_id']:
                features[c] = features[c].astype(np.float32)

        # format string and save file (in a test run nothing is written and `features` is kept)
        agg_string = parse_columns_to_string(agg_columns)
        if not TEST_RUN:
            features_path = f'{fold_path}/features/' + f'features_val_{agg_string}.parquet'
            os.makedirs(os.path.dirname(features_path), exist_ok=True)
            features.to_parquet(features_path, index=False)
            del features

    # free the large frame before the next fold
    if not TEST_RUN:
        del df_val_after_release

2023-11-29 09:26:35 - __main__ - INFO - D_START_VAL: 1802
2023-11-29 09:26:38 - __main__ - INFO - grouped df computed for all levels, original dataframe is not loaded
2023-11-29 09:26:38 - compute_features - INFO - calling
2023-11-29 09:26:38 - __main__ - INFO - Computing autocorrelation features
2023-11-29 09:26:38 - __main__ - INFO - Computing price autocorrelation features
2023-11-29 09:26:38 - __main__ - INFO - Computing unconditional sold values
2023-11-29 09:26:38 - __main__ - INFO - Encoding date features to dummies
2023-11-29 09:26:38 - __main__ - INFO - Computing state-id dummy
2023-11-29 09:26:38 - compute_features - INFO - calling
2023-11-29 09:26:38 - __main__ - INFO - Computing autocorrelation features
2023-11-29 09:26:38 - __main__ - INFO - Computing price autocorrelation features
2023-11-29 09:26:39 - __main__ - INFO - Computing unconditional sold values
2023-11-29 09:26:39 - __main__ - INFO - Encoding date features to dummies
2023-11-29 09:26:39 - __main__ - INFO - Comp

In [17]:
# # pivot initial dataframe and compute features/targets
# df_eval, submission_idx_validation = data_preprocessing(sales_evaluation, calendar, sell_prices)
# df_eval_after_release = df_eval[(df_eval.wm_yr_wk > df_eval.release)]
# del df_eval

# # set prediction values to nan values
# pred_index = df_eval_after_release['d'].isin(D_CV_OOS)
# df_eval_after_release.loc[pred_index, 'sold'] = np.nan
# del pred_index

In [43]:
# TEST_RUN = False
# for agg_level in AGG_LEVEL_COLUMNS:
#     agg_columns = AGG_LEVEL_COLUMNS[agg_level]
#     # get data on aggregated level
#     if len(agg_columns) == 0:
#         df_eval_after_release['temp_id'] = 'temp_id'
#         agg_columns = ['temp_id']
#     agg_dict = {
#         'sold': np.nansum,
#         'date': 'last',
#         'weekday': 'last',
#         'month': 'last'
#     }
#     features = compute_features(
#         groupby_agglevel(df_eval_after_release, agg_columns + ['d'], agg_dict),
#         agg_columns
#     )
    
#     # to suitable format for .parquet
#     for c in features.columns:
#         if c not in ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'state_id']:
#             features[c] = features[c].astype(np.float32)
        
#     # format string and save file
#     agg_string = parse_columns_to_string(agg_columns)
#     if not TEST_RUN:
#         features.to_parquet(PRECOMPUTED_BASE_PATH + f'features_eval_{agg_string}.parquet', index=False)
#         del features
#     else:
#         features.to_parquet(PRECOMPUTED_BASE_PATH + f'/test/features_eval_{agg_string}.parquet', index=False)
        
# if not TEST_RUN: 
#     del df_eval_after_release