In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.utils import _down_cast, data_preprocessing, diff_lists, log_status, ensemble_submissions_uncertainty
from utils.metrics import WSPL, DM_test_pinball
from utils.configure_logger import configure_logger
from utils.utils import prefixes_in_column
from utils import constants

# Apply the project's logging configuration before creating the notebook logger.
# NOTE(review): configure_logger is a project helper; presumably it installs the
# handler/format seen in the cell outputs - confirm in utils.configure_logger.
configure_logger()
from logging import getLogger
logger = getLogger(__name__)

# NOTE: simplefilter("ignore") without a category suppresses every warning for the
# rest of the kernel session, including deprecation warnings from pandas/numpy.
import warnings
warnings.simplefilter("ignore")

In [2]:
# ---------------------------------------------------------------------------
# Notebook-level aliases for values defined in utils.constants.
# Trailing comments show example values copied from the original notebook.
# ---------------------------------------------------------------------------

# raw input data
DATA_BASE_PATH = constants.DATA_BASE_PATH
DATA_BASE_PATH_UNCERTAINTY = constants.DATA_BASE_PATH_UNCERTAINTY
SALES_EVALUATION = constants.SALES_EVALUATION
SALES_VALIDATION = constants.SALES_VALIDATION
CALENDAR = constants.CALENDAR
SAMPLE_SUBMISSION = constants.SAMPLE_SUBMISSION
SELL_PRICES = constants.SELL_PRICES

# precomputed features
PRECOMPUTED_BASE_PATH = constants.PRECOMPUTED_BASE_PATH  # '../data/uncertainty/features/'


def precomputed_name(store, eval_val):
    """Return the file name of the precomputed feature pickle for `store` / `eval_val`."""
    return f'processed_{store}_{eval_val}.pkl'


# forecast setup
# NOTE(review): the `int` annotation on QUANTILES is kept from the original;
# confirm it matches the actual type of constants.QUANTILES.
DAYS: int = constants.DAYS  # 28
QUANTILES: int = constants.QUANTILES

# aggregation levels and cross-validation folds
AGG_LEVEL_COLUMNS = constants.AGG_LEVEL_COLUMNS
D_CROSS_VAL_START_LIST = constants.D_CROSS_VAL_START_LIST

# output locations
TEST_PATH = constants.TEST_PATH  # 'test/'
PREDICTION_BASE_PATH = constants.PREDICTION_BASE_PATH
SUBMISSION_BASE_PATH = constants.SUBMISSION_BASE_PATH

# first day of the validation / evaluation submission windows
SUB_D_START_VAL: int = constants.SUB_D_START_VAL
SUB_D_START_EVAL: int = constants.SUB_D_START_EVAL

# These columns are always present after feature processing because the
# training and submission formats require them.
DROP_FEATURE_COLUMNS: list = constants.DROP_FEATURE_COLUMNS  # ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'sold']

In [3]:
def read_concat_predictions(fold_name: int, exclude_columns: list = [], include_columns: list = [], sparse = False, use_all = False, load_submissions_path: str = 'temp_submissions/'):
    """
    Build the prediction file path of one fold for each aggregation level and
    pass the list of paths to ``ensemble_submissions_uncertainty``.

    Parameters
    ----------
    fold_name : int
        Fold identifier; used in the directory name ``fold_<fold_name>``.
    exclude_columns : list
        Column names joined with ``_`` into the ``exclude_<...>.csv`` suffix.
        Only used when ``include_columns`` is ``None``; an empty list yields
        ``exclude_None.csv``. The default list is never mutated.
    include_columns : list or None
        Column names joined with ``_`` into the ``include_<...>.csv`` suffix.
        Pass ``None`` to select the ``exclude_...`` files instead. The default
        list is never mutated.
    sparse : bool
        Unused; kept for backward compatibility with existing callers.
    use_all : bool
        When true, the ``use_all.csv`` files are selected and both column
        arguments are ignored.
    load_submissions_path : str
        Sub-directory (with trailing slash) inside the fold directory.

    Returns
    -------
    Whatever ``ensemble_submissions_uncertainty`` returns for the list of paths.

    Raises
    ------
    TypeError
        If ``include_columns`` is neither ``None`` nor a list/tuple (and
        ``use_all`` is false). Previously this produced a path without a
        suffix that could never exist.
    """
    # The suffix does not depend on the aggregation level, so resolve it once.
    if use_all:
        suffix = 'use_all.csv'
    elif include_columns is None:
        # Join the excluded names exactly once. The previous implementation
        # joined the already-joined string again, character by character,
        # yielding names such as 'exclude_N_o_n_e.csv'.
        # NOTE(review): if files on disk were written with the doubled join,
        # they have to be renamed or regenerated.
        excluded = '_'.join(exclude_columns or [])
        suffix = f'exclude_{excluded or "None"}.csv'
    elif isinstance(include_columns, (list, tuple)):
        suffix = f'include_{"_".join(include_columns)}.csv'
    else:
        raise TypeError(
            'include_columns must be a list of column names or None, '
            f'got {type(include_columns).__name__}'
        )

    fold_dir = f'../data/uncertainty/fold_{fold_name}/' + load_submissions_path
    logger.info('loading files under path:' + fold_dir)

    # Only the aggregation levels at positions [0, 9) of AGG_LEVEL_COLUMNS are used.
    first_level_idx, last_level_idx = 0, 9
    levels = list(AGG_LEVEL_COLUMNS.keys())[first_level_idx:last_level_idx]

    file_paths: list = []
    for level in levels:
        # A level without grouping columns is stored under the name 'Total_X'.
        group_names = '_'.join(AGG_LEVEL_COLUMNS[level]) or 'Total_X'
        file_paths.append(f'{fold_dir}lgb_val_nt_{group_names}_{suffix}')

    return ensemble_submissions_uncertainty(file_paths)

In [4]:
def _add_merge_keys(df_sub: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a submission frame with `id_merge` and `quantile` added.

    Both values are parsed from the `id` column, e.g.
    'CA_1_FOODS.0.005_validation' -> id_merge 'CA_1_FOODS', quantile 0.005.
    A malformed id (no '.' separators) raises instead of yielding NaN.
    """
    def _series_id(submission_id: str) -> str:
        # everything before the first '.'
        return submission_id.split('.')[0]

    def _quantile(submission_id: str) -> float:
        # the last two '.'-separated parts hold the quantile; the final part
        # still carries a '_<suffix>' that has to be stripped
        parts = submission_id.split('.')
        return float('.'.join([parts[-2], parts[-1].split('_')[0]]))

    ids = df_sub['id']
    # assign() returns a new frame, so the caller's frame is left untouched
    return df_sub.assign(
        id_merge=ids.apply(_series_id),
        quantile=ids.apply(_quantile),
    )


def perform_cv(df: pd.DataFrame, df_sub1: pd.DataFrame, df_sub2: pd.DataFrame):
    """
    Join two submissions with each other and with the frame of true values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with the true values; must contain `id_merge`, `d`, `sold`
        and `revenue`.
    df_sub1, df_sub2 : pd.DataFrame
        Submissions with at least the columns `id` and `d`. They are not
        modified (earlier versions added `id_merge` / `quantile` in place).

    Returns
    -------
    pd.DataFrame
        One row per (id, d) present in both submissions. Columns that exist
        in both submissions besides the merge keys get the pandas default
        suffixes `_x` (df_sub1) and `_y` (df_sub2). `sold` and `revenue` are
        cast to float32 and `id_merge` to str.
    """
    # derive the merge keys on copies of the submissions
    sub1 = _add_merge_keys(df_sub1)
    sub2 = _add_merge_keys(df_sub2)

    # keep only the predictions available in both submissions
    df_sub = pd.merge(
        sub1,
        sub2,
        how = 'inner',
        on = ['d', 'id_merge', 'quantile', 'id']
    )

    # merge predictions in cv template; the right join keeps every prediction
    # row even when `df` has no matching true value
    p = pd.merge(
        df,
        df_sub,
        how='right',
        on=['id_merge', 'd',]
    )
    p['id_merge'] = p['id_merge'].astype(str)

    for c in ['sold', 'revenue']:
        p[c] = p[c].astype(np.float32)
    return p

In [5]:
# define experiments
# Experiment definitions, keyed by a human-readable experiment name.
# Each experiment holds:
#   "BASE":                 column names shared by every variant of the experiment
#   "INCLUDE_COLUMNS_LIST": one list of additional column names per variant
# The selection cell builds the columns of a variant as BASE + entry, so an
# empty entry ([]) means "BASE only". The resulting list is passed as
# `include_columns` to read_concat_predictions, where the names are joined with
# "_" into the prediction file name - the order of the names therefore matters.
# NOTE(review): the names look like feature-group prefixes (e.g. 'auto_sold_ma');
# confirm against the training code that wrote the prediction files.
EXPERIMENTS_DICT = {
    # effect of the seasonal feature groups on top of 'auto_sold_ewm'
    "seasonal": {
        "BASE": [],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ewm'],
            ['seasonal_weekday','auto_sold_ewm'],
            ['seasonal_monthday','auto_sold_ewm'],
            ['seasonal_weekday','seasonal_monthday','auto_sold_ewm'],
            ['seasonal','auto_sold_ewm'],
        ]
    },
    # adding 'state_id' and/or 'store_id'
    "state vs. store": {
        "BASE": ['seasonal', 'auto_sold_ma'],
        "INCLUDE_COLUMNS_LIST": [
            [],
            ['state_id',],
            ['store_id',],
            ['state_id', 'store_id']
        ]
    },
    # 'auto_sold_ewm' vs. 'auto_sold_ma' vs. both
    "ewm vs. ma": {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ewm'],
            ['auto_sold_ma'],
            ['auto_sold_ewm', 'auto_sold_ma'],
        ]
    },
    # 'auto_sold_qtile' vs. 'auto_sold_std' vs. both
    "quantiles vs. std": {
        "BASE": ['seasonal', 'auto_sold_ma'],
        "INCLUDE_COLUMNS_LIST": [
            [],
            ['auto_sold_qtile'],
            ['auto_sold_std'],
            ['auto_sold_qtile','auto_sold_std'],   
        ]
    },
    # price feature groups, individually and combined
    "price auto/momentum": {
        "BASE": ['seasonal', 'auto_sold_ma'],
        "INCLUDE_COLUMNS_LIST": [
            [],
            ['price_auto_std'],
            ['price_momentum'],
            ['price_uncond'],
            ['price_auto_std', 'price_momentum'],
            ['price_auto_std', 'price_momentum', 'price_uncond']
        ]
    },
    # two candidate configurations, differing only in 'auto_sold_std'
    "best models": {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ma', 'state_id', 'store_id'],
            ['auto_sold_ma', 'auto_sold_std', 'state_id', 'store_id'],
        ]
    },
    # first variant is identical to the first variant of 'full vs. sparse' below
    "full vs. sparse ma" : {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ma', 'auto_sold_std', 'auto_sold_qtile', 'auto_sold_ewm', 'state_id', 'store_id'],
            ['auto_sold_ma_28', 'auto_sold_ma_56', 'auto_sold_ma_168', 'state_id', 'store_id']
        ]
    },
    # second variant lists the same five columns as the last variant of
    # 'full vs. sparse' (there 'state_id'/'store_id' are part of the entry, not BASE)
    "sparse vs. kbest": {
        "BASE": ['seasonal', 'state_id', 'store_id'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold', 'price', 'kbest'],
            ['auto_sold_ewm_112', 'auto_sold_ewm_28',
             'auto_sold_qtile_28_0.5', 'auto_sold_ma_28', 
             'auto_sold_qtile_28_0.9',],
        ]
    },
    # full feature groups vs. two hand-picked sparse column sets
    'full vs. sparse': {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ma', 'auto_sold_std', 'auto_sold_qtile', 'auto_sold_ewm', 'state_id', 'store_id'],
            # ['auto_sold_ma_28', 'auto_sold_ma_56', 'auto_sold_ma_168', 'state_id', 'store_id'],
            ['auto_sold_std_3', 'auto_sold_std_56', 'auto_sold_std_168', 
            'auto_sold_ma_7',  'auto_sold_ma_28', 'auto_sold_ma_56', 
            'auto_sold_qtile_28_0.25', 'auto_sold_qtile_168_0.25', 'auto_sold_qtile_56_0.1', 
            'state_id', 'store_id'],
            ['state_id', 'store_id', 'auto_sold_ewm_112', 'auto_sold_ewm_28',
             'auto_sold_qtile_28_0.5', 'auto_sold_ma_28', 
             'auto_sold_qtile_28_0.9',],
        ]
    },
}

In [6]:
# Pick the experiment to analyse and expand each variant to its full column list
# (the experiment's BASE columns followed by the variant's own columns).
experiment_spec = EXPERIMENTS_DICT['seasonal']
base = experiment_spec['BASE']
include_columns_list = []
for experiment in experiment_spec['INCLUDE_COLUMNS_LIST']:
    include_columns_list += [base + experiment]


def _load_all_folds(columns):
    """Load the predictions for `columns` from every CV fold and stack them."""
    per_fold = []
    for d_cv_start in D_CROSS_VAL_START_LIST:
        per_fold.append(
            read_concat_predictions(d_cv_start, exclude_columns=[], include_columns=columns)
        )
    return pd.concat(per_fold).reset_index(drop=True)


# The two submissions to compare: first and fifth variant of the experiment.
include_columns = include_columns_list[0]
sub1 = _load_all_folds(include_columns)

include_columns = include_columns_list[4]
sub2 = _load_all_folds(include_columns)

2023-12-23 16:26:57 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-23 16:26:57 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-23 16:26:57 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-23 16:26:57 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/temp_submissions/
2023-12-23 16:26:57 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/
2023-12-23 16:26:57 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-23 16:26:58 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-23 16:26:58 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-23 16:26:58 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/t

In [7]:
# Load the true sales values (`d`) and the integer day index (`d_int`).
# Both variables are used by later cells.
# Loading takes about 2-3 minutes, so it is skipped when both variables already
# exist in the kernel; set FORCE_RELOAD = True to reload anyway.
FORCE_RELOAD = False

CV_TEMPLATE_PATH = '../data/uncertainty/cv_template/temp.parquet'
CV_TEMPLATE_D_INT_PATH = '../data/uncertainty/cv_template/temp_d_int.parquet'

# Explicit existence check instead of probing with `d_int + 1` inside a bare
# `except`; `d` is checked as well because the next cell needs both.
if FORCE_RELOAD or 'd' not in globals() or 'd_int' not in globals():
    d = pd.read_parquet(CV_TEMPLATE_PATH)
    try:
        d_int = pd.read_parquet(CV_TEMPLATE_D_INT_PATH)['d_int']
    except Exception as exc:
        # Best effort: any problem with the cached file (missing, unreadable,
        # column absent) falls back to recomputing it. `except Exception`
        # lets KeyboardInterrupt / SystemExit through, unlike a bare except.
        logger.info(f'recomputing d_int, cached file not usable: {exc!r}')
        d_int = d['d'].apply(lambda x: int(x.split('_')[1]))
        d_int.to_frame('d_int').to_parquet(CV_TEMPLATE_D_INT_PATH, index = False)

In [8]:
# Join both submissions with the true values: one row per (id, d) with the true
# `sold` / `revenue` and the two predictions side by side.
final_df = perform_cv(d, sub1, sub2)

In [9]:
# Inspect the merged frame (pandas truncates the display of long frames).
final_df

Unnamed: 0,Level,agg_column1,agg_column2,d,sold,revenue,id_merge,id,pred_x,quantile,pred_y
0,Level8,CA_1,FOODS,d_1802,2779.0,8069.169922,CA_1_FOODS,CA_1_FOODS.0.005_validation,1500.0,0.005,1602.0
1,Level8,CA_1,FOODS,d_1803,2342.0,6562.270020,CA_1_FOODS,CA_1_FOODS.0.005_validation,1499.0,0.005,1587.0
2,Level8,CA_1,FOODS,d_1804,2197.0,6205.859863,CA_1_FOODS,CA_1_FOODS.0.005_validation,1499.0,0.005,1587.0
3,Level8,CA_1,FOODS,d_1805,2395.0,6849.919922,CA_1_FOODS,CA_1_FOODS.0.005_validation,1499.0,0.005,1584.0
4,Level8,CA_1,FOODS,d_1806,2863.0,7845.009766,CA_1_FOODS,CA_1_FOODS.0.005_validation,1499.0,0.005,1592.0
...,...,...,...,...,...,...,...,...,...,...,...
194035,Level2,WI,X,d_1937,11043.0,31697.929688,WI_X,WI_X.0.995_validation,18830.0,0.995,14456.0
194036,Level2,WI,X,d_1938,11504.0,33686.378906,WI_X,WI_X.0.995_validation,18830.0,0.995,14584.0
194037,Level2,WI,X,d_1939,12819.0,37480.449219,WI_X,WI_X.0.995_validation,18830.0,0.995,15260.0
194038,Level2,WI,X,d_1940,14734.0,42004.531250,WI_X,WI_X.0.995_validation,18800.0,0.995,18990.0


In [30]:
# Run the project's DM_test_pinball on the merged predictions.
# NOTE(review): presumably a Diebold-Mariano test on the pinball losses of the two
# submissions (pred_x vs. pred_y) - confirm in utils.metrics. The meaning of the
# positional argument 18 is not visible here; TODO name it. p_crit=0.1 is
# presumably the significance level.
r = DM_test_pinball(final_df, 18, p_crit=0.1)
# `fd` and `group_by` are only needed by the commented-out merge below.
fd = final_df.copy()
fd['ids'] = fd['id_merge']
group_by = ['ids']
# r = pd.merge(
#     r,
#     fd[group_by + ['revenue']],
#     on = group_by,
#     how = 'left'
# ).reset_index(drop=False)
# print(r.head(50))
# Summary over all tested series: mean test statistic, share of rejected null
# hypotheses and share of series without a p-value.
# NOTE(review): in the saved output, rows with NaN stats / p_values are flagged
# h0_rejected=True, so mean_rejected appears to count them as rejections - verify.
print(f"mean_stat: {r['stats'].mean()} - mean_rejected: {r['h0_rejected'].mean()} - nan-pvalues: {r['p_values'].isna().mean()}")

mean_stat: -1.561032296832639 - mean_rejected: 0.6623376623376623 - nan-pvalues: 0.18181818181818182


In [31]:
# Show the first 50 test results ordered by the `level` label.
# NOTE(review): `level` holds strings such as 'Level1', so the order is
# lexicographic (a label like 'Level10' would sort before 'Level2').
r.sort_values('level').head(50)

Unnamed: 0,level,ids,stats,p_values,h0_rejected
109,Level1,Total_X,-0.163425,0.87042,False
153,Level2,WI_X,-2.639611,0.00925,True
108,Level2,TX_X,,,True
54,Level2,CA_X,-1.861461,0.06479,True
97,Level3,TX_3_X,-3.762471,0.00025,True
75,Level3,TX_1_X,,,True
32,Level3,CA_3_X,-8.250498,0.0,True
21,Level3,CA_2_X,,,True
43,Level3,CA_4_X,-0.471196,0.63824,False
120,Level3,WI_1_X,,,True
