In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils.utils import _down_cast, data_preprocessing, diff_lists, log_status, ensemble_submissions_uncertainty
from utils.configure_logger import configure_logger
from utils.utils import prefixes_in_column
from utils import constants

configure_logger()
from logging import getLogger
logger = getLogger(__name__)

import warnings
warnings.simplefilter("ignore")

In [2]:
DATA_BASE_PATH = constants.DATA_BASE_PATH
DATA_BASE_PATH_UNCERTAINTY = constants.DATA_BASE_PATH_UNCERTAINTY
SALES_EVALUATION = constants.SALES_EVALUATION 
SALES_VALIDATION = constants.SALES_VALIDATION
CALENDAR = constants.CALENDAR 
SAMPLE_SUBMISSION = constants.SAMPLE_SUBMISSION 
SELL_PRICES = constants.SELL_PRICES

PRECOMPUTED_BASE_PATH = constants.PRECOMPUTED_BASE_PATH #'../data/uncertainty/features/'

DAYS: int = constants.DAYS #28
QUANTILES: int = constants.QUANTILES 

AGG_LEVEL_COLUMNS = constants.AGG_LEVEL_COLUMNS
D_CROSS_VAL_START_LIST = constants.D_CROSS_VAL_START_LIST

# to simple get the precomputed name
precomputed_name = lambda store, eval_val: f'processed_{store}_{eval_val}.pkl'

TEST_PATH = constants.TEST_PATH#'test/'
PREDICTION_BASE_PATH = constants.PREDICTION_BASE_PATH
SUBMISSION_BASE_PATH = constants.SUBMISSION_BASE_PATH

SUB_D_START_VAL: int = constants.SUB_D_START_VAL
SUB_D_START_EVAL: int = constants.SUB_D_START_EVAL

# the columns are always included after feature processing
# because they are required in the training and submission format
DROP_FEATURE_COLUMNS: list = constants.DROP_FEATURE_COLUMNS #['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'sold']

In [3]:
def read_concat_predictions(fold_name: int, exclude_columns: list = [], include_columns: list = [], sparse = False, use_all = False, load_submissions_path: str = 'temp_submissions/'):
    """ 
    For specified fold, read the predictions for all aggregation levels 
    and stack them together in one dataframe.
    """
    # D_CV_START_LIST
    # if fold_name not in D_CV_START_LIST:
        # raise ValueError('fold_name must be a value in D_CV_START_LIST')
        
    exclude_columns = '_'.join(exclude_columns)
    if exclude_columns == '':
        exclude_columns = 'None'

    logger.info('loading files under path:' + f'../data/uncertainty/fold_{fold_name}/' + load_submissions_path)

    TEST_NUMB = 0
    TEST_NUMBER = 9

    dfs: list = []
    for level in list(AGG_LEVEL_COLUMNS.keys())[TEST_NUMB:TEST_NUMBER]:
        agg_columns = AGG_LEVEL_COLUMNS[level]
        group_names = '_'.join(agg_columns)
        if group_names == '':
            group_names = 'Total_X'
        
        file_path = f'../data/uncertainty/fold_{str(fold_name)}/' + load_submissions_path 
        file_path += f'lgb_val_nt_{group_names}_'
        if use_all:
            file_path += f'use_all.csv'  
        elif include_columns == None:
            file_path += f'exclude_{"_".join(exclude_columns)}.csv'            
        elif isinstance(include_columns, list):
            file_path += f'include_{"_".join(include_columns)}.csv'
        
        dfs.append(file_path)
    return ensemble_submissions_uncertainty(dfs)

In [4]:
def perform_cv(df: pd.DataFrame, df_sub1: pd.DataFrame, df_sub2: pd.DataFrame):
    
    # to be able to merge
    df_sub1['id_merge'] = df_sub1['id']\
        .apply(lambda x: x.split('.')[0])
    df_sub1['quantile'] = df_sub1['id']\
        .apply(
            lambda x: float(
                '.'.join([
                x.split('.')[-2], 
                x.split('.')[-1].split('_')[0]
                ])
            )
        )
    df_sub2['id_merge'] = df_sub2['id']\
        .apply(lambda x: x.split('.')[0])
    df_sub2['quantile'] = df_sub2['id']\
        .apply(
            lambda x: float(
                '.'.join([
                x.split('.')[-2], 
                x.split('.')[-1].split('_')[0]
                ])
            )
        )
    df_sub = pd.merge(
        df_sub1,
        df_sub2,
        how = 'inner',
        on = ['d', 'id_merge', 'quantile', 'id']
    )

    # merge predictions in cv template
    p = pd.merge(
        df,
        df_sub,
        how='right',
        on=['id_merge', 'd',]
    )
    # del df; del df_sub_val
    p['id_merge'] = p['id_merge'].astype(str)

    for c in ['sold', 'revenue']:
        p[c] = p[c].astype(np.float32)
    return p

In [5]:
# define experiments
EXPERIMENTS_DICT = {
    "seasonal": {
        "BASE": [],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ewm'],
            ['seasonal_weekday','auto_sold_ewm'],
            ['seasonal_monthday','auto_sold_ewm'],
            ['seasonal_weekday','seasonal_monthday','auto_sold_ewm'],
            ['seasonal','auto_sold_ewm'],
        ]
    },
    "state vs. store": {
        "BASE": ['seasonal', 'auto_sold_ma'],
        "INCLUDE_COLUMNS_LIST": [
            [],
            ['state_id',],
            ['store_id',],
            ['state_id', 'store_id']
        ]
    },
    "ewm vs. ma": {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ewm'],
            ['auto_sold_ma'],
            ['auto_sold_ewm', 'auto_sold_ma'],
        ]
    },
    "quantiles vs. std": {
        "BASE": ['seasonal', 'auto_sold_ma'],
        "INCLUDE_COLUMNS_LIST": [
            [],
            ['auto_sold_qtile'],
            ['auto_sold_std'],
            ['auto_sold_qtile','auto_sold_std'],   
        ]
    },
    "price auto/momentum": {
        "BASE": ['seasonal', 'auto_sold_ma'],
        "INCLUDE_COLUMNS_LIST": [
            [],
            ['price_auto_std'],
            ['price_momentum'],
            ['price_uncond'],
            ['price_auto_std', 'price_momentum'],
            ['price_auto_std', 'price_momentum', 'price_uncond']
        ]
    },
    "best models": {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ma', 'state_id', 'store_id'],
            ['auto_sold_ma', 'auto_sold_std', 'state_id', 'store_id'],
        ]
    },
    "full vs. sparse ma" : {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ma', 'auto_sold_std', 'auto_sold_qtile', 'auto_sold_ewm', 'state_id', 'store_id'],
            ['auto_sold_ma_28', 'auto_sold_ma_56', 'auto_sold_ma_168', 'state_id', 'store_id']
        ]
    },
    "sparse vs. kbest": {
        "BASE": ['seasonal', 'state_id', 'store_id'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold', 'price', 'kbest'],
            ['auto_sold_ewm_112', 'auto_sold_ewm_28',
             'auto_sold_qtile_28_0.5', 'auto_sold_ma_28', 
             'auto_sold_qtile_28_0.9',],
        ]
    },
    'full vs. sparse': {
        "BASE": ['seasonal'],
        "INCLUDE_COLUMNS_LIST": [
            ['auto_sold_ma', 'auto_sold_std', 'auto_sold_qtile', 'auto_sold_ewm', 'state_id', 'store_id'],
            # ['auto_sold_ma_28', 'auto_sold_ma_56', 'auto_sold_ma_168', 'state_id', 'store_id'],
            ['auto_sold_std_3', 'auto_sold_std_56', 'auto_sold_std_168', 
            'auto_sold_ma_7',  'auto_sold_ma_28', 'auto_sold_ma_56', 
            'auto_sold_qtile_28_0.25', 'auto_sold_qtile_168_0.25', 'auto_sold_qtile_56_0.1', 
            'state_id', 'store_id'],
            ['state_id', 'store_id', 'auto_sold_ewm_112', 'auto_sold_ewm_28',
             'auto_sold_qtile_28_0.5', 'auto_sold_ma_28', 
             'auto_sold_qtile_28_0.9',],
        ]
    },
}

In [89]:
# load true sales values
# these variables are used later on
FORCE_RELOAD = False
try:
    # simple code to check if variable exists
    d_int + 1
    if FORCE_RELOAD:
        raise Exception()
except:
    # if not, load again
    # takes about 2-3 minutes to reload and parse
    # not the most beautiful method but it works
    d = pd.read_parquet('../data/uncertainty/cv_template/temp.parquet')
    try:
        d_int = pd.read_parquet('../data/uncertainty/cv_template/temp_d_int.parquet')['d_int']
    except:
        d_int = d['d'].apply(lambda x: int(x.split('_')[1]))
        d_int.to_frame('d_int').to_parquet('../data/uncertainty/cv_template/temp_d_int.parquet', index = False)

In [156]:
# select experiment
experiment_spec = EXPERIMENTS_DICT['sparse vs. kbest']
include_columns_list = [experiment_spec['BASE'] + include_columns
    for include_columns in experiment_spec['INCLUDE_COLUMNS_LIST']]
for i, include_columns in enumerate(include_columns_list):
        if 'kbest' in include_columns:
                include_columns_list[i] = ['k_best']

# load 2 submissions
print('first model: ' + str(include_columns_list[0]))
include_columns = include_columns_list[0]
sub1 = pd.concat([
        read_concat_predictions(d_cv_start, exclude_columns=[], include_columns=include_columns) 
        for d_cv_start in D_CROSS_VAL_START_LIST]).reset_index(drop=True)

print('second model: ' + str(include_columns_list[1]))
include_columns = include_columns_list[1]
sub2 = pd.concat([
        read_concat_predictions(d_cv_start, exclude_columns=[], include_columns=include_columns) 
        for d_cv_start in D_CROSS_VAL_START_LIST]).reset_index(drop=True)

final_df = perform_cv(d, sub1, sub2)

2023-12-26 19:51:38 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-26 19:51:38 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-26 19:51:38 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-26 19:51:38 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/temp_submissions/


first model: ['k_best']


2023-12-26 19:51:38 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/
2023-12-26 19:51:38 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-26 19:51:39 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-26 19:51:39 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-26 19:51:39 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/temp_submissions/


second model: ['seasonal', 'state_id', 'store_id', 'auto_sold_ewm_112', 'auto_sold_ewm_28', 'auto_sold_qtile_28_0.5', 'auto_sold_ma_28', 'auto_sold_qtile_28_0.9']


2023-12-26 19:51:39 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/


In [174]:
def DM_test_pinball(df, h, p_crit: float = 0.05):
    quantile = df['quantile']
    #
    resid_x = df['sold'] - df['pred_x']
    idx = resid_x >= 0
    pinball_resid_x = resid_x
    pinball_resid_x[idx] = resid_x[idx] * quantile[idx]
    pinball_resid_x[~idx] = resid_x[~idx] * (quantile[~idx]-1)
    #
    resid_y = df['sold'] - df['pred_y']
    idx = resid_y >= 0
    pinball_resid_y = resid_y
    pinball_resid_y[idx] = resid_y[idx] * quantile[idx]
    pinball_resid_y[~idx] = resid_y[~idx] * (quantile[~idx]-1)
    #
    df['pinball_resid'] = pinball_resid_x - pinball_resid_y
    #
    if (pinball_resid_y < 0).sum() > 0 or (pinball_resid_x < 0).sum() > 0:
        print('negative residuals')
    # df['resid_x'] = resid_x
    # df['resid_y'] = resid_y
    # a = df.groupby(['quantile', 'id_merge']).agg(
    #     {
    #         'resid_x': np.mean,
    #         'resid_y': np.mean,
    #         'pinball_resid': np.mean
    #     }
    # ).reset_index(drop=False)
    # print(a[a.id_merge == 'FOODS_1_X'])
    #
    agg_dict = {
        'revenue': 'last',
        'pinball_resid': 'mean',
        'Level': 'last'
    }
    df_qtile_avg = df.groupby(['d', 'id_merge']).agg(agg_dict).reset_index(drop=False)

    ids = []
    stats = []
    p_values = []
    levels = []
    h0_rejected = []
    for id_merge, df_s in df_qtile_avg.groupby('id_merge'):
        # if id_merge != 'Total_X': continue
        
        # compute cov
        p_s = df_s['pinball_resid']
        mean = p_s.mean()
        T = len(p_s)
        
        def auto_cov(resid, lag, mean):
            resid = list(resid)
            cov = 0
            T = float(len(resid))
            for i in np.arange(0, len(resid)-lag):
                cov += ((resid[i+lag])-mean)*(resid[i]-mean)
            return (1/(T))*cov
        
        gamma = []
        for lag in range(h):
            gamma.append(auto_cov(p_s, lag, mean))
        
        # compute stat
        V_d = (gamma[0] + 2*sum(gamma[1:]))/T
        DM_stat=V_d**(-0.5)*mean
        harvey_adj=( ( T+1-2*h+h*(h-1)/T) / T ) ** 0.5
        # print(harvey_adj)
        DM_stat = harvey_adj*DM_stat

        # compute p_value
        from scipy.stats import t
        p_value = 2*t.cdf(-abs(DM_stat), df = T - 1)
        
        # store results
        levels.append(df_s['Level'].iloc[0])
        ids.append(id_merge)
        stats.append(DM_stat)
        p_values.append(round(p_value,5))
        h0_rejected.append(True if p_value < p_crit else (True if pd.isna(p_value) else False))
        
    return pd.DataFrame({
        'level': levels,
        'ids': ids,
        'stats': stats,
        'p_values': p_values,
        'h0_rejected': h0_rejected
    })

In [154]:
r = DM_test_pinball(final_df, 17, p_crit=0.1)
fd = final_df.copy()
fd['ids'] = fd['id_merge']
group_by = ['ids']
print(f"mean_stat: {r['stats'].mean()} - mean_rejected: {r['h0_rejected'].mean()} - nan-pvalues: {r['p_values'].isna().mean()}")

mean_stat: 7.638706196793833 - mean_rejected: 0.8181818181818182 - nan-pvalues: 0.006493506493506494


In [155]:
# r.sort_values('p_values').tail(50)
r.sort_values('level').head(50)
# avg stat positive >> second model performs better
# avg stat negative >> first model performs better

Unnamed: 0,level,ids,stats,p_values,h0_rejected
109,Level1,Total_X,1.405514,0.1621,False
153,Level2,WI_X,3.091734,0.00241,True
108,Level2,TX_X,-0.075447,0.93997,False
54,Level2,CA_X,2.161587,0.03236,True
97,Level3,TX_3_X,2.738737,0.00698,True
75,Level3,TX_1_X,0.170987,0.86448,False
32,Level3,CA_3_X,-0.471191,0.63824,False
21,Level3,CA_2_X,10.755763,0.0,True
43,Level3,CA_4_X,1.648623,0.10148,False
120,Level3,WI_1_X,3.218759,0.0016,True


In [175]:
def perform_full_DM_test(include_columns_list):
    # load 2 submissions
    include_columns = include_columns_list[0]
    sub1 = pd.concat([
            read_concat_predictions(d_cv_start, exclude_columns=[], include_columns=include_columns) 
            for d_cv_start in D_CROSS_VAL_START_LIST]).reset_index(drop=True)
    include_columns = include_columns_list[1]
    sub2 = pd.concat([
            read_concat_predictions(d_cv_start, exclude_columns=[], include_columns=include_columns) 
            for d_cv_start in D_CROSS_VAL_START_LIST]).reset_index(drop=True)
    final_df = perform_cv(d, sub1, sub2)
    return DM_test_pinball(final_df, 7, p_crit=0.1)

In [183]:
res = {}
for experiment_name, experiment_spec in EXPERIMENTS_DICT.items():
    res[experiment_name] = {}
    base = experiment_spec['BASE']
    include_columns_list = [base + include_columns for include_columns in experiment_spec['INCLUDE_COLUMNS_LIST']]
    for i, include_columns in enumerate(include_columns_list):
            if 'kbest' in include_columns:
                    include_columns_list[i] = ['k_best']

    import itertools
    for incl in itertools.combinations(include_columns_list,2):
        # incl = include_columns_list[:2]
        r = perform_full_DM_test(incl)
        name = ['_'.join(i) for i in incl]
        name = ' '.join(name)
        # res[experiment_name][name] = r['h0_rejected'].mean()
        res[experiment_name][name] = r.groupby('level')['h0_rejected'].mean().mean()
        
        for k, p_value in enumerate(r['p_values'].sort_values(ascending=True)):
                if p_value <= 0.1 / (len(r['h0_rejected']) - k):
                        continue
                else:
                        print(f'{name} insignificant at {i}/{len(r["h0_rejected"])}')
                        break                      
        break
res

2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-27 14:05:13 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-27 14:05:14 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/t

auto_sold_ewm seasonal_weekday_auto_sold_ewm insignificant at 4/154


2023-12-27 14:05:23 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/
2023-12-27 14:05:23 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-27 14:05:23 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-27 14:05:23 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-27 14:05:24 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/temp_submissions/
2023-12-27 14:05:24 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/
2023-12-27 14:05:32 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-27 14:05:32 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-27 14:05:32 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/t

seasonal_auto_sold_ma seasonal_auto_sold_ma_state_id insignificant at 3/154


2023-12-27 14:05:32 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/
2023-12-27 14:05:33 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1802/temp_submissions/
2023-12-27 14:05:33 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1830/temp_submissions/
2023-12-27 14:05:33 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1858/temp_submissions/
2023-12-27 14:05:33 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1886/temp_submissions/
2023-12-27 14:05:33 - __main__ - INFO - loading files under path:../data/uncertainty/fold_1914/temp_submissions/


KeyboardInterrupt: 

In [182]:
for experiment_name, experiment_results in res.items():
    print(experiment_name)
    for key, item in experiment_results.items():
        if item <= .5:
            print(key, item)

seasonal
auto_sold_ewm seasonal_monthday_auto_sold_ewm 0.24638447971781308
seasonal_weekday_auto_sold_ewm seasonal_weekday_seasonal_monthday_auto_sold_ewm 0.23209876543209873
seasonal_weekday_auto_sold_ewm seasonal_auto_sold_ewm 0.45044091710758377
seasonal_weekday_seasonal_monthday_auto_sold_ewm seasonal_auto_sold_ewm 0.4964726631393297
state vs. store
ewm vs. ma
seasonal_auto_sold_ewm seasonal_auto_sold_ma 0.4125220458553792
seasonal_auto_sold_ewm seasonal_auto_sold_ewm_auto_sold_ma 0.4181657848324515
seasonal_auto_sold_ma seasonal_auto_sold_ewm_auto_sold_ma 0.3703703703703704
quantiles vs. std
seasonal_auto_sold_ma seasonal_auto_sold_ma_auto_sold_qtile 0.29964726631393296
seasonal_auto_sold_ma seasonal_auto_sold_ma_auto_sold_std 0.4425044091710758
seasonal_auto_sold_ma seasonal_auto_sold_ma_auto_sold_qtile_auto_sold_std 0.31164021164021166
seasonal_auto_sold_ma_auto_sold_qtile seasonal_auto_sold_ma_auto_sold_std 0.2641975308641975
seasonal_auto_sold_ma_auto_sold_qtile seasonal_auto_

In [173]:
temp_res

{'seasonal': {'auto_sold_ewm seasonal_weekday_auto_sold_ewm': 0.9155844155844156,
  'auto_sold_ewm seasonal_monthday_auto_sold_ewm': 0.2662337662337662,
  'auto_sold_ewm seasonal_weekday_seasonal_monthday_auto_sold_ewm': 0.9090909090909091,
  'auto_sold_ewm seasonal_auto_sold_ewm': 0.922077922077922,
  'seasonal_weekday_auto_sold_ewm seasonal_monthday_auto_sold_ewm': 0.8896103896103896,
  'seasonal_weekday_auto_sold_ewm seasonal_weekday_seasonal_monthday_auto_sold_ewm': 0.2597402597402597,
  'seasonal_weekday_auto_sold_ewm seasonal_auto_sold_ewm': 0.35064935064935066,
  'seasonal_monthday_auto_sold_ewm seasonal_weekday_seasonal_monthday_auto_sold_ewm': 0.9025974025974026,
  'seasonal_monthday_auto_sold_ewm seasonal_auto_sold_ewm': 0.9025974025974026,
  'seasonal_weekday_seasonal_monthday_auto_sold_ewm seasonal_auto_sold_ewm': 0.35714285714285715},
 'state vs. store': {'seasonal_auto_sold_ma seasonal_auto_sold_ma_state_id': 0.5584415584415584,
  'seasonal_auto_sold_ma seasonal_auto_sold