In [1]:
# --- imports: standard library, third-party, then project-local ---
import warnings
from logging import getLogger

import pandas as pd

from utils import constants
from utils.configure_logger import configure_logger
from utils.utils import _down_cast, data_preprocessing

# --- notebook-wide setup ---
# Install the project's logging configuration before creating this notebook's logger.
configure_logger()
logger = getLogger(__name__)

# NOTE: silences every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [2]:
# Local aliases for the project-wide settings defined in utils.constants.
# The trailing comments record the values the original author expected each
# constant to hold; utils.constants remains the source of truth.
DATA_BASE_PATH = constants.DATA_BASE_PATH  # '../data/m5-forecasting-accuracy/'
DATA_BASE_PATH_UNCERTAINTY = constants.DATA_BASE_PATH_UNCERTAINTY  # '../data/m5-forecasting-uncertainty/'
SALES_EVALUATION = constants.SALES_EVALUATION  # 'sales_train_evaluation.csv'
SALES_VALIDATION = constants.SALES_VALIDATION  # 'sales_train_validation.csv'
CALENDAR = constants.CALENDAR  # 'calendar.csv'
SAMPLE_SUBMISSION = constants.SAMPLE_SUBMISSION  # 'sample_submission.csv'
SELL_PRICES = constants.SELL_PRICES  # 'sell_prices.csv'

PRECOMPUTED_BASE_PATH = constants.PRECOMPUTED_BASE_PATH  # '../data/uncertainty/features/'

DAYS: int = constants.DAYS  # 28
# Annotated as a list (not int): the expected value is a list of quantile levels.
QUANTILES: list = constants.QUANTILES  # [0.005, 0.025, 0.165, 0.25, 0.50, 0.75, 0.835, 0.975, 0.995]
AGG_LEVEL_COLUMNS = constants.AGG_LEVEL_COLUMNS
D_CV_START_LIST = constants.D_CROSS_VAL_START_LIST  # [1802, 1830, 1858, 1886, 1914]

In [3]:
# Load every raw input table from DATA_BASE_PATH and pass each one through
# _down_cast. Files are read in the order listed below.
# (SALES_EVALUATION is intentionally not loaded in this notebook.)
sales_validation, calendar, sample_submission, sell_prices = (
    _down_cast(pd.read_csv(DATA_BASE_PATH + csv_name))
    for csv_name in (SALES_VALIDATION, CALENDAR, SAMPLE_SUBMISSION, SELL_PRICES)
)

In [4]:
# Combine sales, calendar and price tables into one frame.
df, submission_idx = data_preprocessing(sales_validation, calendar, sell_prices)

# Keep only rows whose week number is strictly after the row's `release` value.
after_release = df['wm_yr_wk'] > df['release']
df = df.loc[after_release]

# The raw tables are no longer needed; drop the references to free memory.
del sales_validation, calendar, sample_submission, sell_prices

In [24]:
def compute_weights(df: pd.DataFrame, d_cv_start: int = 1914) -> pd.DataFrame:
    """Compute revenue-based weights for every aggregation level.

    Revenue is ``sold * sell_price``, restricted to the ``DAYS`` days that
    immediately precede ``d_cv_start`` (``d_{d_cv_start - DAYS}`` up to and
    including ``d_{d_cv_start - 1}``). Within each level of
    ``AGG_LEVEL_COLUMNS`` the weight of a group is its share of the total
    revenue in that window. ``'Level1'`` is a single row with weight 1.

    The input frame is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``d``, ``sold``, ``sell_price`` and every
        column named in ``AGG_LEVEL_COLUMNS``.
    d_cv_start : int, default 1914
        First day number of the fold; the weighting window ends the day before.

    Returns
    -------
    pd.DataFrame
        Columns ``Level_id``, ``Weight``, ``Agg_Level_1``, ``Agg_Level_2``.
        ``Agg_Level_2`` is ``'X'`` for levels with a single grouping column.
        The per-level frames are concatenated without resetting the index,
        so index values repeat across levels.
    """
    window_days = [f'd_{i}' for i in range(d_cv_start - DAYS, d_cv_start)]

    # Filter first, then derive revenue on the (much smaller) window via
    # .assign, which returns a new frame: the caller's DataFrame is never
    # written to and no revenue is computed for days outside the window.
    window = df[df['d'].isin(window_days)]
    window = window.assign(revenue=window['sold'] * window['sell_price'])

    weights_per_level: list = []
    for agg_level in AGG_LEVEL_COLUMNS:
        logger.info(agg_level)
        agg_columns = AGG_LEVEL_COLUMNS[agg_level]

        if agg_level == 'Level1':
            # The grand total is a single series carrying the full weight.
            data = {
                'Level_id': agg_level,
                'Weight': pd.Series([1]),
                'Agg_Level_1': 'Total',
                'Agg_Level_2': 'X',
            }
        else:
            grouped_revenue = (
                window.groupby(agg_columns)['revenue'].sum().reset_index(drop=False)
            )
            data = {
                'Level_id': agg_level,
                'Weight': grouped_revenue['revenue'] / grouped_revenue['revenue'].sum(),
                'Agg_Level_1': grouped_revenue[agg_columns[0]],
                # Single-column levels use the placeholder 'X' as second key.
                'Agg_Level_2': (
                    'X' if len(agg_columns) == 1 else grouped_revenue[agg_columns[1]]
                ),
            }

        weights_per_level.append(pd.DataFrame(data))
    return pd.concat(weights_per_level)

In [26]:
# Compute the weights for every cross-validation fold and store them as CSV
# inside that fold's directory (the directory is expected to exist already).
for D_CV_START in D_CV_START_LIST:
    logger.info(f'd_cv_start: {D_CV_START}')

    fold_dir = f'../data/uncertainty/fold_{D_CV_START}/'
    weights = compute_weights(df, D_CV_START)
    weights.to_csv(fold_dir + 'weights_validation.csv')

2023-08-21 19:34:46 - __main__ - INFO - d_cv_start: 1802
2023-08-21 19:34:47 - __main__ - INFO - Level1
2023-08-21 19:34:47 - __main__ - INFO - Level2
2023-08-21 19:34:47 - __main__ - INFO - Level3
2023-08-21 19:34:47 - __main__ - INFO - Level4
2023-08-21 19:34:47 - __main__ - INFO - Level5
2023-08-21 19:34:47 - __main__ - INFO - Level6
2023-08-21 19:34:47 - __main__ - INFO - Level7
2023-08-21 19:34:47 - __main__ - INFO - Level8
2023-08-21 19:34:47 - __main__ - INFO - Level9
2023-08-21 19:34:47 - __main__ - INFO - Level10
2023-08-21 19:34:47 - __main__ - INFO - Level11
2023-08-21 19:34:47 - __main__ - INFO - Level12
2023-08-21 19:34:47 - __main__ - INFO - d_cv_start: 1830
2023-08-21 19:34:48 - __main__ - INFO - Level1
2023-08-21 19:34:48 - __main__ - INFO - Level2
2023-08-21 19:34:48 - __main__ - INFO - Level3
2023-08-21 19:34:48 - __main__ - INFO - Level4
2023-08-21 19:34:48 - __main__ - INFO - Level5
2023-08-21 19:34:48 - __main__ - INFO - Level6
2023-08-21 19:34:48 - __main__ - INFO

### Validate that the weights of each level sum to 1

In [20]:
# Re-read the stored weights of every fold and check that, within each
# aggregation level, the weights add up to 1.
for D_CV_START in D_CV_START_LIST:
    weights = pd.read_csv(f'../data/uncertainty/fold_{D_CV_START}/weights_validation.csv')
    # `level_id` (not `id`) so the builtin id() is not shadowed in the kernel.
    for level_id, weight_level in weights.groupby('Level_id'):
        level_total = weight_level['Weight'].sum()

        # Rounding to 7 decimals absorbs floating-point accumulation error.
        assert round(level_total, 7) == 1., (
            f'fold {D_CV_START}, {level_id}: weights sum to {level_total}, expected 1'
        )