In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
import lightgbm as lgb
from utils.utils import _down_cast, data_preprocessing, diff_lists, log_status
from utils.utils import ensemble_submissions, ensemble_submissions_uncertainty
from utils.metrics import WSPL
from utils.configure_logger import configure_logger
from utils.utils import prefixes_in_column
from utils import constants

configure_logger()
from logging import getLogger
logger = getLogger(__name__)

import warnings
warnings.simplefilter("ignore")

In [2]:
# Local aliases for the project-wide constants defined in utils.constants,
# so the rest of the notebook can refer to them without the module prefix.
DATA_BASE_PATH = constants.DATA_BASE_PATH
DATA_BASE_PATH_UNCERTAINTY = constants.DATA_BASE_PATH_UNCERTAINTY
SALES_EVALUATION = constants.SALES_EVALUATION
SALES_VALIDATION = constants.SALES_VALIDATION
CALENDAR = constants.CALENDAR
SAMPLE_SUBMISSION = constants.SAMPLE_SUBMISSION 
SELL_PRICES = constants.SELL_PRICES

PRECOMPUTED_BASE_PATH = constants.PRECOMPUTED_BASE_PATH

# DAYS: length of the prediction window (used as range(DAYS) from the fold start)
DAYS: int = constants.DAYS
# QUANTILES: iterated over when training one model per quantile
# NOTE(review): annotated as a list because it is looped over; confirm the exact container type in utils.constants
QUANTILES: list = constants.QUANTILES 

# AGG_LEVEL_COLUMNS: indexed by level name (e.g. 'Level1') to get that level's aggregation columns
AGG_LEVEL_COLUMNS = constants.AGG_LEVEL_COLUMNS
# D_CROSS_VAL_START_LIST: the fold start days that the training/evaluation loops iterate over
D_CROSS_VAL_START_LIST = constants.D_CROSS_VAL_START_LIST

# helper that builds the file name of a precomputed pickle from a store and an eval/val tag
precomputed_name = lambda store, eval_val: f'processed_{store}_{eval_val}.pkl'

TEST_PATH = constants.TEST_PATH#'test/'
PREDICTION_BASE_PATH = constants.PREDICTION_BASE_PATH
SUBMISSION_BASE_PATH = constants.SUBMISSION_BASE_PATH

SUB_D_START_VAL: int = constants.SUB_D_START_VAL
SUB_D_START_EVAL: int = constants.SUB_D_START_EVAL

# the columns are always included after feature processing
# because they are required in the training and submission format;
# they are dropped from the model inputs right before training
DROP_FEATURE_COLUMNS: list = constants.DROP_FEATURE_COLUMNS #['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'd', 'sold']

### Define GridSearch functions

In [3]:
@log_status
def grid_search(
    params: dict,
    param_grid: dict,
    features: pd.DataFrame, 
    targets: pd.DataFrame, 
    n_folds: int = 1,
    fig: plt.Figure = None,
    ax: plt.Axes = None
    ):
    """
    Given a grid with hyperparameters, train LightGBM for all possible combinations.
    Returns the parameter set with the best score and the dictionary with all results.

    Parameters
    ----------
    params : fixed LightGBM parameters; they are applied on top of every grid
        combination, so a key present in both takes its value from `params`.
    param_grid : maps a parameter name to the list of candidate values.
    features, targets : training data; each fold holds out a random 20% for validation.
    n_folds : number of random train/validation splits per combination.
    fig, ax : optional matplotlib figure/axes to draw the validation curves on
        (only used when the global PLOT_EVAL is truthy; created if missing).

    Returns
    -------
    (best, results) : `best` is the entry with the lowest mean validation score,
        `results` maps 'combination_{i}' to
        {'params', 'res' (per-fold scores), 'validation_score' (their mean)}.
    """
    import itertools

    # positional index so that the random split is independent of the caller's index
    features = features.reset_index(drop=True)
    targets = targets.reset_index(drop=True)

    # explicit None checks instead of relying on the truthiness of figure objects
    if PLOT_EVAL and (fig is None or ax is None):
        fig, ax = plt.subplots(1,1, figsize=(10,5))

    grid_keys = list(param_grid.keys())
    results = {}

    for i, grid_values in enumerate(itertools.product(*param_grid.values()), 1):

        # full parameter set for this run: grid values first, fixed params on top
        param_combination = dict(zip(grid_keys, grid_values))
        param_combination.update(params)

        fold_scores = []
        for j in range(n_folds):

            # seed 43 for the first fold (unchanged single-fold behaviour) and a
            # different but fixed seed for each further fold so runs are repeatable
            features_train, features_validation, targets_train, targets_validation =\
                train_test_split(features, targets, train_size = .8, random_state = 43 + j)

            # lgb fills this dict with the validation score after every boosting round
            temp_dict = {}
            lgb.train(param_combination, 
                train_set = lgb.Dataset(features_train, targets_train),
                valid_sets = lgb.Dataset(features_validation, targets_validation),
                evals_result = temp_dict,
                verbose_eval=False
            )
            evals = temp_dict['valid_0']['quantile']

            # plot the validation curve of this run
            if PLOT_EVAL:
                ax.plot(range(1,len(evals)+1), np.log(evals), label = 'lr: ' + str(param_combination.get('learning_rate')))
                ax.set_xlim(-100, len(evals)+100)

            # the score after the last boosting round is the score of this fold
            fold_scores.append(evals[-1])

        results[f"combination_{i}"] = {
            'params': param_combination,
            'res': fold_scores,
            'validation_score': np.mean(fold_scores),
        }

        # progress log; .get keeps it from failing when a key is not part of the search
        logger.info(
            f"{param_combination.get('learning_rate')} - {param_combination.get('num_leaves')} - {param_combination.get('n_estimators')}"
            + ' - score: ' + str(np.mean(fold_scores)) + ' ' + str(np.std(fold_scores))
        )

    # lowest validation score wins (first one on ties)
    best_result = min(results.values(), key=lambda item: item["validation_score"])
    return best_result, results

### LoadData
This class keeps the most recently loaded training features and targets in memory, so the same data is not reloaded and reprocessed on every call.

In [4]:
class LoadData:
    """
    Class to load data quickly (and prevent reloading).

    The training features/targets of the last requested (level, fold) pair are
    kept on the instance; asking for the same pair again is a no-op.
    """
    # number of days before the fold start that are used as training data
    TRAIN_WINDOW_DAYS = 1460

    def __init__(self):
        self.level = None
        self.sub_d_start = None
        self.features_train = None
        self.targets_train = None

    def prep_data(self, level, sub_d_start):
        """
        Read the precomputed 'val' features for the given aggregation level and
        fold and store the training features/targets on the instance.

        level: key of AGG_LEVEL_COLUMNS (e.g. 'Level1')
        sub_d_start: first day of the fold; rows of the TRAIN_WINDOW_DAYS days
            before it that have a known 'sold' value form the training set
        """
        # already loaded for exactly this level and fold -> nothing to do
        # (the fold is part of the key: the same level in another fold is different data)
        if (
            self.features_train is not None
            and self.level == level
            and self.sub_d_start == sub_d_start
        ):
            return

        # file names use the aggregation columns of the level
        agg_columns = AGG_LEVEL_COLUMNS[level]
        if len(agg_columns) == 0:
            agg_str: str = 'Total_X'
        elif len(agg_columns) == 1:
            agg_str: str = f'{agg_columns[0]}_X'
        else:
            agg_str: str = '_'.join(agg_columns)

        logger.info('(re)loading features')
        features = pd.read_parquet(f'../data/uncertainty/fold_{sub_d_start}/features/' + f'features_val_{agg_str}.parquet')
        features = _down_cast(features)

        # training rows: known target and inside the training window
        train_days = [f'd_{sub_d_start - 1 - i}' for i in range(self.TRAIN_WINDOW_DAYS)]
        train_idx = features['sold'].notna() & features['d'].isin(train_days)
        df_train = features[train_idx]

        self.features_train = df_train.drop(DROP_FEATURE_COLUMNS, axis = 1, errors = 'ignore')
        self.targets_train = df_train['sold']
        # only mark the data as loaded once loading has actually succeeded
        self.level = level
        self.sub_d_start = sub_d_start

    def get_prep_data(self):
        """ Return (features_train, targets_train) of the last prep_data call. """
        if self.features_train is None:
            raise RuntimeError('no data loaded yet, call prep_data(level, sub_d_start) first')
        return self.features_train, self.targets_train

### Test Training a Single Model

In [5]:
# load data example, to investigate which features are computed, among other things
# (training data of aggregation level 'Level1' for the fold that starts at day 1914)
level = 'Level1'
dataLoader = LoadData()
dataLoader.prep_data(level, 1914)
# `features` and `targets` are reused by the test cells below
features, targets = dataLoader.get_prep_data()
# show the names of all available feature columns
list(features.columns)

2024-01-17 16:53:52 - __main__ - INFO - (re)loading features


['auto_sold_56',
 'auto_sold_1',
 'auto_sold_2',
 'auto_sold_28',
 'auto_sold_7',
 'auto_sold_14',
 'auto_sold_qtile_168_0.75',
 'auto_sold_qtile_112_0.99',
 'auto_sold_qtile_14_0.25',
 'auto_sold_qtile_14_0.75',
 'auto_sold_ewm_56',
 'auto_sold_ewm_7',
 'auto_sold_qtile_7_0.25',
 'auto_sold_qtile_112_0.5',
 'auto_sold_qtile_14_0.1',
 'auto_sold_qtile_56_0.75',
 'auto_sold_qtile_168_0.99',
 'auto_sold_qtile_7_0.75',
 'auto_sold_std_168',
 'auto_sold_ewm_3',
 'auto_sold_qtile_56_0.25',
 'auto_sold_qtile_168_0.01',
 'auto_sold_ma_3',
 'auto_sold_qtile_14_0.5',
 'auto_sold_qtile_56_0.5',
 'auto_sold_ewm_14',
 'auto_sold_ewm_168',
 'auto_sold_ma_21',
 'auto_sold_qtile_3_0.5',
 'auto_sold_ewm_28',
 'auto_sold_qtile_21_0.75',
 'auto_sold_qtile_168_0.1',
 'auto_sold_qtile_112_0.01',
 'auto_sold_qtile_168_0.9',
 'auto_sold_qtile_28_0.75',
 'auto_sold_qtile_112_0.75',
 'auto_sold_std_3',
 'auto_sold_qtile_21_0.9',
 'auto_sold_qtile_21_0.25',
 'auto_sold_ma_168',
 'auto_sold_qtile_21_0.1',
 'aut

### Test run for training the model

In [6]:
# single hold-out split: the last 28 rows are kept apart for validation
# (shuffle=False keeps the row order, so random_state has no influence here)
prefixes = ['seasonal', 'auto_sold_ewm']
features_train, features_validation, targets_train, targets_validation =\
    train_test_split(features, targets, test_size = 28, shuffle=False, random_state=42)

# LightGBM parameters for a single quantile model
params = {
    'objective': 'quantile',
    # 'metric': 'quantile',
    'boosting_type': 'gbdt',
    'random_state': 43,
    # 'verbose' was listed twice (-100 and 0); in a dict literal the last entry wins,
    # so 0 is the value that was actually in effect and is the one kept here
    'verbose': 0,
    'n_jobs': 4,
    "num_leaves": 30,
    "min_child_weight": .1,
    "min_child_samples": 4,
    "hist_pool_size": 1000,
    'feature_fraction': 0.9,
    "learning_rate": 0.005,
    "n_estimators": 2000,
    "max_depth": 10,
    'alpha': .25,
}

# UNCOMMENT FOR TEST RUN OF TRAINING A MODEL FOR ALL QUANTILES    
# for q in [0.005, 0.025, 0.135, 0.25, 0.5, 0.75, 0.865, 0.975, 0.995]:
#     params['alpha'] = q 
#     temp_dict = {}
#     mod: lgb.Booster = lgb.train(params, 
#         train_set = lgb.Dataset(features_train, targets_train),
#         valid_sets = lgb.Dataset(features_validation, targets_validation),
#         evals_result = temp_dict,
#         verbose_eval = False
#     )
#     plt.plot(mod.predict(features_validation), label = f'{q}')
# plt.scatter(range(len(targets_validation.index)), targets_validation, label = 'true', s = 10)
# plt.legend()
# plt.grid()
# plt.show()

### Test Run for Grid Search

In [7]:
# fixed LightGBM settings that every grid-search candidate shares
params = dict(
    objective='quantile',
    # metric='quantile',
    boosting_type='gbdt',
    random_state=43,
    verbose=-1,
    n_jobs=4,
    hist_pool_size=1000,
)

# candidate values per hyperparameter; every combination of them is tried
param_grid = dict(
    max_depth=[10],
    n_estimators=[200, 800],
    min_child_samples=[4],
    min_child_weight=[0.1],
    num_leaves=[30],  # 50, 70, 90, ],
    learning_rate=[0.001, 0.005, 0.01, 0.02],
    subsample=[0.9, 1],
    subsample_freq=[1],
)

# UNCOMMENT FOR TEST RUN OF GRID SEARCH
# # test grid search for all quantiles
# for q in QUANTILES[:1]:
#     # of course, update quantile in params
#     params['alpha'] = q
#     best_res, res = grid_search(params, param_grid, features_train, targets_train, 1)
#     logger.info(best_res['params'])

#     mod = lgb.train(best_res['params'],
#         train_set = lgb.Dataset(features_train, targets_train)
#     )
#     predictions = mod.predict(features_validation)
#     plt.plot(predictions, label = str(q))

# plt.scatter(range(len(targets_validation)), targets_validation)
# plt.legend()
# plt.grid()
# plt.show()

# Train model for all quantiles for specific fold + features

In [8]:
def _feature_set_tag(include_columns, exclude_columns, use_all, sparse_features) -> str:
    """ Return the file-name fragment that identifies the feature set a model was trained on. """
    if use_all:
        return 'use_all'
    if include_columns is not None and 'kbest' in include_columns:
        return 'include_k_best'
    if sparse_features:
        return 'sparse'
    if include_columns is None:
        return 'exclude_' + ('_'.join(exclude_columns) if exclude_columns else 'None')
    return 'include_' + ('_'.join(include_columns) if include_columns else 'None')


def train_level_all_quantiles(
    agg_level: str, 
    type_of: str, 
    sub_d_start: int, 
    exclude_columns: list = [], 
    include_columns: list = None,
    test: bool = False, 
    do_grid_search: bool = False, 
    store_submissions_path: str = None, 
    normalize: bool = False,
):
    """ 
    Train, for a specific aggregation level, one LightGBM model per quantile in QUANTILES,
    store every model with joblib and predict the DAYS days starting at `sub_d_start`.

    Feature selection (first match wins):
      1. global USE_ALL, or 'kbest' in `include_columns`: start from all columns
         (for 'kbest', SelectKBest then keeps at most 5 of the non-categorical ones);
      2. global SPARSE_FEATURES: only the columns listed there;
      3. `include_columns` is None: all columns that contain none of `exclude_columns`;
      4. otherwise: all columns that contain at least one entry of `include_columns`.
    Matching is by substring, not strictly by prefix.

    For the levels listed in the global `undersampling_dict` (and HIGH_UNDERSAMPLING
    truthy) the training data is randomly undersampled to reduce training time.

    Parameters
    ----------
    agg_level : key of AGG_LEVEL_COLUMNS.
    type_of : 'val' or anything else (= evaluation); part of the file names.
    sub_d_start : first day of the prediction window / name of the fold.
    exclude_columns, include_columns : see feature selection above.
        (`exclude_columns` is never mutated, so the shared default list is harmless.)
    test : read the features from the TEST_PATH sub folder.
    do_grid_search : tune with grid_search(params, param_grid, ...) per quantile,
        otherwise train with PARAM_GRID_TRAIN.
    store_submissions_path : sub folder of the fold directory to write the
        predictions csv to; nothing is written when falsy.
    normalize : standardise the targets before training (predictions are scaled back).

    Returns
    -------
    The predictions of all quantiles stacked in one DataFrame with the columns
    ['Level', 'agg_column1', 'agg_column2', 'd', 'quantile', 'pred', 'type_of'].
    """
    import joblib
    from sklearn.feature_selection import f_regression
    from sklearn.preprocessing import StandardScaler

    if PLOT_EVAL and PLOT_PREDICTIONS:
        raise ValueError('PLOT_EVAL and PLOT_PREDICTIONS cannot be both True')

    ALWAYS_KEEP_COLUMNS = ['days_fwd', 'sold', 'd']
    ID_COLUMNS = ('state_id', 'store_id')
    TRAIN_WINDOW_DAYS = 1300
    fold_path = f'../data/uncertainty/fold_{sub_d_start}/'

    # normalise the column arguments once, so that every later branch can rely on them
    include_prefix_list = None if include_columns is None else list(include_columns)
    exclude_prefix_list = list(exclude_columns) if exclude_columns else []
    use_kbest = include_prefix_list is not None and 'kbest' in include_prefix_list
    feature_tag = _feature_set_tag(include_prefix_list, exclude_prefix_list, USE_ALL, SPARSE_FEATURES)

    # transform 'level{i}' to agg columns concatenated
    agg_columns = list(AGG_LEVEL_COLUMNS[agg_level])
    if len(agg_columns) == 0:
        agg_str: str = 'Total_X'
    elif len(agg_columns) == 1:
        agg_str: str = f'{agg_columns[0]}_X'
    else:
        agg_str: str = '_'.join(agg_columns)
    # note: output files use the plain join (no '_X' suffix for a single column)
    group_names = '_'.join(agg_columns) or 'Total_X'

    # load feature set
    logger.info('loading features')
    features = pd.read_parquet(fold_path + 'features/' + (TEST_PATH if test else '') + f'features_{type_of}_{agg_str}.parquet')
    features = _down_cast(features)

    # separate train/pred indices
    train_days = [f'd_{sub_d_start - 1 - i}' for i in range(TRAIN_WINDOW_DAYS)]
    pred_days = [f'd_{sub_d_start + i}' for i in range(DAYS)]
    train_idx = features['sold'].notna() & features['d'].isin(train_days)
    pred_idx = features['d'].isin(pred_days)

    # select features (a plain list, so the always-keep columns can be appended)
    all_columns = list(features.columns)
    if USE_ALL or use_kbest:
        columns = all_columns
    elif SPARSE_FEATURES:
        columns = [c for c in all_columns if c in SPARSE_FEATURES]
    elif include_prefix_list is None:
        columns = [c for c in all_columns if not any(p in c for p in exclude_prefix_list)]
    else:
        columns = [c for c in all_columns if any(p in c for p in include_prefix_list)]

    # add always keep columns to selected features
    for column in ALWAYS_KEEP_COLUMNS + agg_columns:
        if column not in columns:
            columns.append(column)

    # get final dataframes
    selected = features[columns]
    df_pred = selected[pred_idx]
    df_train = selected[train_idx]
    # the full frame is not needed anymore, free it before training
    del features, selected

    # drop days of extremely low sales in high aggregation levels
    # this is very likely a store closure or something else
    if agg_level not in ['Level9', 'Level10', 'Level11', 'Level12']:
        df_train = df_train[df_train['sold'] >= 20]

    # state_id / store_id stay in the model inputs when all features are used
    # or when they were explicitly asked for; every other drop column is removed
    if USE_ALL or use_kbest:
        keep_id_columns = set(ID_COLUMNS)
    else:
        keep_id_columns = {c for c in ID_COLUMNS if c in (include_prefix_list or [])}
    drop_feature_columns = [c for c in DROP_FEATURE_COLUMNS if c not in keep_id_columns]

    features_train: pd.DataFrame = df_train.drop(drop_feature_columns, axis = 1, errors = 'ignore')
    targets_train: pd.Series = df_train['sold']
    features_predict: pd.DataFrame = df_pred.drop(drop_feature_columns, axis = 1, errors = 'ignore')

    if use_kbest:
        # cannot do selectkbest for category variables
        # others of these should always be kept
        exclude_from_kbest = [
            'state_id', 'store_id',
            'seasonal_weekday', 'seasonal_monthday', 'seasonal_month', 
            'days_fwd'
        ]
        # missing values are replaced by 0, so no rows have to be filtered out afterwards
        kbest_inputs = features_train.drop(exclude_from_kbest, axis=1, errors='ignore').fillna(0)
        fit = SelectKBest(
                k=min(5, kbest_inputs.shape[1]),
                score_func=f_regression
            ).fit(kbest_inputs, targets_train)
        features_keep = list(fit.get_feature_names_out())
        features_keep += [c for c in exclude_from_kbest if c in features_train.columns]
        logger.info(f'kbest selected features: {features_keep}')
        features_train = features_train[features_keep]
        features_predict = features_predict[features_keep]

    # undersample data
    if agg_level in undersampling_dict.keys() and HIGH_UNDERSAMPLING:
        undersampling_pct = undersampling_dict[agg_level]
        features_train, _, targets_train, _ = train_test_split(features_train, targets_train, train_size = undersampling_pct, shuffle=True, random_state=43)

    # normalise targets; keep them a 1-d Series so grid_search and lgb.Dataset accept them
    scaler = None
    if normalize:
        logger.info('scaling targets')
        scaler = StandardScaler()
        targets_train = pd.Series(
            scaler.fit_transform(targets_train.to_numpy().reshape(-1,1)).ravel(),
            index=targets_train.index,
        )

    pred_ax = None
    if PLOT_PREDICTIONS:
        _, pred_ax = plt.subplots(1,1, figsize = (10,5))
        x_positions = list(range(len(df_pred)))

    # train model for all quantiles
    res: list = []
    for quantile in QUANTILES:

        if do_grid_search:
            # perform grid search for best parameters
            logger.info('perform gridsearch')
            # work on copies, the global parameter dicts are left untouched
            search_params = dict(params, alpha=quantile)
            eval_fig, eval_ax = None, None
            if PLOT_EVAL:
                eval_fig, eval_ax = plt.subplots(1,1,figsize=(10,5))
                eval_ax.set_title(f'LightGBM Out-of-Sample Pinball-Loss - {agg_level.capitalize()} - q: {quantile}')
            best_combination, _ = grid_search(search_params, param_grid, features_train, targets_train, n_folds = 1, fig = eval_fig, ax = eval_ax)
            params_grid_train = best_combination["params"]
            logger.info(f'q: {quantile} - cv best params: {params_grid_train}')

            if PLOT_EVAL:
                include_names = '_'.join(include_prefix_list) if include_prefix_list else 'None'
                eval_ax.set_xlabel('Number of Trained Trees')
                eval_ax.set_ylabel('Log(Pinball-loss)')
                eval_ax.legend()
                eval_ax.grid()
                eval_fig.tight_layout()
                eval_fig.savefig('../figure/results/' + f'training_iteration_f_{sub_d_start}_include_{include_names}_q={quantile}.png', dpi=300)
                plt.show()
                plt.close(eval_fig)
        else:
            params_grid_train = dict(PARAM_GRID_TRAIN, alpha=quantile)

        # train_best_model
        mod = lgb.train(params_grid_train,
            train_set = lgb.Dataset(features_train, targets_train)
        )

        # save model
        model_path = fold_path + 'models/' + f'lgb_{type_of}_nt_{group_names}_{feature_tag}_q={quantile}.joblib'
        joblib.dump(mod, model_path)

        # make predictions (scaled back to the original target scale if needed)
        predictions = mod.predict(features_predict)
        if scaler is not None:
            predictions = scaler.inverse_transform(predictions.reshape(-1,1)).reshape(-1,)

        if PLOT_PREDICTIONS:
            pred_ax.plot(x_positions, predictions, label = f'{quantile}')

        # store predictions
        df_p = pd.DataFrame(
            {
                'pred': predictions,
                'd': df_pred['d'],
            }
        )
        df_p['quantile'] = quantile
        df_p['Level'] = agg_level
        df_p['type_of'] = 'validation' if type_of == 'val' else 'evaluation'
        if len(agg_columns) == 0:
            df_p['agg_column1'] = 'Total'
            df_p['agg_column2'] = 'X'
        elif len(agg_columns) == 1:
            df_p['agg_column1'] = df_pred[agg_columns[0]].values
            df_p['agg_column2'] = 'X'
        else:
            df_p['agg_column1'] = df_pred[agg_columns[0]].values
            df_p['agg_column2'] = df_pred[agg_columns[1]].values

        df_p = df_p[['Level', 'agg_column1', 'agg_column2', 'd', 'quantile', 'pred', 'type_of']]
        res.append(_down_cast(df_p))

    if PLOT_PREDICTIONS:
        plt.show()

    # storing predictions in specified file + folder
    df_sub_val = pd.concat(res)
    if store_submissions_path:
        file_path = fold_path + store_submissions_path + f'lgb_{type_of}_nt_{group_names}_{feature_tag}.csv'
        df_sub_val.to_csv(file_path, index = False)
        logger.info('saved under: ' + file_path)

    return df_sub_val

In [9]:
# Every experiment compares several feature sets: each entry of
# 'INCLUDE_COLUMNS_LIST' is one feature set, and 'BASE' holds the columns
# that are prepended to every one of them before training.
EXPERIMENTS_DICT = {
    'seasonal': {
        'BASE': [],
        'INCLUDE_COLUMNS_LIST': [
            ['auto_sold_ewm'],
            ['seasonal_weekday', 'auto_sold_ewm'],
            ['seasonal_monthday', 'auto_sold_ewm'],
            ['seasonal_weekday', 'seasonal_monthday', 'auto_sold_ewm'],
            ['seasonal', 'auto_sold_ewm'],
        ],
    },
    'state vs. store': {
        'BASE': ['seasonal', 'auto_sold_ma'],
        'INCLUDE_COLUMNS_LIST': [
            [],
            ['state_id'],
            ['store_id'],
            ['state_id', 'store_id'],
        ],
    },
    'ewm vs. ma': {
        'BASE': ['seasonal'],
        'INCLUDE_COLUMNS_LIST': [
            ['auto_sold_ewm'],
            ['auto_sold_ma'],
            ['auto_sold_ewm', 'auto_sold_ma'],
        ],
    },
    'quantiles vs. std': {
        'BASE': ['seasonal', 'auto_sold_ma'],
        'INCLUDE_COLUMNS_LIST': [
            [],
            ['auto_sold_qtile'],
            ['auto_sold_std'],
            ['auto_sold_qtile', 'auto_sold_std'],
        ],
    },
    'price auto/momentum': {
        'BASE': ['seasonal', 'auto_sold_ma'],
        'INCLUDE_COLUMNS_LIST': [
            [],
            ['price_auto_std'],
            ['price_momentum'],
            ['price_uncond'],
            ['price_auto_std', 'price_momentum'],
            ['price_auto_std', 'price_momentum', 'price_uncond'],
        ],
    },
    'best models': {
        'BASE': ['seasonal'],
        'INCLUDE_COLUMNS_LIST': [
            ['auto_sold_ma', 'state_id', 'store_id'],
            ['auto_sold_ma', 'auto_sold_std', 'state_id', 'store_id'],
        ],
    },
    'full vs. sparse ma': {
        'BASE': ['seasonal'],
        'INCLUDE_COLUMNS_LIST': [
            ['auto_sold_ma', 'auto_sold_std', 'auto_sold_qtile', 'auto_sold_ewm', 'state_id', 'store_id'],
            ['auto_sold_ma_28', 'auto_sold_ma_56', 'auto_sold_ma_168', 'state_id', 'store_id'],
        ],
    },
    'sparse vs. kbest': {
        'BASE': ['seasonal', 'state_id', 'store_id'],
        'INCLUDE_COLUMNS_LIST': [
            ['auto_sold', 'price', 'kbest'],
            [
                'auto_sold_ewm_112', 'auto_sold_ewm_28',
                'auto_sold_qtile_28_0.5', 'auto_sold_ma_28',
                'auto_sold_qtile_28_0.9',
            ],
        ],
    },
    'full vs. sparse': {
        'BASE': ['seasonal'],
        'INCLUDE_COLUMNS_LIST': [
            ['auto_sold_ma', 'auto_sold_std', 'auto_sold_qtile', 'auto_sold_ewm', 'state_id', 'store_id'],
            ['auto_sold_ma_28', 'auto_sold_ma_56', 'auto_sold_ma_168', 'state_id', 'store_id'],
            [
                'auto_sold_std_3', 'auto_sold_std_56', 'auto_sold_std_168',
                'auto_sold_ma_7', 'auto_sold_ma_28', 'auto_sold_ma_56',
                'auto_sold_qtile_28_0.25', 'auto_sold_qtile_168_0.25', 'auto_sold_qtile_56_0.1',
                'state_id', 'store_id',
            ],
        ],
    },
}
# show the names of the available experiments
list(EXPERIMENTS_DICT)

['seasonal',
 'state vs. store',
 'ewm vs. ma',
 'quantiles vs. std',
 'price auto/momentum',
 'best models',
 'full vs. sparse ma',
 'sparse vs. kbest',
 'full vs. sparse']

In [None]:
# ----- switches read by train_level_all_quantiles / grid_search -----
USE_ALL = False             # train on all feature columns
SPARSE_FEATURES = None      # explicit list of feature columns to train on (None = not used)
PLOT_PREDICTIONS = False    # plot the predictions of every quantile
DO_GRID_SEARCH = False      # tune the hyperparameters per quantile
PLOT_EVAL = False           # plot the validation curves of the grid search

# fraction of the training rows that is kept per aggregation level
undersampling_dict = {
    'Level10': .1, #.001
    'Level11': .1, #.0001
    'Level12': .1 #.00001
}
HIGH_UNDERSAMPLING = True

# slice of the aggregation levels to train: levels[TEST_NUMB:TEST_NUMBER]
TEST_NUMBER = 9 # 9
TEST_NUMB = 0 # 0

# parameters used when no grid search is performed
PARAM_GRID_TRAIN = {
    'objective': 'quantile',
    'boosting_type': 'gbdt',
    'random_state': 43,
    'verbose': -1,
    'n_jobs': 4,
    "num_leaves": 10, # 30
    "hist_pool_size": 300,
    "learning_rate": .01, # .01
    "n_estimators": 1000, #1000
    "max_depth": 10, #10
}
# NOTE(review): defined but not referenced anywhere in the visible notebook
PARAM_GRID_TRAIN_HIGH_LEVEL = {
    'objective': 'quantile',
    # 'metric': 'quantile', # Use Root Mean Squared Error (RMSE) as the evaluation metric
    'boosting_type': 'gbdt',
    'random_state': 43,
    'verbose': -1,
    'n_jobs': 4,
    "num_leaves": 30,
    "hist_pool_size": 300,
    # 'feature_fraction': 0.9, #.5
    # 'bagging_fraction': .8,
    "learning_rate": .01, # .01 always
    "n_estimators": 3000, # 3000 always
    "max_depth": 10,
}

# fixed parameters of the grid search
# ('verbose_eval' was removed: it is an argument of lgb.train, not a model parameter)
params = {
    'objective': 'quantile',
    # 'metric': 'quantile', # Use Root Mean Squared Error (RMSE) as the evaluation metric
    'boosting_type': 'gbdt',
    'random_state': 43,
    'verbose': -100,
    'n_jobs': 4,
    # 'eval_at': 10,
    'hist_pool_size': 300,
}
param_grid = {
    'max_depth': [10,],
    'n_estimators': [3000],#[200, 500, 1000, 2000],
    # 'min_child_samples': [4],
    # 'min_child_weight': [0,0.1],
    'num_leaves': [30], #[10, 30, 90]
    'learning_rate': [0.001, 0.005, 0.01, 0.02]# [.001, .005, .01, .02],#[0.04, 0.07, 0.1],   # 0.02, 0.03,        
    # 'subsample': [ 0.9, 1 ],
    # 'subsample_freq': [1],
}

# ALL_PREFIXES = ['auto_sold', 'auto_sold_ma', 'auto_sold_std', 'auto_sold_ewm', 'auto_sold_qtile',
#     'price_momentum', 'price_uncond', 'price_auto_std','seasonal_', 'state_', 'store_',
# ]

# experiment to run (any key of EXPERIMENTS_DICT, e.g. 'seasonal')
experiment_name = 'full vs. sparse'
experiment_specs = EXPERIMENTS_DICT[experiment_name]
BASE = experiment_specs['BASE']
# every feature set of the experiment is extended with the BASE columns
INCLUDE_COLUMNS_LIST = [BASE + i for i in experiment_specs['INCLUDE_COLUMNS_LIST']]
EXCLUDE_COLUMNS_LIST = ()

# first all exclude-based feature sets, then all include-based ones;
# for each of them train every selected aggregation level in every fold
for mode, column_sets in (('EXCLUDE', EXCLUDE_COLUMNS_LIST), ('INCLUDE', INCLUDE_COLUMNS_LIST)):
    if mode == 'INCLUDE':
        logger.info('---------------------------------')
    logger.info(f'starting with all {mode}_COLUMNS')
    for column_set in column_sets: # for each specified feature combination
        logger.info(f'{mode.capitalize()} columns: {str(column_set)}')
        for sub_d_start in D_CROSS_VAL_START_LIST:
            for agg_level in list(AGG_LEVEL_COLUMNS.keys())[TEST_NUMB:TEST_NUMBER]: # for each aggregation level
                logger.info(f'starting with agg_level: {agg_level}')
                train_level_all_quantiles(
                    agg_level,
                    sub_d_start=sub_d_start,
                    type_of='val', 
                    exclude_columns=column_set if mode == 'EXCLUDE' else None,
                    include_columns=column_set if mode == 'INCLUDE' else None,
                    do_grid_search=DO_GRID_SEARCH,
                    store_submissions_path=None#'temp_submissions/',
                )
    logger.info(f'finished all {mode}_COLUMNS')

### Load val + eval prediction files and merge to one submission file

In [11]:
def read_concat_predictions(fold_name: int, exclude_columns: list = [], include_columns: list = [], sparse = False, use_all = False, load_submissions_path: str = 'temp_submissions/'):
    """ 
    For specified fold, read the predictions for all aggregation levels 
    and stack them together in one dataframe.

    The file names mirror the ones written by train_level_all_quantiles:
    'lgb_val_nt_{group}_{tag}.csv' where tag is 'use_all', 'include_k_best',
    'sparse', 'exclude_{names}' (when include_columns is None) or
    'include_{names}'; an empty column list is written as 'None'.

    fold_name: fold start day, must be in constants.D_CROSS_VAL_START_LIST
    exclude_columns / include_columns: the column lists the models were trained with
        (neither list is mutated, so the shared default lists are harmless)
    sparse / use_all: read the predictions of the sparse / use-all models
    load_submissions_path: sub folder of the fold directory that holds the csv files
    """
    if fold_name not in constants.D_CROSS_VAL_START_LIST:
        raise ValueError('fold_name must be a value in D_CV_START_LIST')

    # the part of the file name that identifies the feature set; the column
    # names are joined exactly once (joining twice split them into characters)
    if use_all:
        feature_tag = 'use_all'
    elif include_columns is not None and 'kbest' in include_columns:
        feature_tag = 'include_k_best'
    elif sparse:
        feature_tag = 'sparse'
    elif include_columns is None:
        feature_tag = 'exclude_' + ('_'.join(exclude_columns) if exclude_columns else 'None')
    else:
        feature_tag = 'include_' + ('_'.join(include_columns) if include_columns else 'None')

    base_path = f'../data/uncertainty/fold_{str(fold_name)}/' + load_submissions_path
    logger.info('loading files under path:' + base_path)

    # one prediction file per aggregation level
    file_paths: list = []
    for level in list(AGG_LEVEL_COLUMNS.keys())[TEST_NUMB:TEST_NUMBER]:
        group_names = '_'.join(AGG_LEVEL_COLUMNS[level]) or 'Total_X'
        file_paths.append(base_path + f'lgb_val_nt_{group_names}_{feature_tag}.csv')

    return ensemble_submissions_uncertainty(file_paths)

### For validation predictions, we can compute the WSPL (the metric imported from `utils.metrics`) locally

In [12]:
# these variables are used later on
FORCE_RELOAD = False
try:
    # simple code to check if variable exists
    d_int + 1
    if FORCE_RELOAD:
        raise Exception()
except:
    # if not, load again
    # takes about 2-3 minutes to reload and parse
    # not the most beautiful method but it works
    d = pd.read_parquet('../data/uncertainty/cv_template/temp.parquet')
    try:
        d_int = pd.read_parquet('../data/uncertainty/cv_template/temp_d_int.parquet')['d_int']
    except:
        d_int = d['d'].apply(lambda x: int(x.split('_')[1]))
        d_int.to_frame('d_int').to_parquet('../data/uncertainty/cv_template/temp_d_int.parquet', index = False)

In [13]:
def perform_cv(df: pd.DataFrame, df_sub: pd.DataFrame, d_cv_start: int = None):
    """
    Merge the predictions into the cross-validation template and score them with WSPL.

    df: cross-validation template; merged on the columns 'id_merge' and 'd' and
        expected to hold the columns 'sold' and 'revenue'
    df_sub: predictions with an 'id' column of the form '<series id>.<quantile>_<suffix>'
        (e.g. the quantile 0.005 is read from '....0.005_validation');
        note that the helper columns 'id_merge' and 'quantile' are added to it in place
    d_cv_start: first day of the fold to score; defaults to the global D_CV_START
        so that existing two-argument calls keep working

    Returns whatever WSPL returns for the 500 day labels starting at the fold start.
    """
    if d_cv_start is None:
        d_cv_start = D_CV_START

    def series_id(submission_id: str) -> str:
        # everything before the first '.' identifies the series
        return submission_id.split('.')[0]

    def quantile_of(submission_id: str) -> float:
        # the last two '.'-separated parts hold the quantile: '<int part>.<decimals>_<suffix>'
        parts = submission_id.split('.')
        return float(parts[-2] + '.' + parts[-1].split('_')[0])

    # to be able to merge
    df_sub['id_merge'] = df_sub['id'].apply(series_id)
    df_sub['quantile'] = df_sub['id'].apply(quantile_of)

    # merge predictions in cv template
    p = pd.merge(
        df,
        df_sub,
        how='left',
        on=['id_merge', 'd']
    )
    p['id_merge'] = p['id_merge'].astype(str)

    for c in ['sold', 'revenue']:
        p[c] = p[c].astype(np.float32)

    return WSPL(p, [f'd_{i}' for i in range(d_cv_start, d_cv_start + 500)])

In [14]:
# show the experiment names that can be assigned to `experiment_name` below
EXPERIMENTS_DICT.keys()

dict_keys(['seasonal', 'state vs. store', 'ewm vs. ma', 'quantiles vs. std', 'price auto/momentum', 'best models', 'full vs. sparse ma', 'sparse vs. kbest', 'full vs. sparse'])

In [None]:
from utils.utils import load_results_as_json, store_results_as_json

FILE_NAME_ALL_RESULTS = '../data/uncertainty/all_results.json'
USE_ALL = False
FOLDER = 'temp_submissions/'

# slice of the aggregation levels to evaluate: levels[TEST_NUMB:TEST_NUMBER]
TEST_NUMB = 0
TEST_NUMBER = 9

# experiment to evaluate (any key of EXPERIMENTS_DICT)
experiment_name = 'price auto/momentum'
experiment_specs = EXPERIMENTS_DICT[experiment_name]
BASE = experiment_specs['BASE']
# every feature set of the experiment is extended with the BASE columns
INCLUDE_COLUMNS_LIST = [BASE + i for i in experiment_specs['INCLUDE_COLUMNS_LIST']]
EXCLUDE_COLUMNS_LIST = []

# load dict to store results in
results = load_results_as_json(FILE_NAME_ALL_RESULTS)

# downcast the template once instead of once per fold
cv_template = _down_cast(d)

# first all exclude-based feature sets, then all include-based ones
for kind, column_sets in (('exclude', EXCLUDE_COLUMNS_LIST), ('include', INCLUDE_COLUMNS_LIST)):
    logger.info(f'start evaluating {kind} columns')
    for column_set in column_sets:
        # the kbest models are stored under the name 'k_best'
        if kind == 'include' and 'kbest' in column_set:
            column_set = ['k_best']
        result_key = f'{kind}_' + ' '.join(column_set)
        fold_results = results.setdefault(result_key, {})
        logger.info('--------------- ' + str(column_set) + ' ---------------')

        res = []
        # D_CV_START stays a global loop variable because perform_cv reads it
        for D_CV_START in D_CROSS_VAL_START_LIST:
            mean_wspl, res_dict = perform_cv(
                cv_template[d_int < (D_CV_START + DAYS)], 
                read_concat_predictions(
                    fold_name = D_CV_START, 
                    exclude_columns = column_set if kind == 'exclude' else [],
                    include_columns = column_set if kind == 'include' else None,
                    use_all=USE_ALL,
                    load_submissions_path=FOLDER
                )
            )
            res.append(mean_wspl)
            fold_results['fold_' + str(D_CV_START)] = res_dict 
            logger.info(str(D_CV_START) + ' - wspl: ' + str(mean_wspl))

        # summary over all folds (labelled with the feature set, not with the last fold)
        logger.info(result_key + ' - mean wspl: ' + str(np.mean(res)) + ' +/- ' + str(np.std(res)))
        logger.info(result_key + ' - raw results: ' + str(res))

store_results_as_json(results, FILE_NAME_ALL_RESULTS)