In [3]:
import pandas as pd
import numpy as np
from downcast import reduce
import pickle
import warnings
from catboost import CatBoostRegressor
from typing import Union
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [109]:
sales = pd.read_pickle("sales_ad.pkl")
cal = pd.read_pickle("cal_ad.pkl")
price = pd.read_pickle("prices_ad.pkl")

In [4]:
#Performance metric 

class WRMSSEEvaluator(object):

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # for lv1 aggregation

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[zip(self.train_df.item_id, self.train_df.store_id), :].reset_index(drop=True)
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> float:
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            all_scores.append(lv_scores.sum())

        return np.mean(all_scores)

### Function 1 

In [117]:
def func_1(data_point):
    if type(data_point) != pd.core.frame.DataFrame:
        data_point = pd.DataFrame(data_point,columns = sales.columns)
    
    d_cols = [d for d in sales.columns if 'd_' in d]
    data = data_point.drop(d_cols[:1883],axis = 1)
    
    #making cols for days 1942-69 amd filling it with zero
    for day in range(1942,1970):
        data['d_' + str(day)] = 0
        data['d_' + str(day)] = data['d_' + str(day)].astype(np.int16)
    
    pre_data = pd.melt(data, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                      var_name='d', value_name='sales')
    
    #combining the dataset 
    pre_data = pd.merge(pre_data, cal, on='d', how='left')
    pre_data = pd.merge(pre_data, price, on=['store_id','item_id','wm_yr_wk'], how='left')
    
    #fil the missing sell price values by mean imputaion
    pre_data["sell_price"].fillna(pre_data.groupby("id")["sell_price"].transform("mean"), inplace=True)
    
    pre_data.drop(columns=["date","weekday"], inplace=True)
    pre_data['d'] = pre_data['d'].apply(lambda a: a.split('_')[1]).astype(np.int16)

    #calculating lags feature
    lags = [1,2,3,5,7,14,21,28]
    for lag in lags:
        pre_data["lag_" + str(lag)] = pre_data.groupby("id")["sales"].shift(lag).astype(np.float16)

    #calculating rolling features
    pre_data['rolling_mean_10'] = pre_data.groupby("id")['sales'].transform(lambda x: x.rolling(10).mean())
    pre_data['rolling_mean_20'] = pre_data.groupby("id")['sales'].transform(lambda x: x.rolling(20).mean())
    pre_data['rolling_mean_30'] = pre_data.groupby("id")['sales'].transform(lambda x: x.rolling(30).mean())

    #for query data points keeping lag and rolling features same as previous 28 days lag and rolling features 
    pre_data.iloc[1941:,-11:] = pre_data.iloc[1913:1941,-11:].values
    pre_data = pre_data[pre_data['d'] >= 1942]
    pre_data.drop(['id', 'd','sales', 'wm_yr_wk'],axis = 1,inplace = True)

    cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1','event_type_1', 'event_name_2', 'event_type_2']

    for i in cat_cols:
        if i in sales.columns:
            dict_id = dict(enumerate(sales[i].cat.categories))
        else:
            dict_id = dict(enumerate(cal[i].cat.categories))
        
        keys = list(dict_id.keys())
        values = list(dict_id.values())
        og_values = list(pre_data[i].unique())
        replace_values = []
        for j in og_values:
            if j == 'No_event':
                replace_values.append(-1)
            else:
                replace_values.append(keys[values.index(j)])

        pre_data[i].replace(og_values,replace_values,inplace = True)        
            
    model_file = open('cgb.pkl', 'rb')
    model = pickle.load(model_file)

    forecast_values = model.predict(pre_data)

    return forecast_values

#### For a single  point as DataFrame

In [110]:
#picking a random data point
data_point = sales.sample()
data_point

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
1160,HOUSEHOLD_2_064_CA_1_evaluation,HOUSEHOLD_2_064,HOUSEHOLD_2,HOUSEHOLD,CA_1,CA,0,3,0,0,...,1,1,2,0,0,0,1,2,1,0


In [118]:
func_1(data_point)

array([0.64858867, 0.51179078, 0.36817153, 0.36198082, 0.3986266 ,
       0.49778588, 0.38601405, 0.13291047, 0.11331706, 0.11591272,
       0.10329076, 0.11125815, 0.14786973, 0.14553241, 0.09866009,
       0.09450613, 0.08564963, 0.08564963, 0.08946307, 0.12483343,
       0.12297282, 0.07890002, 0.08305397, 0.08305397, 0.07890002,
       0.08686741, 0.12067947, 0.11881887])

#### For a single  point as numpy array

In [119]:
data_point = sales.sample().values
data_point

array([['FOODS_2_366_CA_4_evaluation', 'FOODS_2_366', 'FOODS_2', ..., 0,
        2, 1]], dtype=object)

In [120]:
func_1(data_point)

array([0.40268731, 0.31032722, 0.32113162, 0.29916445, 0.29271972,
       0.36386782, 0.33534181, 0.26243998, 0.1132565 , 0.09625875,
       0.08740224, 0.09536964, 0.12965842, 0.13195177, 0.08740224,
       0.08324829, 0.08740224, 0.08740224, 0.09121568, 0.12658604,
       0.12472544, 0.08065263, 0.08480659, 0.08480659, 0.08065263,
       0.08862002, 0.12243209, 0.12057148])

#### For multiple points as numpy array

In [121]:
#taking 5 random points

data_points = sales.sample(5).values
data_points

array([['HOUSEHOLD_2_260_WI_2_evaluation', 'HOUSEHOLD_2_260',
        'HOUSEHOLD_2', ..., 0, 1, 0],
       ['HOBBIES_1_178_CA_2_evaluation', 'HOBBIES_1_178', 'HOBBIES_1',
        ..., 0, 1, 2],
       ['HOBBIES_1_255_WI_2_evaluation', 'HOBBIES_1_255', 'HOBBIES_1',
        ..., 0, 1, 0],
       ['FOODS_3_205_CA_2_evaluation', 'FOODS_3_205', 'FOODS_3', ..., 0,
        1, 3],
       ['HOBBIES_1_011_TX_3_evaluation', 'HOBBIES_1_011', 'HOBBIES_1',
        ..., 0, 0, 1]], dtype=object)

In [125]:
pred_array = func_1(data_points)
pred_array = np.reshape(pred_array, (-1, 28),order = 'F')
for i in range(len(pred_array)):
    print(f"Results of data point {i+1} is \n{pred_array[i]}")

Results of data point 1 is 
[0.32184707 0.30285269 0.33149263 0.30774923 0.27452983 0.27434114
 0.1583762  0.11150389 0.09654115 0.09913681 0.10329076 0.10662748
 0.14786973 0.14553241 0.08564963 0.08149567 0.08564963 0.08564963
 0.08946307 0.12483343 0.12297282 0.07890002 0.08305397 0.08305397
 0.07890002 0.08686741 0.12067947 0.11881887]
Results of data point 2 is 
[2.54046498 1.58383697 1.37242342 1.0724422  1.04058291 1.15509363
 0.80415295 0.66389606 0.57562575 0.42986432 0.43401827 0.34892415
 0.32507672 0.22247847 0.17097548 0.14541494 0.13655844 0.08307117
 0.0868846  0.12225496 0.12039436 0.07632155 0.08047551 0.08047551
 0.07632155 0.08428894 0.11810101 0.1162404 ]
Results of data point 3 is 
[0.41827338 0.38673357 0.40310028 0.24996947 0.11873901 0.18173156
 0.15764393 0.11540229 0.09580888 0.09377386 0.09792782 0.10589521
 0.14713746 0.12483621 0.08491736 0.0807634  0.08491736 0.08491736
 0.08873079 0.12410116 0.12224055 0.07816774 0.0823217  0.0823217
 0.07816774 0.0861351

### Function 2

In [133]:
def func_2(data_point):
    if type(data_point) != pd.core.frame.DataFrame:
        data_point = pd.DataFrame(data_point,columns = sales.columns)
    
    #getting previous 30 days data also for calculating lags and rolling feature 
    d_cols = [d for d in sales.columns if 'd_' in d]
    data = data_point.drop(d_cols[:1883],axis = 1)
    
    #Converting from wide form to long form
    pre_data = pd.melt(data, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
                      var_name='d', value_name='sales')
    
    #combining the dataset 
    pre_data = pd.merge(pre_data, cal, on='d', how='left')
    pre_data = pd.merge(pre_data, price, on=['store_id','item_id','wm_yr_wk'], how='left')
    
    #fil the missing sell price values by mean imputaion
    pre_data["sell_price"].fillna(pre_data.groupby("id")["sell_price"].transform("mean"), inplace=True)
    
    pre_data.drop(columns=["date","weekday"], inplace=True)
    pre_data['d'] = pre_data['d'].apply(lambda a: a.split('_')[1]).astype(np.int16)

    #calculating lags feature
    lags = [1,2,3,5,7,14,21,28]
    for lag in lags:
        pre_data["lag_" + str(lag)] = pre_data.groupby("id")["sales"].shift(lag).astype(np.float16)

    #calculating rolling features
    pre_data['rolling_mean_10'] = pre_data.groupby("id")['sales'].transform(lambda x: x.rolling(10).mean())
    pre_data['rolling_mean_20'] = pre_data.groupby("id")['sales'].transform(lambda x: x.rolling(20).mean())
    pre_data['rolling_mean_30'] = pre_data.groupby("id")['sales'].transform(lambda x: x.rolling(30).mean())

    pre_data = pre_data[pre_data['d'] >= 1914]
    pre_data.drop(['id', 'd','sales', 'wm_yr_wk'],axis = 1,inplace = True)

    cat_cols = ['item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'event_name_1','event_type_1', 'event_name_2', 'event_type_2']

    for i in cat_cols:
        if i in sales.columns:
            dict_id = dict(enumerate(sales[i].cat.categories))
        else:
            dict_id = dict(enumerate(cal[i].cat.categories))
        
        keys = list(dict_id.keys())
        values = list(dict_id.values())
        og_values = list(pre_data[i].unique())
        replace_values = []
        for j in og_values:
            if j == 'No_event':
                replace_values.append(-1)
            else:
                replace_values.append(keys[values.index(j)])

        pre_data[i].replace(og_values,replace_values,inplace = True)        
            
    model_file = open('cgb.pkl', 'rb')
    model = pickle.load(model_file)

    fore_array = model.predict(pre_data)
    fore_array = np.reshape(fore_array, (-1, 28),order = 'F')
    preds_val = pd.DataFrame(fore_array,columns = d_cols[-28:])
    train_df = sales.iloc[:,:-28]
    val_df = sales.iloc[:,-28:]
    evaluator = WRMSSEEvaluator(train_df, val_df, cal, price)
    score = evaluator.score(preds_val)
    
    return score

In [134]:
#passing data of all items of all stores because the performance metric is based on hierarchical forecasting scoring.
#it aggregates scores of all heirarchical levels 

final_metric = func_2(sales.values) 
print(f"The final metric: {final_metric}")

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:35<00:00,  2.95s/it]


The final metric: 0.6710504162888493
