In [None]:
import numpy as np
import pandas as pd
import pickle

## Params

In [None]:
# Switch between the two prediction periods this notebook can prepare.
IS_EVAL = False
# Root of the data tree: interim input is read from it, refined output is written to it.
DATA_PATH = '../data/'
# Columns that identify one aggregated row in the top-level frame.
TOP_LEVEL_KEYS = ['store_id', 'dept_id', 'd']

# PERIOD_LABEL goes into the output file name.
# START_PRED is the first value of 'd' whose target gets blanked out;
# END_PRED is the last value of 'd' kept in the data.
PERIOD_LABEL, START_PRED, END_PRED = (
    ('evaluation', 1942, 1969) if IS_EVAL else ('validation', 1914, 1941)
)

## Load interim data

In [None]:
# Load the interim grid from its pickle file.
# Unpickling can execute arbitrary code, so only read files from a trusted source.
grid_df = pd.read_pickle(f'{DATA_PATH}interim/grid_df.pkl')

## Restrict data to the selected period

In [None]:
# Keep only rows whose 'd' is at most END_PRED; later rows are dropped.
grid_df = grid_df[grid_df['d'].le(END_PRED)]

## Create features

In [None]:
# Build a single 'snap' column from the three per-state columns:
# start from snap_CA, then take snap_TX on TX rows and snap_WI on WI rows.
# Rows of any other state keep the snap_CA value. The result is cast to int.
grid_df['snap'] = (
    grid_df['snap_CA']
    .mask(grid_df['state_id'] == 'TX', grid_df['snap_TX'])
    .mask(grid_df['state_id'] == 'WI', grid_df['snap_WI'])
    .astype(int)
)

In [None]:
# Day of month taken from 'date'; the values (1-31) fit in an 8-bit integer.
grid_df['dom'] = grid_df['date'].dt.day.astype('int8')

## Calculate top-level sales in Prophet format

In [None]:
# Collapse the grid to one row per TOP_LEVEL_KEYS combination, using the
# column names Prophet expects ('ds' for the date, 'y' for the target).
# Sales are summed; every other column takes the maximum within the group.
# NOTE(review): if the key columns are categorical, groupby may emit unobserved
# combinations — confirm dtypes and consider observed=True.
prophet_df = (
    grid_df
    .groupby(TOP_LEVEL_KEYS)
    .agg(
        ds=pd.NamedAgg(column='date', aggfunc='max'),
        y=pd.NamedAgg(column='sales', aggfunc='sum'),
        snap=pd.NamedAgg(column='snap', aggfunc='max'),
        price=pd.NamedAgg(column='sell_price', aggfunc='max'),
        dom=pd.NamedAgg(column='dom', aggfunc='max'),
    )
    .reset_index()
)

# Blank out the target from START_PRED onwards so the prediction period carries no sales value.
prophet_df.loc[prophet_df['d'].ge(START_PRED), 'y'] = np.nan

## Save as refined data

In [None]:
# Pickle is used so the column dtypes are preserved; the file name carries the period label.
prophet_df.to_pickle(f'{DATA_PATH}refined/prophet_df_{PERIOD_LABEL}.pkl')
# Print a summary of the saved frame.
prophet_df.info()