## Requirements

In [1]:
import numpy as np
import pandas as pd
import pickle

from math import ceil

## Params

In [None]:
# Rows whose day index `d` is below this value are discarded before
# feature engineering (a value <= 0 keeps every row).
START_DAY = 800

# Root folder holding the interim / refined data sets.
DATA_PATH = "../data/"

# Columns identifying the aggregated ("top") level of the hierarchy.
TOP_LEVEL_KEYS = [
    "store_id",
    "dept_id",
    "d",
]

## Load interim data

In [None]:
# Load the interim grid produced by an earlier step of the pipeline.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
interim_pickle = DATA_PATH + 'interim/grid_df.pkl'
grid_df = pd.read_pickle(interim_pickle)

## Reduce data

In [None]:
# Keep only the rows from START_DAY onwards; a non-positive START_DAY
# leaves the frame untouched.
if START_DAY > 0:
    recent_rows = grid_df['d'] >= START_DAY
    grid_df = grid_df[recent_rows]

## Rework calendar features

In [None]:
# Rebuild the calendar features from the `date` column.
# The raw calendar columns are dropped first; some of them (month, year) are
# recreated below from `date` with explicit compact integer dtypes.
grid_df.drop(columns=['wm_yr_wk', 'weekday', 'wday', 'month', 'year'], inplace=True)

# Features derived directly from the date
grid_df['dow'] = grid_df['date'].dt.dayofweek.astype(np.int8)  # Monday=0 ... Sunday=6
grid_df['dom'] = grid_df['date'].dt.day.astype(np.int8)
grid_df['month'] = grid_df['date'].dt.month.astype(np.int8)
# `Series.dt.week` is deprecated since pandas 1.1 and removed in pandas 2.0:
# use the ISO calendar week when available, falling back on older pandas.
if hasattr(grid_df['date'].dt, 'isocalendar'):
    grid_df['week'] = grid_df['date'].dt.isocalendar().week.astype(np.int8)
else:
    grid_df['week'] = grid_df['date'].dt.week.astype(np.int8)
# Week of month, i.e. ceil(dom / 7), computed with vectorised integer
# arithmetic instead of a row-by-row Python `apply`.
grid_df['wom'] = ((grid_df['dom'] + 6) // 7).astype(np.int8)
grid_df['quarter'] = grid_df['date'].dt.quarter.astype(np.int8)
grid_df['year'] = grid_df['date'].dt.year.astype(np.int16)

# Other derived features
# Saturday (5) and Sunday (6) are flagged as week-end.
grid_df['is_week_end'] = (grid_df['dow'] >= 5).astype('category')
# 1-based position of each row within its `id` group, in current row order
# (presumably rows are sorted by day within each id -- TODO confirm).
grid_df['age'] = (grid_df.groupby('id').cumcount() + 1).astype(np.int16)

# The raw date is no longer needed once the features are extracted
grid_df.drop(columns=['date'], inplace=True)

## Calculate top level sales

In [None]:
# Total sales per TOP_LEVEL_KEYS combination.
top_level_sales = (
    grid_df
    .groupby(TOP_LEVEL_KEYS)['sales']
    .sum()
    .rename('top_sales')
    .reset_index()
)

# pandas sums an all-NaN group to 0 rather than NaN, so the totals for days
# after 1941 are reset to NaN explicitly
# (presumably those days have no known sales yet -- TODO confirm).
after_last_day = top_level_sales['d'] > 1941
top_level_sales.loc[after_last_day, 'top_sales'] = np.nan

## Deduce low level ratio

In [None]:
# Attach the aggregated `top_sales` to every row sharing the same
# TOP_LEVEL_KEYS (default inner join).
grid_df = grid_df.merge(top_level_sales, on=TOP_LEVEL_KEYS)

In [None]:
# Share of the top-level sales contributed by each row.
# Start from `top_sales` itself so that rows with a zero or NaN total keep
# that value (0 or NaN) and no division by zero result is ever stored.
grid_df['sales_ratio'] = grid_df['top_sales']

# Only rows with a strictly positive total get an actual ratio.
positive_total = grid_df['sales_ratio'] > 0
row_share = grid_df['sales'] / grid_df['top_sales']
grid_df.loc[positive_total, 'sales_ratio'] = row_share

## Keep only useful features

In [None]:
# Remove the event columns and the intermediate `top_sales` total; they are
# not part of the refined data set.
dropped_columns = [
    'event_name_1',
    'event_type_1',
    'event_name_2',
    'event_type_2',
    'top_sales',
]
grid_df.drop(columns=dropped_columns, inplace=True)

## Save as refined data

In [None]:
# Pickle is used so that the column dtypes survive the round trip.
refined_pickle = DATA_PATH + 'refined/top_down_df.pkl'
grid_df.to_pickle(refined_pickle)

# Print a summary of the frame that was just saved.
grid_df.info()