In [1]:
import os
import sys
import math
import logging
from pathlib import Path

import numpy as np
import scipy as sp

# Reload imported modules automatically before each cell executes, so edits to
# the project package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import matplotlib as mpl
import matplotlib.pyplot as plt
# Render figures inline in the notebook, using the high-resolution 'retina' format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import seaborn as sns
# Order matters: sns.set() re-applies the default plotting context, so it has to
# run first. Calling set_context("poster") before it (as this cell used to do)
# meant the poster scaling was silently discarded.
sns.set(rc={'figure.figsize': (16, 9.)})
sns.set_context("poster")
sns.set_style("whitegrid")

import pandas as pd
# Show up to 120 rows / columns before pandas truncates a displayed frame.
pd.options.display.max_rows = 120
pd.options.display.max_columns = 120

# Send INFO-and-above log records to stdout so they show up in the cell output.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [2]:
from bhm_at_scale.preprocess import gen_splits, gen_partitions, encode, make_cube
from bhm_at_scale.utils import reorder_cols

To download files from Kaggle, first register an account, then create the file `.kaggle/kaggle.json` in your home directory with the content `{"username":"YOURUSERNAME","key":"YOURKEY"}`.

In [3]:
import kaggle

# Only contact the Kaggle API when one of the two CSVs this notebook reads is
# missing, so a "Restart & Run All" does not re-download and re-unzip the
# dataset every time. Delete the files to force a fresh download.
raw_dir = Path('../data/raw')
if not all((raw_dir / name).exists() for name in ('train.csv', 'store.csv')):
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files('pratyushakar/rossmann-store-sales', path='../data/raw', unzip=True)

## Data cleansing and basic feature engineering

In [4]:
# Per-store metadata; joined onto the daily sales records below.
stores = pd.read_csv('../data/raw/store.csv')

# Daily sales records, cleaned and enriched in a single chain.
# - StateHoliday is read as text: the mapping previously had to handle both '0'
#   and 0, i.e. the column was parsed with mixed types. Pinning the dtype makes
#   the parse deterministic and avoids pandas' mixed-dtype warning.
# - `infer_datetime_format` is no longer passed: it is deprecated in recent
#   pandas and `parse_dates` alone parses the Date column.
# - Lambda parameters are named `frame` so they do not shadow the outer `df`.
df = (
    pd.read_csv('../data/raw/train.csv', parse_dates=['Date'], dtype={'StateHoliday': str})
    # Holiday codes -> small integers ('0' means no state holiday code).
    .assign(StateHoliday=lambda frame: frame.StateHoliday.map({'a': 1, 'b': 2, 'c': 3, '0': 0}))
    .merge(stores, how='left', on='Store')
    # Sort so the per-store counter below follows date order.
    .sort_values(['Store', 'Date'])
    .assign(Assortment=lambda frame: frame.Assortment.map({'a': 1, 'b': 2, 'c': 3}))
    .assign(StoreType=lambda frame: frame.StoreType.map({'a': 1, 'b': 2, 'c': 3, 'd': 4}))
    .assign(StoreId=lambda frame: frame.groupby(['Store']).ngroup())  # make IDs contiguous
    # Running row counter within each store (0, 1, 2, ... in date order).
    .assign(Timestep=lambda frame: frame.groupby(['Store'])['Date'].cumcount())
    # Combine store type and assortment into one code: tens digit = type, ones digit = assortment.
    .assign(StoreVariant=lambda frame: frame['StoreType'] * 10 + frame['Assortment'])
    # Drop store metadata that is not used downstream.
    .drop([c for c in stores.columns if c.startswith('Competition')], axis=1)
    .drop([c for c in stores.columns if c.startswith('Promo2S')], axis=1)
    # StoreType and Assortment are now represented by StoreVariant.
    .drop(['PromoInterval', 'StoreType', 'Assortment'], axis=1)
)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [5]:
# Optional: uncomment the block below to split each store into smaller 'virtual' stores, purely to demonstrate the scalability of this approach
# df = (df.groupby('Store', as_index=False).apply(lambda df: df.assign(part=gen_partitions(len(df), 7, 22)))
#         .reset_index(drop=True)
#         .query('part >= 0')
#         .assign(Store=lambda x: x.Store*1000+x.part)
#         .drop(['part'], axis=1)
#         .sort_values(['StoreId', 'Date'])
#         .reset_index(drop=True))

## Dummy encoding of categorical values

In [6]:
# Dummy-encode the categorical columns (Customers is removed first), then drop
# the identifier columns that are not passed on.
edf = (
    encode(df.drop(columns='Customers'), ['DayOfWeek', 'StoreVariant', 'StateHoliday'], drop_first=False)
    .drop(columns=['Date', 'Store'])
    # Sales on days a store was closed are marked as missing rather than kept as a value.
    # Series.mask upcasts the column explicitly; the previous in-place
    # `edf.loc[..., 'Sales'] = np.nan` relied on silently upcasting an integer
    # column, which recent pandas versions deprecate.
    .assign(Sales=lambda frame: frame['Sales'].mask(frame['Open'] == 0))
    # NOTE(review): the earlier comment here said "Keep the open column", but the
    # code has always dropped it — confirm that dropping is what is intended.
    .drop(columns='Open')
)

In [7]:
# Move the StoreId/Timestep columns to the front and Sales to the end.
# NOTE(review): presumably later steps rely on this column order — confirm against reorder_cols/make_cube.
edf = reorder_cols(edf, first=['StoreId', 'Timestep'], last='Sales')

In [8]:
# Create the output folder on first run; to_csv fails if the directory is missing.
Path('../data/preprocessed').mkdir(parents=True, exist_ok=True)
# Persist the encoded frame without the pandas index.
edf.to_csv('../data/preprocessed/edf.csv', index=False)

## Generate a multi-dimensional array for input

In [9]:
# Turn the long-format frame into a multi-dimensional array keyed by StoreId and Timestep.
# NOTE(review): presumably the result has shape (stores, timesteps, features) — confirm against make_cube.
X = make_cube(edf, ['StoreId', 'Timestep'])

In [10]:
# Split along the leading axis: the first 1000 entries are used for training,
# everything after that is held out for testing.
# NOTE(review): the leading axis is presumably StoreId — confirm against make_cube.
X_train = X[:1000, ...]
X_test = X[1000:, ...]

In [11]:
# Write each split to its own .npz archive. The array is passed positionally,
# so numpy stores it under the default key 'arr_0'.
for npz_path, cube in [
    ('../data/preprocessed/X_train.npz', X_train),
    ('../data/preprocessed/X_test.npz', X_test),
]:
    np.savez(npz_path, cube)

In [12]:
# Create the result folder on first run; to_csv fails if the directory is missing.
Path('../data/result').mkdir(parents=True, exist_ok=True)
# Persist the cleaned (not yet dummy-encoded) frame without the pandas index.
df.to_csv('../data/result/df.csv', index=False)