### Info on data from Kaggle and UNIC
+ Compeititon's goal was to predict item sales at stores in various locations for two 28-day time periods.
+ Data is focused on series that display intermittency, i.e., sporadic demand including zeros.
+ Based on hierarchical sales data, generously made available by Walmart, starting at the item level and aggregating to that of departments, product categories and stores in three geographical areas of the US: California, Texas, and Wisconsin.
+ Besides the time series data, it also included explanatory variables such as price, promotions, day of the week, and special events (e.g. Super Bowl, Valentine’s Day, and Orthodox Easter) that affect sales which are used to improve forecasting accuracy.
+ calendar.csv - Contains information about the dates on which the products are sold.
+ sales_train_validation.csv - Contains the historical daily unit sales data per product and store [d_1 - d_1913]
+ sell_prices.csv - Contains information about the price of the products sold per store and date.
+ sales_train_evaluation.csv - Includes sales [d_1 - d_1941]

In [1]:
import pandas as pd
import numpy as np
import os

In [4]:
# where the files are strored
DATA_RAW = '../data/raw/m5-forecasting-accuracy/'
DATA_PROCESSED = '../data/processed/'

sales_file = os.path.join(DATA_RAW, "sales_train_validation.csv")
calendar_file = os.path.join(DATA_RAW, "calendar.csv")
prices_file = os.path.join(DATA_RAW, "sell_prices.csv")

In [5]:
# load smaller support data
calendar = pd.read_csv(calendar_file)
prices = pd.read_csv(prices_file)

In [6]:
# need to process data in chunks. Getting memory issues
chunk_size = 500 # num of rows to process
output_chunks = []

reader = pd.read_csv(sales_file, chunksize=chunk_size)

In [11]:
for i, chunk in enumerate(reader):
    print(f"processing chunk {i + 1}")

    # massage the sales dataset into long format instead of wide
    chunk_long = pd.melt(
        chunk,
        id_vars=['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id'],
        var_name='d',
        value_name='sales'
    )

    # start merging out datasets together
    chunk_merged = chunk_long.merge(calendar, on="d", how="left")
    chunk_merged = chunk_merged.merge(prices, on=["store_id", "item_id", "wm_yr_wk"], how="left")

    # look at smaller subset of the data (groceries at the california store 1)
    chunk_merged = chunk_merged.query("store_id =='CA_1' and dept_id == 'FOODS_1'")

    ''' fix missing price data. Forward filling assumes the price stays the same as the last known
     price since we're looking week to week assuming grocery prices stay pretty stable week to week.
    '''
    chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")
    chunk_merged.dropna(inplace=True)

    output_chunks.append(chunk_merged)
    

processing chunk 1


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 2


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 3


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 4


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 5


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 6


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 7


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 8


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 9


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 10


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 11


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 12


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 13


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 14


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 15


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 16


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 17


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 18


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 19


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 20


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 21


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 22


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 23


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 24


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 25


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 26


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 27


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 28


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 29


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 30


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 31


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 32


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 33


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 34


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 35


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 36


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 37


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 38


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 39


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 40


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 41


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 42


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 43


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 44


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 45


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 46


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 47


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 48


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 49


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 50


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 51


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 52


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 53


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


processing chunk 54


  chunk_merged["sell_price"] = chunk_merged["sell_price"].fillna(method="ffill")


In [12]:
# combine all chunks
full_data = pd.concat(output_chunks, ignore_index=True)

In [13]:
# save processed data
os.makedirs(DATA_PROCESSED, exist_ok=True)
output_path = os.path.join(DATA_PROCESSED, "sales_CA_1_FOODS1.csv")
full_data.to_csv(output_path, index=False)

print("saved data to: ", output_path)

saved data to:  ../data/processed/sales_CA_1_FOODS1.csv
