In [1]:
# Setup notebook
from warnings import simplefilter
from pathlib import Path
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # plotting
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import TimeSeriesSplit
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess 
from statsmodels.graphics.tsaplots import plot_pacf


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/store-sales-time-series-forecasting/oil.csv
/kaggle/input/store-sales-time-series-forecasting/sample_submission.csv
/kaggle/input/store-sales-time-series-forecasting/holidays_events.csv
/kaggle/input/store-sales-time-series-forecasting/stores.csv
/kaggle/input/store-sales-time-series-forecasting/train.csv
/kaggle/input/store-sales-time-series-forecasting/test.csv
/kaggle/input/store-sales-time-series-forecasting/transactions.csv


In [2]:
# load training set
comp_dir = Path('../input/store-sales-time-series-forecasting')
store_sales = pd.read_csv(
    comp_dir / 'train.csv',
    usecols=['date', 'store_nbr', 'family', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32',
    },
    # `infer_datetime_format` is deprecated in pandas >= 2.0, so it is
    # no longer passed; `parse_dates` alone is enough.
    parse_dates=['date'],
)

# convert dates to pandas period dtype
store_sales['date'] = store_sales.date.dt.to_period('D')

# define a new DataFrame to manipulate, following the code posted in the Kaggle Time Series exercise
# NOTE -> we aren't using information on individual stores (store_nbr) yet, but will likely want to include soon
family_sales = (
    store_sales
    # `observed=False` keeps the historical behaviour for the categorical
    # grouper and avoids the pandas >= 2.1 FutureWarning about the default.
    .groupby(['family', 'date'], observed=False)
    # Select the numeric columns explicitly: averaging the categorical
    # `store_nbr` column raises TypeError on pandas >= 2.0 (older versions
    # silently dropped it, so the result is unchanged there).
    [['sales', 'onpromotion']]
    .mean()                                   # average over all rows sharing a family and date
    .unstack('family')                        # one column per (value, family)
    .loc['2017', ['sales', 'onpromotion']]    # keep only the 2017 rows
)

# sneak peek at training set
#family_sales.head()

In [3]:
# Collect the distinct product families (sales categories) as plain strings,
# in the order they first appear in the training data.
families = store_sales['family'].unique().astype(str)

#print("Shape of training set is:", family_sales.shape, '\n')
#print("There are ", families.size, "categories: ", families)

In [4]:
# don't need onpromotion data yet; keep one sales column per family
# (no `.squeeze()`: the frame must stay 2-D because `sales.columns` is used below)
sales = family_sales.loc[:, 'sales']

# set seasonality for model
fourier = CalendarFourier(freq='M', order=4)

# create time features
dp = DeterministicProcess(
    constant=True,               # includes a constant
    index=sales.index,           # should be the date
    order=1,                     # look for a linear trend
    seasonal=True,               # include seasonality
    drop=True,                   # drop linearly dependent terms
    additional_terms=[fourier],  # add fourier pairs for modeling seasons
)
X_time = dp.in_sample()
X_time['NewYearsDay'] = (X_time.index.dayofyear == 1)

# train model to detrend and deseason the data (one regression target per family column)
model_season = LinearRegression().fit(X_time, sales)

# store deseasoned data: residuals keep the index and columns of `sales`
y_pred = model_season.predict(X_time)
y_deseason = sales - y_pred

# store model weights
# The rows of `coef_` / entries of `intercept_` follow the column order of the
# target frame the model was fitted on, so label them from `sales.columns`
# rather than from a separately computed list whose order may differ.
family_labels = sales.columns.astype(str)
model_params = pd.DataFrame(
    model_season.coef_,
    index=family_labels,
    columns=X_time.columns,  # plain Index; wrapping it in a list would create a one-level MultiIndex
).T  # features as rows, families as columns
intercepts = pd.Series(model_season.intercept_, index=family_labels, name='intercept')

In [5]:
# load in the test set, convert dates to daily periods and index the rows
# by (store_nbr, family); built in a single chain so re-running the cell is safe
test = (
    pd.read_csv(
        comp_dir / 'test.csv',
        dtype={
            'store_nbr': 'category',
            'family': 'category',
            'onpromotion': 'uint32',
        },
        # `infer_datetime_format` is deprecated in pandas >= 2.0, so it is
        # no longer passed; `parse_dates` alone is enough.
        parse_dates=['date'],
    )
    .assign(date=lambda frame: frame['date'].dt.to_period('D'))
    .set_index(['store_nbr', 'family'])
    .sort_index()
)

In [6]:
# make predictions for test set

# generate time features for the forecast window; its length comes from the
# number of distinct test dates instead of a hard-coded 16
n_forecast_days = test['date'].nunique()
X_test = dp.out_of_sample(steps=n_forecast_days)
X_test['NewYearsDay'] = (X_test.index.dayofyear == 1)

# Predict all families in one call with the fitted model instead of rebuilding
# an unfitted LinearRegression per (store, family) pair. Output columns follow
# the columns of `sales`, the target frame the model was fitted on.
forecast = pd.DataFrame(
    model_season.predict(X_test),
    index=X_test.index,
    columns=sales.columns.astype(str),
).clip(lower=0)  # metric is RMSLE, replace negative values to avoid log errors

# Match every test row to its forecast by (date, family) rather than by row
# position, so the result does not depend on how the test rows are ordered.
row_pos = forecast.index.get_indexer(pd.PeriodIndex(test['date']))
col_pos = forecast.columns.get_indexer(
    test.index.get_level_values('family').astype(str)
)
if (row_pos < 0).any() or (col_pos < 0).any():
    # get_indexer returns -1 for labels it cannot find
    raise ValueError(
        'test set contains dates or families that the fitted model cannot forecast'
    )

# The same per-family forecast is used for every store. Build the submission
# keyed by the original ids (kept numeric so the sort is numeric).
test_sub = pd.DataFrame(
    {'sales': forecast.to_numpy()[row_pos, col_pos]},
    index=pd.Index(test['id'].to_numpy(), name='id'),
).sort_index(ascending=True)

test_sub.to_csv('submission.csv')