In [22]:
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import plotly.offline as py
py.init_notebook_mode()
import plotly.graph_objs as go
from plotly import tools

## Import and format data

In [23]:
df = pd.read_csv('sales.csv')
df.OrderDate = pd.to_datetime(df.OrderDate)
df.columns = ['date', 'store', 'item', 'sales']
df.date = pd.to_datetime(df.date)
df.item = df.item.astype('category')
df.store = df.store.astype('category')
df.dtypes

date     datetime64[ns]
store          category
item           category
sales             int64
dtype: object

Below are some summary statistics on the data. Overall the quantities for individual items at individual stores is quite small. It would be difficult to forecast daily quantities of individual items and individual stores, so we will work towards forecasting weekly item sales at individual stores.

In [24]:
df.describe(include='all')

Unnamed: 0,date,store,item,sales
count,350796,350796,350796.0,350796.0
unique,301,181,39.0,
top,2016-11-05 00:00:00,SEGWD7,41795.0,
freq,4110,5353,22282.0,
first,2016-02-04 00:00:00,,,
last,2016-11-30 00:00:00,,,
mean,,,,1.681211
std,,,,1.134632
min,,,,1.0
25%,,,,1.0


## Analyze Data

Let's take a look at a few cuts on the data to see if we can spot any trends. Below are plots of a handful of individual stores' sales. It looks like the answer to the question about the big jump in sales in September is the addition of a good number of stores.

### Total Sales

Here is the total sales graph we took a look at yesterday.

In [25]:
df_total = df.groupby(pd.Grouper(freq='W', key='date')).sum().fillna(0).unstack('date', 0)
df_total.index.levels[1]

len(df_total) == len(df_total.index.levels[1])

trace = go.Scatter(
    x = df_total.index.levels[1],
    y = df_total
)

layout = go.Layout(
    title='Total Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='total-sales')

We may want to look at this data to figure out if we need to limit that date range to be from March to November in order to avoid any partial weeks. But that shouldn't be too much of an issue because I'm going to set the training data up to make predictions of sales of items at individual stores, so as long as the drop offs on either end are the result of fewer stores being included in the data, that shouldn't have a negative impact on the model.

### Individual Store Sales

In [26]:
df_1w = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)

rows = 10
cols = 3
spidx = np.arange(rows*cols).reshape(rows,cols)

fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])

for i in range(rows):

    for j in range(cols):

        trace = go.Scatter(
            x = df_1w.iloc[1].index.levels[1],
            y = df_1w.iloc[spidx[i,j]],
        )

        fig.append_trace(trace, i+1, j+1)

fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y1 ]     [ (1,3) x3,y1 ]   
[ (2,1) x4,y2 ]     [ (2,2) x5,y2 ]     [ (2,3) x6,y2 ]   
[ (3,1) x7,y3 ]     [ (3,2) x8,y3 ]     [ (3,3) x9,y3 ]   
[ (4,1) x10,y4 ]    [ (4,2) x11,y4 ]    [ (4,3) x12,y4 ]  
[ (5,1) x13,y5 ]    [ (5,2) x14,y5 ]    [ (5,3) x15,y5 ]  
[ (6,1) x16,y6 ]    [ (6,2) x17,y6 ]    [ (6,3) x18,y6 ]  
[ (7,1) x19,y7 ]    [ (7,2) x20,y7 ]    [ (7,3) x21,y7 ]  
[ (8,1) x22,y8 ]    [ (8,2) x23,y8 ]    [ (8,3) x24,y8 ]  
[ (9,1) x25,y9 ]    [ (9,2) x26,y9 ]    [ (9,3) x27,y9 ]  
[ (10,1) x28,y10 ]  [ (10,2) x29,y10 ]  [ (10,3) x30,y10 ]



### New Stores

So let's do a plot of the count of the number of stores with sales in each week. This confirms that there were a lot of new stores added in September.

In [27]:
store_sales = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
stores_with_sales = store_sales['sales'].where(store_sales.sales > 0).count()

stores_with_sales.index

trace = go.Bar(
    x = stores_with_sales.index,
    y = stores_with_sales
)

layout = go.Layout(
    title='No. of Stores with Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='stores-with-sales')

### Items

Let's take a look sales for the individual items.

In [28]:
df_1w = df.groupby(['item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 13
cols = 3

fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])

spidx = np.arange(rows*cols).reshape(rows,cols)

for i in range(rows):

    for j in range(cols):

        trace = go.Scatter(
            x = df_1w.iloc[1].index.levels[1],
            y = df_1w.iloc[spidx[i,j]],
        )

        fig.append_trace(trace, i+1, j+1)

fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y1 ]     [ (1,3) x3,y1 ]   
[ (2,1) x4,y2 ]     [ (2,2) x5,y2 ]     [ (2,3) x6,y2 ]   
[ (3,1) x7,y3 ]     [ (3,2) x8,y3 ]     [ (3,3) x9,y3 ]   
[ (4,1) x10,y4 ]    [ (4,2) x11,y4 ]    [ (4,3) x12,y4 ]  
[ (5,1) x13,y5 ]    [ (5,2) x14,y5 ]    [ (5,3) x15,y5 ]  
[ (6,1) x16,y6 ]    [ (6,2) x17,y6 ]    [ (6,3) x18,y6 ]  
[ (7,1) x19,y7 ]    [ (7,2) x20,y7 ]    [ (7,3) x21,y7 ]  
[ (8,1) x22,y8 ]    [ (8,2) x23,y8 ]    [ (8,3) x24,y8 ]  
[ (9,1) x25,y9 ]    [ (9,2) x26,y9 ]    [ (9,3) x27,y9 ]  
[ (10,1) x28,y10 ]  [ (10,2) x29,y10 ]  [ (10,3) x30,y10 ]
[ (11,1) x31,y11 ]  [ (11,2) x32,y11 ]  [ (11,3) x33,y11 ]
[ (12,1) x34,y12 ]  [ (12,2) x35,y12 ]  [ (12,3) x36,y12 ]
[ (13,1) x37,y13 ]  [ (13,2) x38,y13 ]  [ (13,3) x39,y13 ]



It also looks like there were some big increases in sales of individual items. It would be interesting to do some more analysis to figure out if those items were in in the new stores that came on.

## Prepare Data

### Add Trailing Average Sales

Now for the actual model. The first thing we'll do is add in a rolling average of the prior three weeks' sales.

In [96]:
df_model = df.groupby(['store', 'item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0)
rolling_sum = df_model.apply(lambda x:x.rolling(window=3).mean())
rolling_sum.shift(-1)
df_model['sales_avg'] = rolling_sum.shift(-1)['sales']
df_model.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sales,sales_avg
store,item,date,Unnamed: 3_level_1,Unnamed: 4_level_1
SEGWD103,41774,2016-02-07,1.0,
SEGWD103,41774,2016-02-14,3.0,2.333333
SEGWD103,41774,2016-02-21,3.0,2.666667
SEGWD103,41774,2016-02-28,2.0,2.666667
SEGWD103,41774,2016-03-06,3.0,3.333333
SEGWD103,41774,2016-03-13,5.0,4.666667
SEGWD103,41774,2016-03-20,6.0,5.666667
SEGWD103,41774,2016-03-27,6.0,5.333333
SEGWD103,41774,2016-04-03,4.0,5.333333
SEGWD103,41774,2016-04-10,6.0,5.0


In [100]:
df_model.loc['SEGWD7'].to_csv('SEGWD7.csv')

### Remove Missing Stores and Items

In [30]:
df_model['cum_sales'] = df_model.groupby(level=[0,1]).cumsum()['sales']
df_model.reset_index(inplace=True)
df_model.describe(include='all')

Unnamed: 0,store,item,date,sales,sales_avg,cum_sales
count,310596,310596.0,310596,310596.0,310594.0,310596.0
unique,181,39.0,44,,,
top,SEGWD97,42052.0,2016-07-10 00:00:00,,,
freq,1716,7964.0,7059,,,
first,,,2016-02-07 00:00:00,,,
last,,,2016-12-04 00:00:00,,,
mean,,,,1.898807,1.898811,36.002048
std,,,,3.853454,3.667099,88.267833
min,,,,0.0,0.0,0.0
25%,,,,0.0,0.0,0.0


In [31]:
df_model_masked = df_model[df_model.cum_sales != 0]
df_model_masked.describe(include='all')

Unnamed: 0,store,item,date,sales,sales_avg,cum_sales
count,144822,144822.0,144822,144822.0,144820.0,144822.0
unique,181,39.0,44,,,
top,SEGWD7,41793.0,2016-11-27 00:00:00,,,
freq,1336,4759.0,6424,,,
first,,,2016-02-07 00:00:00,,,
last,,,2016-12-04 00:00:00,,,
mean,,,,4.072323,4.030594,77.212661
std,,,,4.79534,4.501835,116.308495
min,,,,0.0,0.0,1.0
25%,,,,0.0,0.666667,9.0


### Encode Store and Item Categories

It looks like that eliminated quite a few observations, over half of them. Now we can encode the store and item variables as binary classifications and then we can use the data to train a model.

In [79]:
stores = pd.get_dummies(df_model_masked['store'])
items = pd.get_dummies(df_model_masked['item'])
df_final = pd.concat([df_model_masked, stores, items], axis=1).dropna(how='any')
df_final.drop(['date', 'cum_sales', 'store', 'item', 41793, 'SEGWD103'], axis=1, inplace=True)
# df_final.to_csv('modeldata.csv')

In [80]:
df_final.head()

Unnamed: 0,sales,sales_avg,SEGWD104,SEGWD116,SEGWD12,SEGWD123,SEGWD125,SEGWD129,SEGWD135,SEGWD138,...,42043,42044,42045,42046,42047,42048,42049,42050,42051,42052
1,3.0,2.333333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3.0,2.666667,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,2.666667,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,3.333333,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5.0,4.666667,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
df_final.describe()

Unnamed: 0,sales,sales_avg,SEGWD104,SEGWD116,SEGWD12,SEGWD123,SEGWD125,SEGWD129,SEGWD135,SEGWD138,...,42043,42044,42045,42046,42047,42048,42049,42050,42051,42052
count,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,...,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0,144820.0
mean,4.072373,4.030594,0.003349,0.003356,0.009094,0.009142,0.00337,0.009198,0.009177,0.009142,...,0.016275,0.016013,0.01622,0.015385,0.016641,0.016545,0.016275,0.016655,0.016614,0.016344
std,4.795355,4.501835,0.057774,0.057833,0.094928,0.095178,0.057951,0.095463,0.095356,0.095178,...,0.126533,0.125526,0.126322,0.123077,0.127924,0.127558,0.126533,0.127976,0.127819,0.126797
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,3.0,2.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6.0,5.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,118.0,86.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [83]:
data = df_final.dropna().as_matrix()
data.shape

(144820, 220)

### Split Train and Test Data

We'll save 30% of the data to test our model.

In [84]:
np.random.shuffle(data)
split = int(0.70*data.shape[0])
X_train = np.ones(data[:split,:].shape)
X_test = np.ones(data[split:,:].shape)

X_train[:,1:] = data[:split,1:]
y_train = data[:split,0]

X_test[:,1:] = data[split:,1:]
y_test = data[split:,0]

## Train Model

This is where we actually train the model. I ran it for 200 iterations - more won't likely increase the predictive power of the model, but there are some other diagnostics we can run to see what other improvements we can make.

In [85]:
from sklearn import linear_model
clf = linear_model.SGDRegressor(n_iter=200)
clf.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=200, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

## Evaluate Model

In [86]:
predict = clf.predict(X_test)
predict_neg = predict < 0
error = y_test - predict
error_neg = predict_neg @ error
np.savetxt('modelparams.csv', clf.predict(np.eye(X_test.shape[1])), delimiter=",")
print('R-squared: {:.{p}f}'.format(clf.score(X_test, y_test), p=4))
print('Total error in sales quantity: {:.{p}f}'.format(sum(error), p=0))
print('Total error as a % of actual: {:.{p}f}%'.format(sum(error) / sum(y_test)*100, p=2))
print('Total error in sales quantity with zero min prediction: {:.{p}f}'.format(error_neg, p=0))
print('Total error as a % of actual with zero min prediction: {:.{p}f}%'.format((sum(error)+error_neg) / sum(y_test)*100, p=2))

R-squared: 0.8886
Total error in sales quantity: -7424
Total error as a % of actual: -4.20%
Total error in sales quantity with zero min prediction: 14
Total error as a % of actual with zero min prediction: -4.19%


An overall r-square of 85% is pretty good, I think, for really just dealing with three variables - store and item numbers and historical sales - and the overall error is very low on such a large number of test observations.

In [87]:
predicted = go.Bar(
    name = 'predicted',
    y = clf.predict(X_test)
)

actual = go.Bar(
    name = 'actual',
    y = y_test
)

layout = go.Layout(
    title='Actual vs. Predicted'
)

fig = go.Figure(data=[actual, predicted], layout=layout)
py.iplot(fig, filename='actual-vs-predicted')

Overall this says that this is a pretty good model. The total error over 40,000 individual store-item observations is really low. The graph above shows that we are missing on the big spikes, but overall the performance is good.

## Next Steps

1. Further analyze data
2. Run model diagnostics
3. Analyze model errors by hand
4. Evaluate alternative model hyperparameters
5. Complete Azure implementation

In [88]:
df_final.loc[:100].to_csv('samplemodeldata.csv')

In [None]:
np.eye(X_test.shape[1]) * clf.predict(X_test)