In [347]:
from pandas import DataFrame, read_csv
import pandas as pd
import numpy as np
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go


## Import and format data

In [285]:
df = pd.read_csv('SEG_Saleshistory_Stores.csv')
df.columns = ['date', 'store', 'item', 'sales']
df.date = pd.to_datetime(df.date)
df.item = df.item.astype('category')
df.store = df.store.astype('category')
df.dtypes

date     datetime64[ns]
store          category
item           category
sales             int64
dtype: object

Below are some summary statistics on the data. Overall the quantities for individual items at individual stores is quite small. It would be difficult to forecast daily quantities of individual items and individual stores, so we will work towards forecasting weekly item sales at individual stores.

In [237]:
df.describe(include='all')

Unnamed: 0,date,store,item,sales
count,425575,425575,425575.0,425575.0
unique,342,181,39.0,
top,2016-11-05 00:00:00,SEGWD7,41795.0,
freq,4110,6111,27516.0,
first,2016-02-04 00:00:00,,,
last,2017-01-11 00:00:00,,,
mean,,,,1.670345
std,,,,1.112165
min,,,,1.0
25%,,,,1.0


Let's take a look at a few cuts on the data to see if we can spot any trends. Below are plots of a handful of individual stores' sales. It looks like the answer to the question about the big jump in sales in September is the addition of a good number of stores.

### Total Sales

In [286]:
df_total = df.groupby(pd.Grouper(freq='W', key='date')).sum().fillna(0).unstack('date', 0)
df_total.index.levels[1]

len(df_total) == len(df_total.index.levels[1])

trace = go.Scatter(
    x = df_total.index.levels[1],
    y = df_total
)

layout = go.Layout(
    title='Total Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='total-sales')

### Individual Store Sales

In [391]:
df_1w = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)

rows = 10
cols = 3
spidx = np.arange(rows*cols).reshape(rows,cols)

fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])

for i in range(rows):

    for j in range(cols):

        trace = go.Scatter(
            x = df_1w.iloc[1].index.levels[1],
            y = df_1w.iloc[spidx[i,j]],
        )

        fig.append_trace(trace, i+1, j+1)

fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y1 ]     [ (1,3) x3,y1 ]   
[ (2,1) x4,y2 ]     [ (2,2) x5,y2 ]     [ (2,3) x6,y2 ]   
[ (3,1) x7,y3 ]     [ (3,2) x8,y3 ]     [ (3,3) x9,y3 ]   
[ (4,1) x10,y4 ]    [ (4,2) x11,y4 ]    [ (4,3) x12,y4 ]  
[ (5,1) x13,y5 ]    [ (5,2) x14,y5 ]    [ (5,3) x15,y5 ]  
[ (6,1) x16,y6 ]    [ (6,2) x17,y6 ]    [ (6,3) x18,y6 ]  
[ (7,1) x19,y7 ]    [ (7,2) x20,y7 ]    [ (7,3) x21,y7 ]  
[ (8,1) x22,y8 ]    [ (8,2) x23,y8 ]    [ (8,3) x24,y8 ]  
[ (9,1) x25,y9 ]    [ (9,2) x26,y9 ]    [ (9,3) x27,y9 ]  
[ (10,1) x28,y10 ]  [ (10,2) x29,y10 ]  [ (10,3) x30,y10 ]



### New Stores

In [289]:
store_sales = df.groupby(['store']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
stores_with_sales = store_sales['sales'].where(store_sales.sales > 0).count()

stores_with_sales.index

trace = go.Bar(
    x = stores_with_sales.index,
    y = stores_with_sales
)

layout = go.Layout(
    title='No. of Stores with Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='stores-with-sales')

### Items

In [392]:
df_1w = df.groupby(['item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date', 0)
rows = 13
cols = 3

fig = tools.make_subplots(rows=rows, cols=cols, shared_yaxes=True, subplot_titles=df_1w.index[:rows*cols])

spidx = np.arange(rows*cols).reshape(rows,cols)

for i in range(rows):

    for j in range(cols):

        trace = go.Scatter(
            x = df_1w.iloc[1].index.levels[1],
            y = df_1w.iloc[spidx[i,j]],
        )

        fig.append_trace(trace, i+1, j+1)

fig['layout'].update(height=250*rows, title='Sales by Store', showlegend=False);
py.iplot(fig, filename='sales-by-store')

This is the format of your plot grid:
[ (1,1) x1,y1 ]     [ (1,2) x2,y1 ]     [ (1,3) x3,y1 ]   
[ (2,1) x4,y2 ]     [ (2,2) x5,y2 ]     [ (2,3) x6,y2 ]   
[ (3,1) x7,y3 ]     [ (3,2) x8,y3 ]     [ (3,3) x9,y3 ]   
[ (4,1) x10,y4 ]    [ (4,2) x11,y4 ]    [ (4,3) x12,y4 ]  
[ (5,1) x13,y5 ]    [ (5,2) x14,y5 ]    [ (5,3) x15,y5 ]  
[ (6,1) x16,y6 ]    [ (6,2) x17,y6 ]    [ (6,3) x18,y6 ]  
[ (7,1) x19,y7 ]    [ (7,2) x20,y7 ]    [ (7,3) x21,y7 ]  
[ (8,1) x22,y8 ]    [ (8,2) x23,y8 ]    [ (8,3) x24,y8 ]  
[ (9,1) x25,y9 ]    [ (9,2) x26,y9 ]    [ (9,3) x27,y9 ]  
[ (10,1) x28,y10 ]  [ (10,2) x29,y10 ]  [ (10,3) x30,y10 ]
[ (11,1) x31,y11 ]  [ (11,2) x32,y11 ]  [ (11,3) x33,y11 ]
[ (12,1) x34,y12 ]  [ (12,2) x35,y12 ]  [ (12,3) x36,y12 ]
[ (13,1) x37,y13 ]  [ (13,2) x38,y13 ]  [ (13,3) x39,y13 ]



### New Items

In [264]:
item_sales = df.groupby(['item']+[pd.Grouper(freq='W', key='date')]).sum().fillna(0).unstack('date')
items_with_sales = item_sales['sales'].where(item_sales.sales > 0).count()

items_with_sales.index

trace = go.Bar(
    x = items_with_sales.index,
    y = items_with_sales
)

layout = go.Layout(
    title='No. of Items with Sales'
)


fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename='items-with-sales')

## Prepare Data

#### Trim date range to have consistent history

But before we do that, let's trim the range of the data to go from 2016-03-01 to 2016-12-15 in order to avoid anomolies in the data evident in the total sales graph above. We're going to trim to an approximate range here to reduce the amount of data we have to process. Once we aggregate into the order periods we will trim so that we end up with whole periods on either end of our data.

In [290]:
df = df[(df['date'] > '2016-02-15') & (df['date'] < '2016-12-30')]

#### Add records with zero sales so there is a record for every period for every store-item combination

We need to make sure that we add records with zero sales for any combination of store, item and period that doesn't appear in the dataset so that our trailing averages are calculated correctly.

In [291]:
import itertools

beg_date = df['date'].min()
end_date = df['date'].max()
dates = pd.DatetimeIndex(start=beg_date, end=end_date, freq='D')

items = df['item'].value_counts().index
stores = df['store'].value_counts().index
all_periods = pd.DataFrame(list(itertools.product(dates, stores, items)), columns=['date', 'store', 'item'])

all_periods['sales'] = 0
all_periods.date = pd.to_datetime(df.date)
all_periods.item = df.item.astype('category')
all_periods.store = df.store.astype('category')

incl_periods = df.groupby(['store', 'item', 'date']).sum().fillna(0).reset_index()

df_all = pd.concat([all_periods, df]).groupby(['store', 'item', 'date']).sum().fillna(0).reset_index()
len(df_all)

2237703

#### Add Columns for Periods

This time we also want to predict sales for order periods of two and three times per week as well as weekly, assuming orders are placed on the same day each week.

For the **two** orders per week periods, we will predict sales from:
* Tuesday through Thursday (days 2 through 4)
* Friday through Monday (days 5, 6, 0 and 1)

For the **three** orders per week periods, we will predict sales from:
* Monday through Wednesday (days 1 through 3)
* Thursday and Friday (days 4 and 5)
* Saturday and Sunday (days 6 and 1)

In order to aggregate sales over the correct periods we need to add columns to represent the series for each order period. The end of the two orders per week periods are created in the column `freq2_end` and the three orders per week periods are in `freq3_end`.

We are also adding in columns to distinguish between the intra-weekly periods (which have a zero index) in order to allow the model to compensate for differences in sales volumes between intra-weekly periods, which are `freq2_per` and `freq3_per`.

In [399]:
df_f = df_all.copy()

# Assign each record to its respective intra-week group.
#
# These are the intra-week periods that each day of the week belongs to.
freq = list([[0, 0, 1, 1, 1, 0, 0],
            [0, 1, 1, 1, 2, 2, 0]
            ])

# Map the day of the week of each record to its respective intra-week period.
for i, f in enumerate(freq):
    df_f['freq' + str(i + 2) + '_per'] = df_f['date'].dt.weekday.map(pd.Series(f))

# Assign each record to its respective group within each series of intra-week groups.
# Group membership is indicated in a separate column by the end date of the group.
#
#     1. Calculate the numeric day of the week for each date in the range of dates
#        in the data.
#     2. Create a boolean array with an entry for each record indicating whether
#        the date of the record falls on a day of the week on which an intra-week
#        period ends.
#     3. Calculate the cumulative sum of the boolean array for the range of dates, which
#        will then represent the sequential period each date in our range belongs to.
#     4. Index the cumulative sums by the range of dates to create a lookup table.
#     5. Map the 'date' column in our data to the sequence number using the lookup table.
#     6. Group the lookup table by the period, aggregating the date column by max, which
#        represents the end date of each sequential period, to create another lookup table.
#     7. Map the sequence number series we created earlier to the period ending date using
#        the new lookup table and add it to our data frame.
#

# These are the days of the week that new periods begin on for order frequencies of two
# and three times per week.
period_ends = list([[2, 5],
                   [1, 4, 6]
                   ])

# Execute the same process for each of our order frequencies
for i, p in enumerate(period_ends):
    # Steps 1 through 4
    periods = pd.Series(dates.weekday).isin(period_ends[i]).cumsum()
    date_lookup = pd.DataFrame({'date': dates, 'period': periods})
    date_lookup.set_index('date', inplace=True)
    
    # Step 5
    seq_col = df_f['date'].map(date_lookup.period)

    # Step 6
    period_lookup = date_lookup.reset_index().groupby('period').max()
    period_lookup.to_csv('freq' + str(i+2) + '.csv')

    # Step 7
    df_f['freq' + str(i+2) + '_end'] = seq_col.map(period_lookup.date)

df_f.head(10)

Unnamed: 0,store,item,date,sales,freq2_per,freq3_per,freq2_end,freq3_end
0,SEGWD103,41774,2016-02-16,0.0,0,1,2016-02-16,2016-02-18
1,SEGWD103,41774,2016-02-17,1.0,1,1,2016-02-19,2016-02-18
2,SEGWD103,41774,2016-02-18,1.0,1,1,2016-02-19,2016-02-18
3,SEGWD103,41774,2016-02-19,1.0,1,2,2016-02-19,2016-02-20
4,SEGWD103,41774,2016-02-20,0.0,0,2,2016-02-23,2016-02-20
5,SEGWD103,41774,2016-02-21,0.0,0,0,2016-02-23,2016-02-22
6,SEGWD103,41774,2016-02-22,0.0,0,0,2016-02-23,2016-02-22
7,SEGWD103,41774,2016-02-23,0.0,0,1,2016-02-23,2016-02-25
8,SEGWD103,41774,2016-02-24,0.0,1,1,2016-02-26,2016-02-25
9,SEGWD103,41774,2016-02-25,0.0,1,1,2016-02-26,2016-02-25


That looks like its working right.

#### Calculating trailing averages
Adding in the rolling average sales is more complicated because we want to calculate the average of like intra-week periods. For example, for the two orders per week frequency the first period spans from Friday through Monday. When we do our trailing averages we want to only include trailing Monday through Friday periods and exclude the Tuesday through Thursday periods that would be included in a strictly sequential calculation. 

In [460]:
# This function calculates the trailing average for a given order frequency.
def calc_trailing(orders_per_week):
    # Check to make sure orders per week is in the available range
    if orders_per_week not in [2, 3]:
        print('Orders per week must be either 2 or 3.')
        raise
    
    freq_per = 'freq' + str(orders_per_week) + '_per'
    freq_end = 'freq' + str(orders_per_week) + '_end'
    freq_end_avg = freq_end + '_avg'
    
    f = {'sales': 'sum', freq_per: 'mean'}
    g = ['store', 'item', freq_end]
    
    # Here we filter the data frame for each of the intra-week periods in
    # the specified order frequency and perform the trailing average and
    # calculations on them separately.
    df_final = pd.DataFrame()
    for i, n in enumerate(df_f[freq_per].value_counts().index):
        df_model = df_f[df_f[freq_per] == n].groupby(g).agg(f).fillna(0)
        
        rolling_sum = (df_model
                       .apply(lambda x:x.rolling(window=1).mean())
                       .shift(1)
                      )

        df_model[freq_end_avg] = rolling_sum['sales']
        df_final = df_final.append(df_model.reset_index())
          
    return df_final.groupby(g).sum()

In [461]:
df_final = calc_trailing(3)
df_final.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,freq3_per,sales,freq3_end_avg
store,item,freq3_end,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
SEGWD103,41774,2016-02-18,1,2.0,
SEGWD103,41774,2016-02-20,2,1.0,
SEGWD103,41774,2016-02-22,0,0.0,
SEGWD103,41774,2016-02-25,1,0.0,2.0
SEGWD103,41774,2016-02-27,2,2.0,1.0
SEGWD103,41774,2016-02-29,0,0.0,0.0
SEGWD103,41774,2016-03-03,1,1.0,0.0
SEGWD103,41774,2016-03-05,2,2.0,2.0
SEGWD103,41774,2016-03-07,0,1.0,0.0
SEGWD103,41774,2016-03-10,1,3.0,1.0


### Remove Missing Stores and Items

In [462]:
def remove_missing(df_final):
    beg_len = len(df_final)
    df_final['cum_sales'] = df_final.groupby(level=[0,1]).cumsum()['sales']
    df_final_masked = df_final[df_final.cum_sales != 0].dropna(how='any')
    
    print('{} records removed'.format(beg_len - len(df_final_masked)))
    print('{} records remaining'.format(len(df_final_masked)))
    
    return df_final_masked.reset_index()

In [463]:
df_final_masked = remove_missing(df_final)
store = 'SEGWD104'
item = 41783
df_final_masked.query('(store == @store) & (item == @item)').head()

481729 records removed
478295 records remaining


Unnamed: 0,store,item,freq3_end,freq3_per,sales,freq3_end_avg,cum_sales
4496,SEGWD104,41783,2016-08-29,0,1.0,0.0,1.0
4497,SEGWD104,41783,2016-09-01,1,2.0,0.0,3.0
4498,SEGWD104,41783,2016-09-03,2,1.0,0.0,4.0
4499,SEGWD104,41783,2016-09-05,0,4.0,1.0,8.0
4500,SEGWD104,41783,2016-09-08,1,1.0,2.0,9.0


### Encode Store and Item Categories

In [464]:
# We have some extra logic here to deal with needing to categorize the three order
# per week variable, but not the two, which is already binary.
def encode_cat_vars(df_final_masked):
  
    # Make sure sales is the first column
    cols = df_final_masked.columns.tolist()
    cols.remove('sales')
    cols = ['sales'] + cols    
    df_final_masked = df_final_masked[cols]
    
    # Initial list of columns to drop
    drop_cols = ['cum_sales', 41793, 'SEGWD103']

    # Initial dummy variables
    stores = pd.get_dummies(df_final_masked['store'])
    items = pd.get_dummies(df_final_masked['item'])
    concat_tables = [df_final_masked, stores, items]
    
    # Create dummy variables for freq3_end if necessary
    try:
        freq = cols[cols.index('freq2_end')][:5]
    except:
        freq = 'freq3'
        freq3_d = pd.get_dummies(df_final_masked['freq3_per'], prefix='freq3_per')
        concat_tables.append(freq3_d)
        drop_cols.append(freq + '_per_2')
       
    # Add dummy variables
    df_final_masked = pd.concat(concat_tables, axis=1)
    
    # Drop columns and return
    return df_final_masked.drop(drop_cols, axis=1).sort_values(freq + '_end')

In [465]:
data = encode_cat_vars(df_final_masked)
data.head()

Unnamed: 0,sales,store,item,freq3_end,freq3_per,freq3_end_avg,SEGWD104,SEGWD116,SEGWD12,SEGWD123,...,42045,42046,42047,42048,42049,42050,42051,42052,freq3_per_0,freq3_per_1
28015,3.0,SEGWD138,41793,2016-02-18,1,8.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
408586,2.0,SEGWD671,41791,2016-02-18,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
408450,1.0,SEGWD671,41790,2016-02-18,1,1.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
408314,2.0,SEGWD671,41789,2016-02-18,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
408178,2.0,SEGWD671,41788,2016-02-18,1,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


### Split Data for Training and Testing

This time we'll train the model with data through the end of October and then use the remaining data to 
test the model. This is where we also do the final trimming of the data to ensure it includes on whole intra-week periods.

In [466]:
# Values are begining of training, end of training, end of testing and
# represent the end of the respective intra-week order period.
date_range = dict(
    freq2_end=['2016-03-01', '2016-10-28', '2016-12-23'],
    freq3_end=['2016-03-03', '2016-10-29', '2016-12-24']
)

freq_avg = data.columns.tolist()[5]
freq = data.columns.tolist()[3]
beg_train, end_train, end_test = date_range[freq]

# Add constant
data['const'] = 1

# Normalize average sales variable.
mu = data[freq_avg].mean()
std = data[freq_avg].std()
data[freq_avg] = (data[freq_avg] - mu) / std

data_train = data[data[freq].ge(beg_train) & data[freq].le(end_train)]
data_test = data[data[freq].gt(end_train) & data[freq].le(end_test)]

X_train = data_train.iloc[:,5:].as_matrix()
y_train = data_train.iloc[:,0].as_matrix()

X_test = data_test.iloc[:,5:].as_matrix()
y_test = data_test.iloc[:,0].as_matrix()


## Train Model

This is where we actually train the model. I ran it for 200 iterations - more won't likely increase the predictive power of the model, but there are some other diagnostics we can run to see what other improvements we can make.

In [467]:
from sklearn import linear_model
clf = linear_model.SGDRegressor(n_iter=200)
clf.fit(X_train, y_train)

SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', n_iter=200, penalty='l2', power_t=0.25,
       random_state=None, shuffle=True, verbose=0, warm_start=False)

## Evaluate Model

In [468]:
predict = clf.predict(X_test)
predict_neg = predict < 0
error = predict - y_test
error_neg = predict_neg @ error
np.savetxt('modelparams.csv', clf.predict(np.eye(X_test.shape[1])), delimiter=",")
print('R-squared: {:.{p}f}'.format(clf.score(X_test, y_test), p=4))
print('Total error in sales quantity: {:.{p}f}'.format(sum(error), p=0))
print('Total error as a % of actual: {:.{p}f}%'.format(sum(error) / sum(y_test)*100, p=2))
print('Total error in sales quantity with zero min prediction: {:.{p}f}'.format(sum(error)-error_neg, p=0))
print('Total error as a % of actual with zero min prediction: {:.{p}f}%'.format((sum(error)-error_neg) / sum(y_test)*100, p=2))

R-squared: 0.4832
Total error in sales quantity: 13040
Total error as a % of actual: 7.62%
Total error in sales quantity with zero min prediction: 15973
Total error as a % of actual with zero min prediction: 9.33%


This is still quite a good fit overall. As you would expect, it is not quite as accurate as the model that was based on seven day periods.

In [469]:
data_predict = data_test.iloc[:,:5]
data_predict[freq_avg] = data[freq_avg] * std + mu
data_predict['predict'] = predict
data_predict['variance'] = predict - data_predict['sales']
data_predict.head()

Unnamed: 0,sales,store,item,freq3_end,freq3_per,freq3_end_avg,predict,variance
394699,2.0,SEGWD65,41792,2016-10-31,0,2.0,2.471349,0.471349
284796,0.0,SEGWD2366,42043,2016-10-31,0,0.0,0.092782,0.092782
435873,0.0,SEGWD70,41799,2016-10-31,0,0.0,0.044274,0.044274
250615,6.0,SEGWD2306,41791,2016-10-31,0,1.0,1.691491,-4.308509
355037,4.0,SEGWD5,41791,2016-10-31,0,0.0,2.0562,-1.9438


In [470]:
py.iplot(data_predict.groupby(freq).sum()[['sales', 'predict']].iplot(asFigure=True,
                               kind='bar',xTitle='Dates',yTitle='Sales',title='Actual vs. Predicted'))

In [471]:
data_predict.groupby(freq).sum()[['sales', 'predict', 'variance']]

Unnamed: 0_level_0,sales,predict,variance
freq3_end,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-10-31,5723.0,6504.32005,781.32005
2016-11-03,9128.0,9743.936035,615.936035
2016-11-05,10153.0,6612.880036,-3540.119964
2016-11-07,6025.0,6391.231122,366.231122
2016-11-10,9250.0,9919.641314,669.641314
2016-11-12,6259.0,8091.76443,1832.76443
2016-11-14,6000.0,6505.838022,505.838022
2016-11-17,9387.0,9965.939466,578.939466
2016-11-19,6564.0,6614.018515,50.018515
2016-11-21,6015.0,6496.350696,481.350696


In [472]:
pivot = pd.pivot_table(data_predict,index=['store','item'],columns=[freq],values=['sales', 'predict', 'variance'],aggfunc=[np.sum])
pivot.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum,sum
Unnamed: 0_level_1,Unnamed: 1_level_1,sales,sales,sales,sales,sales,sales,sales,sales,sales,sales,...,variance,variance,variance,variance,variance,variance,variance,variance,variance,variance
Unnamed: 0_level_2,freq3_end,2016-10-31,2016-11-03,2016-11-05,2016-11-07,2016-11-10,2016-11-12,2016-11-14,2016-11-17,2016-11-19,2016-11-21,...,2016-12-03,2016-12-05,2016-12-08,2016-12-10,2016-12-12,2016-12-15,2016-12-17,2016-12-19,2016-12-22,2016-12-24
store,item,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3,Unnamed: 22_level_3
SEGWD103,41774,3.0,3.0,0.0,2.0,1.0,2.0,0.0,2.0,1.0,2.0,...,0.258669,0.255629,0.985904,1.258669,0.255629,1.606411,-1.120824,0.255629,0.226918,1.638162
SEGWD103,41775,0.0,2.0,2.0,1.0,3.0,0.0,1.0,2.0,0.0,0.0,...,-2.291824,1.084629,0.435411,-1.153345,0.705136,0.435411,-0.153345,-0.294864,1.435411,1.467162
SEGWD103,41776,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,...,0.478322,0.475282,0.826064,0.478322,-0.524718,0.826064,0.478322,0.854775,-0.173936,0.478322
SEGWD103,41777,3.0,0.0,4.0,0.0,1.0,1.0,0.0,5.0,0.0,2.0,...,-0.424586,-0.668639,-0.076843,0.954908,0.331361,-0.69735,-0.424586,0.951868,-0.317857,0.954908
SEGWD103,41778,0.0,2.0,1.0,0.0,2.0,0.0,1.0,4.0,1.0,1.0,...,-0.433597,0.563363,0.673131,-0.054104,-1.436637,-0.706362,0.945896,-0.677651,-0.326869,0.566403
SEGWD103,41779,1.0,1.0,0.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0,...,-0.313058,0.442889,-1.206329,0.066435,0.063396,0.173164,0.066435,-0.936604,-1.206329,1.066435
SEGWD103,41780,1.0,4.0,1.0,0.0,2.0,0.0,0.0,3.0,2.0,0.0,...,-0.304023,-0.307063,-1.576787,-0.92453,-0.927569,0.182199,0.454963,0.451924,0.802706,-0.92453
SEGWD103,41781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.071627,-0.074667,0.276116,-0.071627,-0.074667,0.276116,-0.071627,-0.074667,0.276116,-0.071627
SEGWD103,41782,1.0,1.0,1.0,3.0,1.0,2.0,1.0,1.0,2.0,1.0,...,1.584037,0.822011,1.311273,-2.174949,-0.177989,1.552286,-0.03647,0.201504,0.172793,-1.415963
SEGWD103,41783,2.0,3.0,3.0,0.0,3.0,2.0,2.0,2.0,1.0,2.0,...,-0.903262,0.714205,1.203466,-1.523769,-2.285795,-2.55552,0.855724,0.852684,1.582959,1.096738


In [473]:
pivot.to_csv('pivot_' + freq + '_v2.csv')

## Improvement Opportunities
* Run model diagnostics to understand bias and variance.
* Analyze the errors (cut by store, by item, by period to try to identify areas where the errors are systematically larger. Explore segmenting or grouping stores and/or products.
* Longer history of sales going back at least another year to be able to factor in normal seasonality.
* Evaluation alternative optimization algorighthms beyond regression.
