# TSA: Explore

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm

In [None]:
# Apply the style sheet first: 'ggplot' carries its own font.size, so calling
# plt.style.use() after plt.rc() would silently undo the font setting below.
plt.style.use('ggplot')
# Notebook-wide overrides layered on top of the style sheet.
plt.rc('figure', figsize=(13, 7))
plt.rc('font', size=16)

## Store Data

In [None]:
# Load the store/item sales data and index it by sale date.
# The index is sorted as a precaution: the train/test split slices it by date
# label, and pandas >= 2.0 raises KeyError when partially slicing a
# non-monotonic DatetimeIndex with a bound that is not an exact label.
df = pd.read_csv('store_item_demand.csv')
df['sale_date'] = pd.to_datetime(df['sale_date'])
df = df.set_index('sale_date').sort_index()

We're going to use the last year of data (2017) as the test dataset.

In [None]:
# Earliest and latest sale dates present in the index.
(df.index.min(), df.index.max())

In [None]:
# Everything up to and including 2016 is training data; 2017 is held out.
train, test = df.loc[:'2016'], df.loc['2017']

In [None]:
# Daily average sale amount for each split, drawn on one labelled axes so the
# two lines can be told apart.
fig, ax = plt.subplots()
train.resample('D').sale_amount.mean().plot(ax=ax, label='train')
test.resample('D').sale_amount.mean().plot(ax=ax, label='test')
ax.set(title='Average Daily Sales Amount: Train vs. Test', ylabel='Average sale amount')
ax.legend();

In [None]:
# Target series for the exploration: mean sale amount per calendar day (train only).
y = train.resample('D')['sale_amount'].mean()

> Plot the daily difference. Observe whether sales seem to vary drastically from day to day or transition more smoothly.

In [None]:
# Absolute change in the daily average from one day to the next.
ax = y.diff().plot()
ax.set_title('Daily Sales Amount Difference')
ax.set_ylabel('Difference in sales from one day to the next')

Maybe it would be helpful to look at percent change as well:

Percent change is defined as the amount of change as a percentage of the old value.

In [None]:
# Day-over-day change divided by the previous day's value.
ax = y.diff().div(y.shift()).plot(alpha=.7)
ax.set_title('Daily % Change in Sales Amount')
ax.set_ylabel('% Change')

Let's take a look at this week over week and month over month as well.

In [None]:
# Week-over-week and month-over-month relative change, stacked on axes that
# share both x and y so the two frequencies can be compared directly.
fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, sharey=True)

weekly = y.resample('W').mean()
(weekly.diff() / weekly.shift()).plot(ax=ax1)
ax1.set(title='Weekly % Change in Sales Total', ylabel='% Change', xlabel='')
# axhline spans the full axes width whatever the final x-limits turn out to
# be. The previous hlines() call took its extent from ax2's limits, which were
# read before anything had been plotted on ax2.
ax1.axhline(0, color='black', alpha=.6, ls='--')

monthly = y.resample('M').mean()
(monthly.diff() / monthly.shift()).plot(ax=ax2)
ax2.set(title='Monthly % Change in Sales Total')
ax2.axhline(0, color='black', alpha=.6, ls='--')

(NB There's also a `pct_change` Series method.)

> Plot a time series decomposition.

In [None]:
# Seasonal decomposition of the daily series; the trailing semicolon keeps the
# returned figure from being echoed as the cell's output.
sm.tsa.seasonal_decompose(y).plot();

This doesn't look terribly useful, so let's try a different frequency.

In [None]:
# Same decomposition on weekly averages; the trailing semicolon suppresses the
# cell's output value.
sm.tsa.seasonal_decompose(
    y.resample('W').mean()
).plot();

> Create a lag plot (day over day).

In [None]:
# Scatter each day's value against the following day's (lag of one step).
pd.plotting.lag_plot(y, lag=1)

Let's look at multiple lag periods for the monthly data:

In [None]:
# One panel per lag, 1 through 12, on a shared 4x3 grid of the monthly series.
fig, axs = plt.subplots(nrows=4, ncols=3, sharex=True, sharey=True)
for lag, ax in enumerate(axs.flat, start=1):
    pd.plotting.lag_plot(monthly, lag=lag, ax=ax)
    # The panel title is the lag itself; axis labels are blanked to save space.
    ax.set(title=lag, xlabel='', ylabel='')
fig.tight_layout()

> Run a lag correlation.

In [None]:
# Correlation of the series with copies of itself shifted one and two steps
# ahead (shift(-k) aligns y(t + k) with y(t)).
pd.DataFrame({
    'y': y,
    **{f'y(t + {k})': y.shift(-k) for k in (1, 2)},
}).corr()

In [None]:
# Lag plots of the weekly series at a handful of selected lags, one per panel.
fig, axs = plt.subplots(nrows=2, ncols=3, sharex=True, sharey=True)

for lag, ax in zip((1, 12, 26, 45, 52, 76), axs.flat):
    pd.plotting.lag_plot(weekly, lag=lag, ax=ax)
    ax.set(title=f'lag = {lag}', xlabel='', ylabel='')

In [None]:
# Autocorrelation of the weekly-average series across all available lags.
pd.plotting.autocorrelation_plot(
    y.resample('W').mean()
)

To compare:

In [None]:
# Comparison baseline: 1000 independent standard-normal draws.
# A seeded generator keeps the figure identical across re-runs; numpy is
# imported with the other packages in the notebook's import cell.
x = np.random.default_rng(42).standard_normal(1000)
pd.plotting.autocorrelation_plot(x)
plt.title('Autocorrelation of Random Noise')

## German Energy Data

In [None]:
# Read the opsd_germany_daily.csv file and convert its Date column to timestamps.
df = (
    pd.read_csv('https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv')
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)
print(df.shape)
df.head()

In [None]:
# Replace missing values with 0 and keep the result: fillna() returns a new
# frame, so the bare call only displayed it and left `df` unchanged.
# NOTE(review): assumes persisting the fill was the intent — confirm that 0 is
# a sensible stand-in for the missing readings.
df = df.fillna(0)
df.head()

Using the percent cutoff method:

In [None]:
# Positional split: the first `train_pct` share of rows is train, the rest is test.
train_pct = .8
n = len(df)
test_start_index = round(train_pct * n)

# Index each part by Date and resample to daily means.
train, test = (
    part.set_index('Date').resample('D').mean()
    for part in (df.iloc[:test_start_index], df.iloc[test_start_index:])
)

In [None]:
# Consumption for each split on one labelled axes so the train and test
# segments can be told apart.
fig, ax = plt.subplots()
train.Consumption.plot(ax=ax, label='train')
test.Consumption.plot(ax=ax, label='test')
ax.set(title='Consumption: Train vs. Test', ylabel='Consumption')
ax.legend();

In [None]:
# From here on `y` is the training-split Consumption series.
y = train['Consumption']

In [None]:
# Mean consumption per (year, month) pair. Unstacking level 0 moves the years
# into the columns, leaving the '%m-%b' month labels as rows.
table = (
    y.groupby([y.index.strftime('%Y'), y.index.strftime('%m-%b')])
    .mean()
    .unstack(level=0)
)
sns.heatmap(table, cmap='Blues')

In [None]:
# One panel per month showing how that month's average consumption moves from
# year to year, with a dashed line at the month's mean across years.

# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*';
# fall back to the old name on earlier releases.
if 'seaborn-v0_8-whitegrid' in plt.style.available:
    panel_style = 'seaborn-v0_8-whitegrid'
else:
    panel_style = 'seaborn-whitegrid'

with plt.style.context(panel_style):
    fig, axs = plt.subplots(2, 6, sharex=True, sharey=True)

# DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent.
for ax, (month, x) in zip(axs.ravel(), table.T.items()):
    # Row labels look like '01-Jan'; drop the numeric sort prefix for the title.
    x.plot(ax=ax, title=month[3:])
    # axhline spans the panel regardless of the final x-limits.
    ax.axhline(x.mean(), color='black', alpha=.5, ls='--')

# Lay out after plotting so the titles are accounted for, then close the
# horizontal gaps; called in the other order, tight_layout() resets wspace.
fig.tight_layout()
fig.subplots_adjust(wspace=0)



In [None]:
# Day-over-day change in consumption divided by the previous day's value.
ax = y.diff().div(y.shift()).plot()
ax.set_title('Daily % Change in Consumption')

In [None]:
# Seasonal decomposition of monthly-average consumption; the trailing
# semicolon suppresses the cell's output value.
sm.tsa.seasonal_decompose(
    y.resample('M').mean()
).plot();

## Bonus

- German Energy Data: show changing source over time
- Store Item Data: visualize different items over time