# Time Series Exploration Exercises

In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import TimeSeriesSplit
import statsmodels.api as sm

import prepare
import acquire

# Notebook-wide matplotlib defaults; they apply to every figure created below
# unless a cell passes its own figsize.
# plt.style.use('seaborn-whitegrid')  # style sheet currently disabled
plt.rc('figure', figsize=(13, 7))  # default figure size
plt.rc('font', size=16)  # default font size

### Store Data

In [None]:
# Acquire the raw store data and pass it straight through the project's
# prepare step; `df` ends up holding the prepared result.
df = prepare.prep_store_data(acquire.wrangle_store_data())

In [None]:
# Preview the first rows of the prepared store data.
df.head()

In [None]:
# Smallest and largest index values, i.e. the span of dates the data covers
# (presumably a DatetimeIndex set by prepare.prep_store_data -- confirm).
df.index.min(), df.index.max()

In [None]:
# Train/test split by calendar year: everything up to and including 2016 is
# used for training, and 2017 is held out for testing.
#
# Rows are selected with .loc because a bare df['2017'] is a *column* lookup
# in pandas >= 2.0 (row selection by date string through [] was removed) and
# raises KeyError there.

train = df.loc[:'2016']
test = df.loc['2017']

In [None]:
# Draw both partitions on the same axes: the daily mean of sales_total for
# the training rows, then for the test rows.

train['sales_total'].resample('D').mean().plot()
test['sales_total'].resample('D').mean().plot()

In [None]:
# y: training-set sales_total averaged per calendar day.

y = train.resample('D')['sales_total'].mean()
y.head()

In [None]:
# Compare two weekly smoothings of the daily series:
#   - resample('W'): one mean per calendar week
#   - rolling(7):    a mean over a sliding 7-observation window, one per day

sales_weekly_resampled = y.resample('W').mean()
sales_weekly_rolling = y.rolling(7).mean()

sales_weekly_resampled.plot(label='Resampled Weekly', alpha=.6, color='blue')
sales_weekly_rolling.plot(label='Rolling Weekly Average', alpha=.6, color='red')
plt.legend()

In [None]:
# Plot the day-over-day difference in mean sales to see whether sales jump
# around from one day to the next or change smoothly.

diff_ax = y.diff().plot()
diff_ax.set_title('Daily Sales Difference')
diff_ax.set_ylabel('Difference in sales from one day to the next')

In [None]:
# Relative day-over-day change: (today - yesterday) / yesterday.
# The values are fractions (0.1 means +10%); they are not multiplied by 100.

daily_change = y.diff() / y.shift()
change_ax = daily_change.plot(alpha=.5)
change_ax.set_title('Daily % Change in Sales Total')
change_ax.set_ylabel('% Change')

Let's take a look at this week over week and month over month as well.

In [None]:
# Two stacked panels sharing both axes: relative change of the weekly means
# on top, relative change of the monthly means below.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True)

# Week over week.
weekly = y.resample('W').mean()
weekly_change = weekly.diff() / weekly.shift()
weekly_change.plot(ax=ax1)
ax1.set(xlabel='', ylabel='% Change', title='Weekly % Change in Sales Total')

# Month over month.
monthly = y.resample('M').mean()
monthly_change = monthly.diff() / monthly.shift()
monthly_change.plot(ax=ax2)
ax2.set(title='Monthly % Change in Sales Total')

In [None]:
# Display the weekly means of the daily sales series.
y.resample('W').mean()

In [None]:
# Autocorrelation of the weekly-mean sales series.

weekly_sales = y.resample('W').mean()
pd.plotting.autocorrelation_plot(weekly_sales)

We can see the yearly seasonality in the autocorrelation plot above

In [None]:
# Run statsmodels' seasonal_decompose on the weekly-mean sales and draw the
# resulting decomposition figure.
sales_decomposition = sm.tsa.seasonal_decompose(y.resample('W').mean())
sales_decomposition.plot()
None  # keeps the notebook from echoing the figure a second time

****
#### Using your OPS data you prepped in lesson 2 exercises:

In [None]:
# Read the OPSD daily CSV from GitHub and pass it through the project's
# prepare step.
opsd_url = "https://raw.githubusercontent.com/jenfly/opsd/master/opsd_germany_daily.csv"
power = prepare.prep_opsd_data(pd.read_csv(opsd_url))

In [None]:
# Preview the first rows of the prepared OPSD data.
power.head()

In [None]:
# Count the missing values in each column.
power.isna().sum()

In [None]:
# Training portion of the power data: every row up to and including 2013.
# Note that this rebinds `train`, which previously held the store-data split.

train = power.loc[:'2013']
train.head()


In [None]:
# Overlay daily consumption, its 90-day rolling mean, and the wind and solar
# series on one set of axes.

train['consumption'].plot(label="consumption")

consumption_90d_mean = train['consumption'].rolling('90D').mean()
plt.plot(consumption_90d_mean, label="Rolling Mean")

train['wind'].plot()
train['solar'].plot()
plt.legend()

In [None]:
# Zoom in on a single year (2010) to look for patterns within a year.
consumption_2010 = train.loc['2010-01':'2010-12', 'consumption']

plt.figure(figsize=(12, 8))
plt.plot(consumption_2010, linewidth=0.5)
plt.title("consumption in 2010")
plt.show()

There is a weekly pattern (i.e. within each week), as well as a pattern across the weeks of a month.

In [None]:
# Mean consumption per calendar month.

train['consumption'].resample('M').mean().plot()

In [None]:
# Mean consumption per calendar year; the y-axis is clamped to 1100-1400 so
# the year-to-year differences are visible.
train['consumption'].resample('Y').mean().plot()
plt.ylim(1100, 1400)

In [None]:
# Replace every NaN with 0. From here on, missing readings are treated as
# real zeros by the sums and means computed in later cells.
train = train.fillna(value=0)

In [None]:
# Build a separate `sources` dataframe with the three measured series.
# .copy() makes it an independent frame, so adding a column below does not
# write into a slice of `train` (the chained-assignment / SettingWithCopy
# hazard, which the notebook's global warnings filter would otherwise hide).
sources = train[['consumption', 'wind', 'solar']].copy()

# conventional = consumption - wind - solar, i.e. whatever part of consumption
# is not accounted for by wind or solar.
sources['conventional'] = sources['consumption'] - sources['wind'] - sources['solar']

In [None]:
# Stacked bar chart of the yearly totals of each power source.

yearly_totals = sources[['conventional', 'wind', 'solar']].resample('Y').sum()
ax = yearly_totals.plot.bar(stacked=True)
ax.set_ylim(0, 700000)
ax.legend(ncol=3)

# The tick labels are full timestamps; reduce each one to its four-digit year.
labels = []
for tick in ax.get_xticklabels():
    labels.append(pd.to_datetime(tick.get_text()).strftime('%Y'))
ax.set_xticklabels(labels);

#### Plot the weekly average & the 7-day moving average. Compare the 2 plots.

In [None]:
# Weekly-resampled mean vs. 7-observation rolling mean of consumption.
train['consumption'].resample('W').mean().plot()
train['consumption'].rolling(7).mean().plot()

They are virtually the same.

#### Group the electricity consumption time series by month of year, to explore annual seasonality.

In [None]:
# Derive calendar features from the index: month number and weekday name.
train_index = train.index
train['month'] = train_index.month
train['weekday'] = train_index.day_name()

In [None]:
# Distribution of consumption for each month of the year.
sns.boxplot(x='month', y='consumption', data=train)


Consumption drops in summer months

In [None]:
# Distribution of consumption for each day of the week.
# Without an explicit order the boxes are not pinned to calendar order, so
# the weekday axis need not read Monday to Sunday. The names below are what
# DatetimeIndex.day_name() returns for the default (English) locale.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                 'Friday', 'Saturday', 'Sunday']
sns.boxplot(data=train, x='weekday', y='consumption', order=weekday_order)

It seems that consumption drops on weekends.

#### Subseasonal Plot

In [None]:
# Build a year x month table of mean consumption.
# Rows are keyed by 'YYYY'; columns are keyed by 'MM-Mon' (e.g. '01-Jan') so
# that they sort in calendar order.
y = train['consumption']
year_key = y.index.strftime('%Y')
month_key = y.index.strftime('%m-%b')
table = y.groupby([year_key, month_key]).mean().unstack()

table.head()

In [None]:
# Seasonal subseries plot: one narrow panel per month in a single row, each
# showing that month's mean consumption across the years, with a dashed line
# at the month's overall mean.
fig, axs = plt.subplots(1, 12, sharey=True, sharex=True, figsize=(25, 8))

# DataFrame.items() yields (column label, column Series) pairs; it replaces
# DataFrame.iteritems(), which was removed in pandas 2.0.
for ax, (month, subset) in zip(axs, table.items()):
    subset.plot(ax=ax, title=month)
    x_left, x_right = ax.get_xlim()
    ax.hlines(subset.mean(), x_left, x_right, ls='--')
    ax.set(xlabel='')

fig.suptitle('Seasonal Subseries Plot')  # super-title for the overall figure
fig.subplots_adjust(wspace=0)  # no horizontal gap between the month panels

In [None]:
# Same subseries view laid out as a 3 x 4 grid, one panel per month.
fig, axs = plt.subplots(3, 4, sharex=False, sharey=True, figsize=(25, 8))

# DataFrame.items() replaces DataFrame.iteritems(), removed in pandas 2.0.
for ax, (month, subset) in zip(axs.ravel(), table.items()):
    # Column labels look like '01-Jan'; drop the 'MM-' prefix for the title.
    subset.plot(ax=ax, title=month[3:])
    ax.hlines(subset.mean(), *ax.get_xlim(), ls='--')

# tight_layout() recomputes the subplot spacing, so the earlier
# subplots_adjust(wspace=0) call had no visible effect and was dropped.
plt.tight_layout()

In [None]:
# Monthly wind totals from 2010 onward, to show the within-year pattern and
# the year-to-year variation.

wind_monthly_total = train.loc['2010':, 'wind'].resample('M').sum()
ax = wind_monthly_total.plot()
ax.set(title='Wind production pattern')

Wind production drops in summer months

In [None]:
# Monthly solar totals from 2010 onward, to show the within-year pattern and
# the year-to-year variation.

solar_monthly_total = train.loc['2010':, 'solar'].resample('M').sum()
ax = solar_monthly_total.plot()
ax.set(title='Solar production pattern')

Solar production peaks in summer

In [None]:
# Annual GDP growth rate keyed by year, 2006-2017.
# NOTE(review): presumably percentages for Germany, to match the OPSD data;
# the source of these figures is not cited here -- confirm.

growth_rate = {
    2006: 3.815,
    2007: 2.984,
    2008: 0.962,
    2009: -5.697,
    2010: 4.179,
    2011: 3.924,
    2012: 0.42,
    2013: 0.428,
    2014: 2.226,
    2015: 1.74,
    2016: 2.23,
    2017: 2.465,
}

In [None]:
# Turn the {year: rate} dict into a one-column dataframe: the dict keys
# become the index and the values go into a 'growth_rate' column.
growth_rate = pd.DataFrame.from_dict(
    data=growth_rate,
    orient='index',
    columns=['growth_rate'],
)
growth_rate.head()

In [None]:
# Convert the year index into a DatetimeIndex by parsing each year with the
# '%Y' format.
growth_rate.index = pd.to_datetime(growth_rate.index, format = '%Y' )

In [None]:
# check the dataframe here, now that the index has been converted
growth_rate.head()

In [None]:
# Compare yearly total power consumption (top panel) with the GDP growth rate
# (bottom panel) on a shared time axis.
#
# The figure size is set once, here. The original passed figsize=(16, 9) to
# subplots() and then figsize=(12, 6) to .plot(ax=ax1), which resized the
# existing figure; (12, 6) is the size that was actually rendered, so it is
# kept. The unused `ax = ...` binding was dropped.

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(12, 6))

# This uses the full `power` frame rather than `train`, so the consumption
# series is not cut off at 2013 the way `train` is.
power.consumption.resample('Y').sum().plot(ax=ax1)
ax1.set(title='Power Consumption')
ax1.set_ylabel('Power Consumption')

growth_rate.plot(ax=ax2)
ax2.set(title='GDP Growth Rate')
ax2.set_ylabel('Growth Rate')

#### Plot a time series decomposition. Takeaways?

In [None]:
# Additive seasonal decomposition of the weekly-mean consumption.
weekly_consumption = train['consumption'].resample('W').mean()
decomposition = sm.tsa.seasonal_decompose(weekly_consumption, model='additive')

In [None]:
# Draw the decomposition figure; the trailing None keeps the notebook from
# echoing the returned figure a second time.
decomposition.plot()
None

There is an annual seasonality. Energy consumption drops during December.

In [None]:
# Autocorrelation of the weekly-mean consumption series.
weekly_mean_consumption = train['consumption'].resample('W').mean()
pd.plotting.autocorrelation_plot(weekly_mean_consumption)


We can see the annual pattern in the autocorrelation plot above.

In [None]:
# Is there a weekly seasonality in this data?
# Look at the autocorrelation of the daily values for January-March 2010.
first_quarter_2010 = train.loc['2010-01':'2010-03', 'consumption']
pd.plotting.autocorrelation_plot(first_quarter_2010)

Yes!