In [None]:
%matplotlib inline
from matplotlib.pylab import plt

import pandas as pd
import numpy as np

In [None]:
# Load the AirPassengers data. The file's own header row (row 0) is replaced by the
# column names given here; column 0 is parsed as dates and becomes the index.
air_passengers = pd.read_csv(
    "data/AirPassengers.csv",
    header=0,
    names=['Month', 'Passengers'],
    index_col=0,
    parse_dates=[0],
)

In [None]:
# Peek at the first five rows to confirm the dates were parsed into the index.
air_passengers.head(n=5)

In [None]:
# Convert the DatetimeIndex to a PeriodIndex so each row stands for a whole period
# (the frequency is inferred from the index) rather than a single timestamp.
# The guard keeps this cell re-runnable: to_period() raises a TypeError when the
# index has already been converted.
if not isinstance(air_passengers.index, pd.PeriodIndex):
    air_passengers = air_passengers.to_period()

In [None]:
# Label-based row slice using date strings as the bounds.
air_passengers.loc['1950-06-01':'1950-06-03']

In [None]:
# Line plot of the raw series, for a first look at its overall shape.
air_passengers.plot(kind='line')

In [None]:
# The first thing to check before applying common time series analysis techniques:
# is the series stationary, i.e. does it have a constant mean, a constant variance,
# and an autocovariance that does not depend on time?

In [None]:
# First, informal test: plot the moving average over a 20-observation window.
(
    air_passengers
    .rolling(window=20)
    .mean()
    .plot()
)

In [None]:
# Can you plot the variance?

In [None]:
# Can you plot the autocovariance?

In [None]:
# How should you size your window?

In [None]:
# A more formal stationarity check: the Dickey-Fuller test (adfuller),
# with the lag selection method set to AIC.
from statsmodels.tsa.stattools import adfuller

dftest = adfuller(air_passengers['Passengers'], autolag='AIC')

In [None]:
# Display the raw value returned by adfuller.
dftest

# What do these numbers mean?

In [None]:
# Can we write a function to output these #s sensibly?
# Check out https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.adfuller.html

In [None]:
# Are we stuck not doing any analysis now that our time series is not stationary? Hint: no.
# How can we make it stationary?
# Why is it non-stationary? (There are two reasons.)

In [None]:
# First, remove the part of the non-stationarity that comes from the trend.
# One way is a power or log transformation, which compresses larger values
# more than smaller ones.
# np.log is applied to the whole column at once (vectorized) rather than element by
# element through apply/lambda; the resulting Series has the same index and name.
log_passengers = np.log(air_passengers.Passengers)
log_passengers.plot()

In [None]:
# Compare with the original series: after the log transform, the size of the
# swings should change less over time than it does here.
air_passengers.plot(kind='line')

In [None]:
# A trend still remains and has to be removed.
# Estimate it with a rolling mean -- experiment with the window size.
(
    air_passengers
    .rolling(window=12)
    .mean()
    .plot()
)

In [None]:
# Detrend by subtracting the 12-observation rolling mean from the original series.
# Rows without a full window have no rolling mean, so they come out as NaN.
rolling_mean = air_passengers.rolling(window=12).mean()
passengers_detrended = air_passengers.sub(rolling_mean)
passengers_detrended.plot()

In [None]:
# Exercise: try detrending after taking the log. How does that look?

In [None]:
# Detrend with a linear regression on time rather than a rolling mean.
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant

# Regressor: the integer time step 0..n-1. OLS does not include an intercept on its
# own, so a constant column is prepended; without it the fitted line is forced
# through the origin and misfits any series that does not start near zero.
time_index = np.arange(len(air_passengers))
design = add_constant(time_index)  # columns: [constant, time step]

model = OLS(air_passengers.Passengers.values, design)
result = model.fit()  # result.params is now [intercept, slope]
fit = pd.Series(result.predict(design), index = air_passengers.index)

# Whatever the fitted line does not explain is the detrended series.
passengers_detrended = air_passengers.Passengers - fit
passengers_detrended.plot()

In [None]:
# Now let's take a look at seasonality


In [None]:
# When might a rolling average work better/worse than a regression?

In [None]:
# How might we be able to eliminate both trend and seasonality?
# Looking for two answers...

In [None]:
# Differencing is one common technique; start from log_passengers.
# Each value becomes the change from the previous observation (the first is NaN).
log_passengers_diff = log_passengers.diff()
log_passengers_diff.plot()

In [None]:
# Let's again test for stationarity with a gut level check.
# And let's write a function to do it since this seems like something we'll have to do a lot

In [None]:
# Switch the index from periods back to timestamps.
# NOTE(review): presumably needed so the decomposition further down gets a DatetimeIndex -- confirm.
# The guard keeps this cell re-runnable: to_timestamp() raises a TypeError when the
# index is no longer a PeriodIndex.
if isinstance(log_passengers.index, pd.PeriodIndex):
    log_passengers = log_passengers.to_timestamp()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Fill any missing values by interpolation before decomposing. The name is rebound
# to the result instead of mutating the Series with inplace=True, so the cell has
# no hidden side effect on an object another cell may still be displaying.
log_passengers = log_passengers.interpolate()
decomposition = seasonal_decompose(log_passengers)

In [None]:
# Pull the three components out of the decomposition result in one step.
trend, seasonal, residual = (
    decomposition.trend,
    decomposition.seasonal,
    decomposition.resid,
)

In [None]:
# Plot the seasonal and trend components added together.
seasonal.add(trend).plot()

In [None]:

# Four stacked panels (4 rows x 1 column): the log series and its three components.
# Each entry is (subplot position code, series to draw, legend label).
panels = [
    (411, log_passengers, 'Original'),
    (412, trend, 'Trend'),
    (413, seasonal, 'Seasonality'),
    (414, residual, 'Residuals'),
]
for position, series, label in panels:
    plt.subplot(position)
    plt.plot(series, label=label)
    plt.legend(loc='best')
plt.tight_layout()