In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller

from pmdarima import auto_arima

import warnings
warnings.filterwarnings("ignore")

from pylab import rcParams
%matplotlib inline
rcParams['figure.figsize'] = 10, 5

## Load data, set the date index, and decompose

In [None]:
# Read the passenger CSV, parse the 'Month' column to datetimes and use it as
# the index. The trailing expression displays (rows, columns).
file = 'airline-passengers.csv'
airline = (
    pd.read_csv(file)
    .assign(Month=lambda frame: pd.to_datetime(frame['Month']))
    .set_index('Month')
)
airline.shape

In [None]:
# Preview the first rows of the indexed frame.
airline.head()

In [None]:
# Summary statistics of the frame's columns.
airline.describe()

In [None]:
# Line plot of the frame against its Month index; the trailing ';' hides the Axes repr.
airline.plot();

In [None]:
# Decompose the passenger series. No `model`/`period` arguments are passed, so
# statsmodels' defaults apply (the period is presumably inferred from the
# datetime index — TODO confirm).
decomp = seasonal_decompose(airline['Passengers'])

In [None]:
# Show the decomposition result's built-in multi-panel plot.
decomp.plot();

In [None]:
# Autocorrelation of the extracted seasonal component.
plot_acf(decomp.seasonal);

In [None]:
# Positional train/test split: the first `n` rows train the models, the rest
# are held out. NOTE(review): with the commonly used 144-row version of this
# dataset that leaves the final 12 rows for testing — confirm against the file.
n = 132
# .copy() makes both frames independent objects. Later cells add prediction
# columns to them, and assigning to a bare slice raises SettingWithCopyWarning
# (currently hidden by the blanket warnings filter).
airline_train = airline[['Passengers']].iloc[:n].copy()
airline_test = airline[['Passengers']].iloc[n:].copy()

## Determine if data is stationary

In [None]:
# Augmented Dickey-Fuller test on the raw series; the bare name displays the result.
# NOTE(review): this uses the full series, including the rows held out in airline_test.
test_result = adfuller(airline['Passengers'])
test_result

In [None]:
def adfuller_test(Passengers):
    """Run an Augmented Dickey-Fuller test and print a labelled report.

    The first four entries returned by ``adfuller`` are printed with labels,
    followed by a verdict that compares the second entry (the p-value)
    against 0.05. Nothing is returned.

    NOTE: adapted from an external source; the original attribution still
    needs to be located.

    Parameters
    ----------
    Passengers : array-like
        Series handed directly to ``adfuller``.
    """
    adf_output = adfuller(Passengers)
    field_names = ('ADF Test Statistic', 'p-value', '#Lags Used', 'Number of Observations')

    # zip stops after the four labelled fields; any further entries of the
    # result are intentionally not printed.
    for field_name, field_value in zip(field_names, adf_output):
        print(f'{field_name} : {field_value!s}')

    rejects_null = adf_output[1] <= 0.05
    verdict = (
        "strong evidence against the null hypothesis(Ho), reject the null hypothesis. Data is stationary"
        if rejects_null
        else "weak evidence against null hypothesis,indicating it is non-stationary "
    )
    print(verdict)

In [None]:
# Labelled ADF report for the undifferenced passenger series.
adfuller_test(airline['Passengers'])

In [None]:
# pandas' autocorrelation plot of the raw passenger series.
pd.plotting.autocorrelation_plot(airline['Passengers']);

In [None]:
# Candidate differencing schemes, computed before any modelling:
#   - first difference      : change from the previous row
#   - seasonal difference   : change from 12 rows earlier
#   - second difference     : first difference of the first difference
# The leading rows of each new column are NaN. Columns are added to `airline` in place.
passengers = airline['Passengers']
airline['First Difference'] = passengers.diff()
airline['Seasonal First Difference'] = passengers.diff(12)
airline['Second Difference'] = airline['First Difference'].diff()
airline.head()

In [None]:
# ADF test on the first difference (change between consecutive rows; rows are
# presumably monthly given the 'Month' index). NaNs from differencing are dropped.
adfuller_test(airline['First Difference'].dropna())

In [None]:
# Visual check of the first-differenced series.
airline['First Difference'].plot();

In [None]:
# ACF (top panel) and PACF (bottom panel) of the first difference, 40 lags each.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 4))
first_diff = airline['First Difference'].dropna()
fig = plot_acf(first_diff, lags=40, ax=ax1)
fig = plot_pacf(first_diff, lags=40, ax=ax2, method='ywm')
plt.tight_layout()

In [None]:
# ADF test on the lag-12 (seasonal) difference; NaNs from differencing are dropped.
adfuller_test(airline['Seasonal First Difference'].dropna())

In [None]:
# Visual check of the lag-12 differenced series.
airline['Seasonal First Difference'].plot();

In [None]:
# ACF (top panel) and PACF (bottom panel) of the lag-12 seasonal difference, 40 lags each.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 4))
seasonal_diff = airline['Seasonal First Difference'].dropna()
fig = plot_acf(seasonal_diff, lags=40, ax=ax1)
fig = plot_pacf(seasonal_diff, lags=40, ax=ax2, method='ywm')
plt.tight_layout()

In [None]:
# ADF test on the second difference (difference of the first difference);
# NaNs from differencing are dropped.
adfuller_test(airline['Second Difference'].dropna())

In [None]:
# Visual check of the twice-differenced series.
airline['Second Difference'].plot();

In [None]:
# ACF (top panel) and PACF (bottom panel) of the second difference, 40 lags each.
fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(8, 4))
second_diff = airline['Second Difference'].dropna()
fig = plot_acf(second_diff, lags=40, ax=ax1)
fig = plot_pacf(second_diff, lags=40, ax=ax2, method='ywm')
plt.tight_layout()

## Build ARIMA model (non-seasonal)

In [None]:
# Non-seasonal ARIMA fitted on the training rows only.
# NOTE(review): earlier notes suggested p=1, d=1, q=0 or 1, but the order
# actually fitted here is (2, 1, 2) — confirm which is intended.
model1 = ARIMA(
    airline_train['Passengers'],
    order=(2, 1, 2),
).fit()

# `split` is the position of the first held-out row; `end` is 24 steps past it.
# Both are reused by the later model cells.
split = len(airline_train)
end = split + 24

# In-sample one-step-ahead fit for the training frame, dynamic multi-step
# forecast for the test frame. Column assignment aligns on the datetime index,
# so predicted steps that fall outside a frame's rows are dropped.
airline_train['ARIMA1'] = model1.predict(start=0, end=split, dynamic=False)
airline_test['ARIMA1'] = model1.predict(start=split, end=end, dynamic=True)

In [None]:
# Actuals overlaid with ARIMA1's in-sample fit and its forecast over the test rows.
ax = plt.gca()
ax.set_title('ARIMA1')
ax.plot(airline_train['Passengers'], label='train')
ax.plot(airline_test['Passengers'], label='test')
# One legend entry for the model: only the training segment carries a label.
ax.plot(airline_train['ARIMA1'], color='C2', label='ARIMA1')
ax.plot(airline_test['ARIMA1'], color='C2')
ax.legend()

## Build ARIMA model (seasonal)

In [None]:
# May have bugs

# Fit a SARIMAX(0, 1, 1)x(2, 1, 1, 12) on the training set
#from statsmodels.tsa.statespace.sarimax import SARIMAX

#import statsmodels.api as sm
#model = sm.tsa.statespace.SARIMAX(airline['Passengers'],order=(1,1,1),seasonal_order=(1,1,1,12))
#order = (0,0,0) introduces increasing aspect

#from pandas.tseries.offsets import DateOffset
#future_dates=[airline.index[-1]+ DateOffset(months=x)for x in range(0,480)]
#future_datest_airline=pd.DataFrame(index=future_dates[1:],columns=airline.columns)

#future_datest_airline.tail()
#future_airline=pd.concat([airline,future_datest_airline])
#future_airline['forecast'] = results.predict(start = airline.shape[0], end = airline.shape[0]+480, dynamic= True)
#future_airline[['Passengers', 'forecast']].plot(figsize=(12, 8))

In [None]:
# Seasonal ARIMA (1,1,1)x(1,1,1,12) fitted on the training rows only.
# `split` and `end` come from the ARIMA1 cell, which must run first.
# Column assignment aligns on the datetime index, so predicted steps outside a
# frame's rows are dropped.
model2 = ARIMA(airline_train['Passengers'], order=(1,1,1), seasonal_order=(1,1,1,12)).fit()
airline_train['ARIMA2'] = model2.predict(start=0, end=split, dynamic=False)
airline_test['ARIMA2'] = model2.predict(start=split, end=end, dynamic=True)

In [None]:
# Actuals overlaid with ARIMA2's in-sample fit and its forecast over the test rows.
ax = plt.gca()
ax.set_title('ARIMA2')
ax.plot(airline_train['Passengers'], label='train')
ax.plot(airline_test['Passengers'], label='test')
# One legend entry for the model: only the training segment carries a label.
ax.plot(airline_train['ARIMA2'], color='C3', label='ARIMA2')
ax.plot(airline_test['ARIMA2'], color='C3')
ax.legend()

In [None]:
# ARIMA (1,2,1) with a seasonal AR(1) term at lag 12 and no seasonal
# differencing, fitted on the training rows only.
# `split` and `end` come from the ARIMA1 cell, which must run first.
model3 = ARIMA(airline_train['Passengers'], order=(1,2,1), seasonal_order=(1,0,0,12)).fit()
airline_train['ARIMA3'] = model3.predict(start=0, end=split, dynamic=False)
airline_test['ARIMA3'] = model3.predict(start=split, end=end, dynamic=True)

In [None]:
# Actuals overlaid with ARIMA3's in-sample fit and its forecast over the test rows.
ax = plt.gca()
ax.set_title('ARIMA3')
ax.plot(airline_train['Passengers'], label='train')
ax.plot(airline_test['Passengers'], label='test')
# One legend entry for the model: only the training segment carries a label.
ax.plot(airline_train['ARIMA3'], color='C4', label='ARIMA3')
ax.plot(airline_test['ARIMA3'], color='C4')
ax.legend()

## AutoARIMA

In [None]:
# Stepwise auto_arima search on the training series: seasonal period m=12,
# seasonal differencing fixed at D=1, non-seasonal d left for the search to
# choose (d=None), p and q bounded at 3. trace=True prints each candidate tried.
stepwise_fit = auto_arima(airline_train['Passengers'], start_p=1, start_q=1, max_p=3, max_q=3, m=12,
                          start_P=0, seasonal=True, d=None, D=1, trace=True, 
                          error_action ='ignore',   # we don't want to know if an order does not work
                          suppress_warnings = True,  # we don't want convergence warnings
                          stepwise = True)           # set to stepwise
  
# Display the summary of the selected model (last expression of the cell).
stepwise_fit.summary()

# Unused alternative: use the pmdarima fit directly as model4 instead of
# refitting with statsmodels in the next cell.
#model4 = stepwise_fit
#airline_train['ARIMA4'] = model4.fittedvalues()
#model4.resid().plot(kind='kde', color='C5', label='ARIMA4')
#plt.plot(model4.predict(two_decades), label='ARIMA4')

In [None]:
# Seasonal ARIMA (0,1,1)x(2,1,1,12) fitted on the training rows only.
# NOTE(review): presumably these orders were taken from the auto_arima search — confirm.
# The `trend` parameter (which would add an intercept) is not passed here, so
# statsmodels' default trend handling applies.
# `split` and `end` come from the ARIMA1 cell, which must run first.
model4 = ARIMA(airline_train['Passengers'], order=(0,1,1), seasonal_order=(2,1,1,12)).fit()
airline_train['ARIMA4'] = model4.predict(start=0, end=split, dynamic=False)
airline_test['ARIMA4'] = model4.predict(start=split, end=end, dynamic=True)

In [None]:
# Actuals overlaid with ARIMA4's in-sample fit and its forecast over the test rows.
ax = plt.gca()
ax.set_title('ARIMA4')
ax.plot(airline_train['Passengers'], label='train')
ax.plot(airline_test['Passengers'], label='test')
# One legend entry for the model: only the training segment carries a label.
ax.plot(airline_train['ARIMA4'], color='C5', label='ARIMA4')
ax.plot(airline_test['ARIMA4'], color='C5')
ax.legend()

## Compare models

In [None]:
# All four models on one figure, semi-transparent so overlapping lines stay visible.
alpha = 0.5
plt.title('ARIMA')
plt.plot(airline_train['Passengers'], alpha=alpha, label='train')
plt.plot(airline_test['Passengers'], alpha=alpha, label='test')

# Each model keeps the colour used in its own plot above. Only the training
# segment is labelled, so every model appears once in the legend.
for pred_col, line_color in (('ARIMA1', 'C2'), ('ARIMA2', 'C3'), ('ARIMA3', 'C4'), ('ARIMA4', 'C5')):
    plt.plot(airline_train[pred_col], color=line_color, alpha=alpha, label=pred_col)
    plt.plot(airline_test[pred_col], color=line_color, alpha=alpha)

#plt.xlim(airline.index[108], airline.index[143])
plt.legend()
plt.tight_layout()

In [None]:
# Kernel density estimate of each fitted model's residuals, in the model's usual colour.
for fitted_model, line_color, model_label in zip(
    (model1, model2, model3, model4),
    ('C2', 'C3', 'C4', 'C5'),
    ('ARIMA1', 'ARIMA2', 'ARIMA3', 'ARIMA4'),
):
    fitted_model.resid.plot(kind='kde', color=line_color, label=model_label)
plt.legend()

In [None]:
# Long-horizon forecast: 240 steps beyond the end of the training data for each model.
two_decades = 240
plt.plot(airline_train['Passengers'], label='train')
plt.plot(airline_test['Passengers'], label='test')
for fitted_model, model_label in zip(
    (model1, model2, model3, model4),
    ('ARIMA1', 'ARIMA2', 'ARIMA3', 'ARIMA4'),
):
    plt.plot(fitted_model.forecast(two_decades), label=model_label)
plt.legend()

In [None]:
def calculate_metrics(models, train=None, test=None, target='Passengers'):
    """Tabulate AIC plus train/test MSE and MAE for a list of fitted models.

    Every column of ``train`` other than ``target`` is treated as one model's
    predictions. Models are paired with those columns positionally, so
    ``models`` must be in the same order as the prediction columns.

    Parameters
    ----------
    models : sequence
        Fitted model objects exposing an ``aic`` attribute.
    train, test : pandas.DataFrame, optional
        Frames holding the ``target`` column and one prediction column per
        model. They default to the notebook-level ``airline_train`` and
        ``airline_test`` frames, which preserves the original call style.
    target : str, default 'Passengers'
        Name of the column holding the observed values.

    Returns
    -------
    pandas.DataFrame
        Rows ``AIC``, ``Train MSE``, ``Test MSE``, ``Train MAE``, ``Test MAE``;
        one column per prediction column.

    Raises
    ------
    ValueError
        If the number of models differs from the number of prediction columns.
    """
    # Fall back to the notebook globals only when no frame is supplied.
    train = airline_train if train is None else train
    test = airline_test if test is None else test

    pred_cols = [col for col in train.columns if col != target]
    if len(models) != len(pred_cols):
        raise ValueError(
            f'got {len(models)} models but {len(pred_cols)} prediction columns '
            f'({pred_cols}); they must match one-to-one and in order'
        )

    table = {}
    for model, col in zip(models, pred_cols):
        train_err = train[target] - train[col]
        test_err = test[target] - test[col]
        # Series.mean() skips NaN, so rows without a prediction do not
        # poison the averages.
        table[col] = [
            model.aic,
            (train_err ** 2).mean(),
            (test_err ** 2).mean(),
            train_err.abs().mean(),
            test_err.abs().mean(),
        ]

    metrics = pd.DataFrame(
        table,
        index=['AIC', 'Train MSE', 'Test MSE', 'Train MAE', 'Test MAE'],
    )

    return metrics

In [None]:
# Order matters: calculate_metrics pairs these models positionally with the
# ARIMA1..ARIMA4 prediction columns.
models = [model1, model2, model3, model4]

In [None]:
# Build the AIC / MSE / MAE comparison table for the four fitted models.
metrics = calculate_metrics(models)

In [None]:
# Display the comparison table.
metrics