# 07.08 - SARIMAX AutoARIMA - 5 Years Training Data

## Imports & setup

In [1]:
import pathlib
import warnings
from datetime import datetime
import sys
import pickle
import joblib
import gc

import pandas as pd
import numpy as np

# Plotting
import matplotlib as mpl
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
import palettable
import seaborn as sns

# Imports
sys.path.append("..")
from src.utils.utils import (AnnualTimeSeriesSplit,
                             bound_precision,
                             run_cross_val,
                             run_data_split_cross_val,
                             save_run_results)
from src.features.features import CyclicalToCycle
from src.models.models import SK_SARIMAX
from src.visualization.visualize import (plot_prediction,
                                         plot_joint_plot,
                                         residual_plots,
                                         print_residual_stats)
#b # Packages
from sklearn.pipeline import Pipeline
from skoot.feature_selection import FeatureFilter
from skoot.preprocessing import SelectiveRobustScaler
from sklearn.metrics import mean_absolute_error
from scipy.stats import norm
from statsmodels.graphics.gofplots import qqplot
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf
import statsmodels.api as sm
import pmdarima as pm



inline_rc = dict(mpl.rcParams)


# Display
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
figsize=(15,7)
warnings.filterwarnings(action='ignore')
%matplotlib inline

# Data
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
CLEAN_DATA_DIR = PROJECT_DIR / 'data' / '05-clean'



The examples.directory rcparam was deprecated in Matplotlib 3.0 and will be removed in 3.2. In the future, examples will be found relative to the 'datapath' directory.



### Auto ARIMA

we can use AutoArima to look for the best values of p, d, q, P, D, and Q

At this point, we will throw in all the exogenous variables without any pre-processing. We can fine tune this later, after we have decided on the ARIMA parameters

Auto Arima can only handle 3 years at a time on my computer before crashing with a memory error. Therefore, we will run x batches of 3 years, and review the results in order to select the ARIMA parameters

## Load Daily Data & Inspect

In [2]:
df = pd.read_csv(CLEAN_DATA_DIR / 'clean-features.csv', parse_dates=True, index_col=0)

In [3]:
X = df.loc['2004': '2008']
y = X.pop('daily_peak')

In [4]:
X.head()

Unnamed: 0,hmdxx_min,hmdxx_max,hmdxx_median-1,hmdxx_max_hour,temp_min,temp_max,dew_point_temp_max,sun_rise,sun_set,visibility_mean,day_of_week,week_of_year,day_type
2004-05-17,7.735889,20.970716,12.375721,17.0,7.8,18.5,13.3,6.0,21.0,19.7,0.0,21.0,0
2004-05-18,11.932519,26.74366,16.712871,14.0,11.4,22.2,17.4,6.0,21.0,17.429167,1.0,21.0,0
2004-05-19,9.227487,18.550839,23.478914,13.0,8.9,18.5,9.5,6.0,21.0,20.366667,2.0,21.0,0
2004-05-20,9.012192,31.353519,13.901348,17.0,10.3,23.8,20.1,6.0,21.0,15.35,3.0,21.0,0
2004-05-21,14.808592,24.557386,20.764855,0.0,13.2,19.2,17.1,6.0,21.0,20.966667,4.0,21.0,1


In [5]:
y.tail()

2008-09-29    18338.0
2008-09-30    17977.0
2008-10-01    18168.0
2008-10-02    18256.0
2008-10-03    17495.0
Name: daily_peak, dtype: float64

## Run Auto ARIMA

In [6]:
# Set up Feature Transformers
cycle0 = CyclicalToCycle('hmdxx_max_hour', 24)
cycle1 = CyclicalToCycle('sun_rise', 24)
cycle2 = CyclicalToCycle('sun_set', 24)
cycle3 = CyclicalToCycle('day_of_week', 5)
cycle4 = CyclicalToCycle('week_of_year', 20)

robust_scaler_cols = ['hmdxx_min', 'hmdxx_max', 'hmdxx_median-1', 'temp_min', 'temp_max',
                      'dew_point_temp_max', 'visibility_mean']

rscaler = SelectiveRobustScaler(cols=robust_scaler_cols, trans_col_name=robust_scaler_cols)


cycle0.fit(X, y)
X_t = cycle0.transform(X)

cycle1.fit(X_t, y)
X_t = cycle1.transform(X_t)

cycle2.fit(X_t, y)
X_t = cycle2.transform(X_t)

cycle3.fit(X_t, y)
X_t = cycle3.transform(X_t)

cycle4.fit(X_t, y)
X_t = cycle4.transform(X_t)

rscaler.fit(X_t, y)
X_t = rscaler.transform(X_t)

# Stepwise Fit
model = pm.auto_arima(y,
                      seasonal=True,
                      m=96,
                      trace=True,
                      trend='c',
                      exog=X_t,
                      error_action='ignore',
                      suppress_warnings=True)

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


Fit ARIMA: order=(2, 1, 2) seasonal_order=(1, 0, 1, 96); AIC=8275.806, BIC=8309.179, Fit time=235.733 seconds
Fit ARIMA: order=(0, 1, 0) seasonal_order=(0, 0, 0, 96); AIC=8330.977, BIC=8339.321, Fit time=0.020 seconds
Fit ARIMA: order=(1, 1, 0) seasonal_order=(1, 0, 0, 96); AIC=8331.396, BIC=8348.083, Fit time=22.633 seconds
Fit ARIMA: order=(0, 1, 1) seasonal_order=(0, 0, 1, 96); AIC=8330.911, BIC=8347.598, Fit time=15.204 seconds
Fit ARIMA: order=(2, 1, 2) seasonal_order=(0, 0, 1, 96); AIC=8276.142, BIC=8305.344, Fit time=108.742 seconds
Fit ARIMA: order=(2, 1, 2) seasonal_order=(2, 0, 1, 96); AIC=8277.849, BIC=8315.394, Fit time=798.965 seconds
Fit ARIMA: order=(2, 1, 2) seasonal_order=(1, 0, 0, 96); AIC=8276.287, BIC=8305.489, Fit time=86.259 seconds
Fit ARIMA: order=(2, 1, 2) seasonal_order=(1, 0, 2, 96); AIC=nan, BIC=nan, Fit time=nan seconds
Fit ARIMA: order=(2, 1, 2) seasonal_order=(0, 0, 0, 96); AIC=8278.078, BIC=8303.108, Fit time=0.856 seconds
Fit ARIMA: order=(2, 1, 2) seas

In [7]:
model.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,480.0
Model:,"SARIMAX(2, 1, 2)x(1, 0, 1, 96)",Log Likelihood,-4129.903
Date:,"Fri, 11 Oct 2019",AIC,8275.806
Time:,20:19:19,BIC,8309.179
Sample:,0,HQIC,8288.925
,- 480,,
Covariance Type:,opg,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,4.7038,13.385,0.351,0.725,-21.530,30.937
ar.L1,1.0868,0.230,4.716,0.000,0.635,1.538
ar.L2,-0.4105,0.153,-2.680,0.007,-0.711,-0.110
ma.L1,-1.2585,0.235,-5.358,0.000,-1.719,-0.798
ma.L2,0.3549,0.218,1.630,0.103,-0.072,0.782
ar.S.L96,-0.7550,0.448,-1.685,0.092,-1.633,0.123
ma.S.L96,0.8578,0.472,1.816,0.069,-0.068,1.784
sigma2,1.877e+06,1.5e+05,12.499,0.000,1.58e+06,2.17e+06

0,1,2,3
Ljung-Box (Q):,32.13,Jarque-Bera (JB):,29.73
Prob(Q):,0.81,Prob(JB):,0.0
Heteroskedasticity (H):,1.13,Skew:,0.4
Prob(H) (two-sided):,0.43,Kurtosis:,3.93


In [8]:
model.get_params()

{'callback': None,
 'disp': 0,
 'maxiter': None,
 'method': None,
 'order': (2, 1, 2),
 'out_of_sample_size': 0,
 'scoring': 'mse',
 'scoring_args': {},
 'seasonal_order': (1, 0, 1, 96),
 'solver': 'lbfgs',
 'start_params': None,
 'transparams': True,
 'trend': 'c',
 'with_intercept': True}

In [9]:
X_t.head()

Unnamed: 0,day_type,sin_hmdxx_max_hour,cos_hmdxx_max_hour,sin_sun_rise,cos_sun_rise,sin_sun_set,cos_sun_set,sin_day_of_week,cos_day_of_week,sin_week_of_year,cos_week_of_year,hmdxx_min,hmdxx_max,hmdxx_median-1,temp_min,temp_max,dew_point_temp_max,visibility_mean
2004-05-17,0,-0.965926,-0.258819,1.0,6.123234000000001e-17,-0.707107,0.707107,0.0,1.0,0.309017,0.951057,-1.229892,-0.818233,-1.420395,-1.470588,-1.004149,-0.534562,-0.983333
2004-05-18,0,-0.5,-0.866025,1.0,6.123234000000001e-17,-0.707107,0.707107,0.951057,0.309017,0.309017,0.951057,-0.697521,-0.161888,-0.87095,-0.764706,-0.390041,0.221198,-1.551042
2004-05-19,0,-0.258819,-0.965926,1.0,6.123234000000001e-17,-0.707107,0.707107,0.587785,-0.809017,0.309017,0.951057,-1.040673,-1.093357,-0.013805,-1.254902,-1.004149,-1.235023,-0.816667
2004-05-20,0,-0.965926,-0.258819,1.0,6.123234000000001e-17,-0.707107,0.707107,-0.587785,-0.809017,0.309017,0.951057,-1.067984,0.362222,-1.227123,-0.980392,-0.124481,0.718894,-2.070833
2004-05-21,1,0.0,1.0,1.0,6.123234000000001e-17,-0.707107,0.707107,-0.951057,0.309017,0.309017,0.951057,-0.332672,-0.410453,-0.357631,-0.411765,-0.887967,0.165899,-0.666667


In [10]:
X_t.tail()

Unnamed: 0,day_type,sin_hmdxx_max_hour,cos_hmdxx_max_hour,sin_sun_rise,cos_sun_rise,sin_sun_set,cos_sun_set,sin_day_of_week,cos_day_of_week,sin_week_of_year,cos_week_of_year,hmdxx_min,hmdxx_max,hmdxx_median-1,temp_min,temp_max,dew_point_temp_max,visibility_mean
2008-09-29,0,-0.258819,0.965926,0.866025,-0.5,-0.866025,0.5,0.0,1.0,-4.898587e-16,1.0,-0.648835,-1.438759,-0.401394,-0.705882,-1.701245,-1.050691,0.116667
2008-09-30,0,-0.866025,-0.5,0.866025,-0.5,-0.866025,0.5,0.951057,0.309017,-4.898587e-16,1.0,-0.332672,-1.001712,-1.234814,-0.431373,-1.103734,-0.35023,-1.188542
2008-10-01,0,0.258819,-0.965926,0.866025,-0.5,-0.965926,0.258819,0.587785,-0.809017,-4.898587e-16,1.0,-1.227409,-1.531638,-0.972369,-1.313725,-1.817427,-1.032258,0.116667
2008-10-02,0,-0.5,-0.866025,0.866025,-0.5,-0.965926,0.258819,-0.587785,-0.809017,-4.898587e-16,1.0,-1.476408,-1.836744,-1.428985,-1.666667,-1.983402,-1.769585,-0.134375
2008-10-03,1,-0.707107,-0.707107,0.866025,-0.5,-0.965926,0.258819,-0.951057,0.309017,-4.898587e-16,1.0,-1.841162,-1.974133,-2.035206,-2.058824,-2.082988,-2.175115,0.116667


There is not too much difference between the best models, so we will choose a variety from these to try out in our cross validation scheme:

+ SARIMAX I (2,1,2) (1,0,1,96)
+ SARIMAX II (2,1,2) (0,0,1,96)
+ SARIMAX III (2,1,2) (1,0,0,96)
+ SARIMAX IV (1,0,1) (1,0,0,96) - Made this one up to try as a reference