In [1]:
# Sanity check: show which Python interpreter is running this kernel, so it is
# clear which environment the notebook was executed in.
import sys

kernel_python = sys.executable
print(kernel_python)

c:\Users\basil\Anaconda\python.exe


In [2]:
import sys

# Make the project's shared helpers importable. The relative entry is resolved
# against the notebook's working directory: go up one folder, then into
# "modules". The membership check keeps the cell idempotent -- re-running it no
# longer appends a duplicate entry to sys.path each time.
MODULES_DIR = '../modules'
if MODULES_DIR not in sys.path:
    sys.path.append(MODULES_DIR)

import utils

# Project-local helper (defined in ../modules/utils.py, not visible here).
# NOTE(review): presumably configures the Plotly template used by the later
# `.plot()` calls -- confirm in the helper module.
utils.configure_plotly_template(showlegend=True)

In [4]:
import pandas as pd

# Load the AirPassengers series, put it on a month-end ("ME") frequency and
# give its single column a neutral name used by the rest of the notebook.
df = (
    pd.read_parquet('../statsmodel/AirPassengers.parquet')
    .asfreq("ME")
    .set_axis(['values'], axis='columns')
)
df

Unnamed: 0,values
1949-01-31,112
1949-02-28,118
1949-03-31,132
1949-04-30,129
1949-05-31,121
...,...
1960-08-31,606
1960-09-30,508
1960-10-31,461
1960-11-30,390


#### Achieve Stationarity

| **Property**             | **Issue**           | **Fix**               | **Test**                       |
| ------------------------ | ------------------- | --------------------- | ------------------------------ |
| Stationarity in variance | Changing volatility | Log transform / GARCH | ARCH-LM (Engle's test)         |
| Stationarity in mean     | Trend / Drift       | Differencing          | ADF (Augmented Dickey–Fuller)  |


#### Variance

In [5]:
# Plot the raw series to see by eye whether its spread changes with its level.
df.loc[:, 'values'].plot()

In [6]:
import numpy as np

# Natural-log transform of the raw values: the "stationarity in variance" fix
# from the table above. Stored alongside the original column.
df['values_log'] = df['values'].pipe(np.log)
df

Unnamed: 0,values,values_log
1949-01-31,112,4.718499
1949-02-28,118,4.770685
1949-03-31,132,4.882802
1949-04-30,129,4.859812
1949-05-31,121,4.795791
...,...,...
1960-08-31,606,6.406880
1960-09-30,508,6.230481
1960-10-31,461,6.133398
1960-11-30,390,5.966147


In [9]:

# One facet per column (raw vs. log), each with its own y-axis range so the
# two very different scales stay readable (matches=None unlinks the axes).
# NOTE(review): relies on update_yaxes returning the figure itself, as Plotly
# figures do -- confirm the plotting backend is Plotly.
fig = df.plot(facet_col='variable').update_yaxes(matches=None)
fig

#### Mean

In [10]:
from statsmodels.tsa.stattools import adfuller

# Augmented Dickey-Fuller test on the log series. The returned tuple is
# (test statistic, p-value, lags used, observations used, critical values,
# best information criterion). Here the p-value (~0.42) is far above 0.05, so
# a unit root cannot be rejected: the log series is not stationary in mean.
log_series = df.loc[:, 'values_log']
adfuller(log_series)

(np.float64(-1.7170170891069636),
 np.float64(0.4223667747703897),
 13,
 130,
 {'1%': np.float64(-3.4816817173418295),
  '5%': np.float64(-2.8840418343195267),
  '10%': np.float64(-2.578770059171598)},
 np.float64(-445.3990312497209))

In [12]:
# First-difference the log series to remove the trend (stationarity in mean).
# The first row has no predecessor, so its difference is NaN and the row is
# dropped.
#
# The guard makes the cell idempotent: without it, every re-run recomputes the
# diff on the already-shortened frame and silently drops one more leading
# month. Rebinding `df` replaces the former in-place dropna.
#
# NOTE(review): the drop removes the first month from *every* column of `df`,
# so later cells that read df['values'] / df['values_log'] (model fit,
# historical plot) also start one month late -- confirm this is intended.
if 'values_log_diff' not in df.columns:
    df['values_log_diff'] = df['values_log'].diff()
    df = df.dropna()
df

Unnamed: 0,values,values_log,values_log_diff
1949-02-28,118,4.770685,0.052186
1949-03-31,132,4.882802,0.112117
1949-04-30,129,4.859812,-0.022990
1949-05-31,121,4.795791,-0.064022
1949-06-30,135,4.905275,0.109484
...,...,...,...
1960-08-31,606,6.406880,-0.026060
1960-09-30,508,6.230481,-0.176399
1960-10-31,461,6.133398,-0.097083
1960-11-30,390,5.966147,-0.167251


In [13]:
# Repeat the ADF test on the differenced log series. The p-value (~0.071) is
# much lower than before but still above 0.05, so a unit root is rejected only
# at the 10% level; the model below additionally applies a seasonal
# difference (D=1, period 12).
log_diff_series = df.loc[:, 'values_log_diff']
adfuller(log_diff_series)

(np.float64(-2.7171305983881675),
 np.float64(0.07112054815085295),
 14,
 128,
 {'1%': np.float64(-3.4825006939887997),
  '5%': np.float64(-2.884397984161377),
  '10%': np.float64(-2.578960197753906)},
 np.float64(-440.35846985568105))

In [14]:
# Same faceted view as before, now including the differenced log series;
# y-axes are unlinked (matches=None) because the three columns live on very
# different scales.
# NOTE(review): relies on update_yaxes returning the figure itself, as Plotly
# figures do -- confirm the plotting backend is Plotly.
fig = df.plot(facet_col='variable').update_yaxes(matches=None)
fig

#### Modelling

In [17]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Seasonal ARIMA on the *log* series (not the pre-differenced column): the
# differencing is done by the model itself via d=1 and D=1.
#   order          = (p, d, q)    -> AR(1), one regular difference, MA(1)
#   seasonal_order = (P, D, Q, s) -> one seasonal difference, seasonal MA(1),
#                                    period of 12 (monthly data)
model = SARIMAX(
    endog=df['values_log'],
    order=(1, 1, 1),
    seasonal_order=(0, 1, 1, 12),
)
model_fit = model.fit()

In [19]:
# Out-of-sample forecast for 48 monthly steps (four years). The model was fit
# on the log series, so these values are still on the log scale.
forecast = model_fit.forecast(steps=4 * 12)
forecast

1961-01-31    6.111096
1961-02-28    6.056214
1961-03-31    6.174754
1961-04-30    6.201902
1961-05-31    6.235112
1961-06-30    6.371459
1961-07-31    6.509788
1961-08-31    6.505481
1961-09-30    6.327438
1961-10-31    6.211594
1961-11-30    6.066209
1961-12-31    6.170809
1962-01-31    6.209571
1962-02-28    6.153915
1962-03-31    6.272303
1962-04-30    6.299420
1962-05-31    6.332625
1962-06-30    6.468970
1962-07-31    6.607299
1962-08-31    6.602992
1962-09-30    6.424949
1962-10-31    6.309105
1962-11-30    6.163719
1962-12-31    6.268320
1963-01-31    6.307082
1963-02-28    6.251426
1963-03-31    6.369813
1963-04-30    6.396931
1963-05-31    6.430135
1963-06-30    6.566481
1963-07-31    6.704810
1963-08-31    6.700503
1963-09-30    6.522459
1963-10-31    6.406616
1963-11-30    6.261230
1963-12-31    6.365830
1964-01-31    6.404592
1964-02-29    6.348937
1964-03-31    6.467324
1964-04-30    6.494442
1964-05-31    6.527646
1964-06-30    6.663992
1964-07-31    6.802320
1964-08-31 

In [20]:
# Combine history and forecast in one frame on the original scale. The two
# series have disjoint date ranges, so each column is NaN where the other has
# data. np.exp undoes the log transform applied before modelling.
# NOTE(review): this is a plain exp() back-transform with no bias
# adjustment -- confirm that is what is wanted.
df_forecast = pd.DataFrame(dict(
    historical=df.loc[:, 'values'],
    forecast=forecast.pipe(np.exp),
))
df_forecast

Unnamed: 0,historical,forecast
1949-02-28,118.0,
1949-03-31,132.0,
1949-04-30,129.0,
1949-05-31,121.0,
1949-06-30,135.0,
...,...,...
1964-08-31,,896.065647
1964-09-30,,749.922702
1964-10-31,,667.891975
1964-11-30,,577.518755


In [21]:
# Plot the observed history followed by the back-transformed forecast.
df_forecast[['historical', 'forecast']].plot()