In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.vector_ar.vecm import VECM, select_order, select_coint_rank
from time_series_utils import TSA
import matplotlib.pyplot as plt

In [2]:
start_date = pd.Timestamp("2000-01-01")
end_date = pd.Timestamp("2019-12-31")

In [3]:
# "SPY": SPDR S&P 500 ETF Trust
# "IWM": iShares Russell 2000 ETF 
# "QQQ": Invesco QQQ Trust (ETF based on the Nasdaq-100 Index)
# "DIA": SPDR Dow Jones Industrial Average ETF Trust
sp500_df = yf.download("SPY", start=start_date, end=end_date)
russell2000_df = yf.download("IWM", start=start_date, end=end_date)
nasdaq100_df = yf.download("QQQ", start=start_date, end=end_date)
dia_df = yf.download("DIA", start=start_date, end=end_date)

sp500_df.shape[0], russell2000_df.shape[0], nasdaq100_df.shape[0], dia_df.shape[0]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


(5030, 4929, 5030, 5030)

In [4]:
sp500 = sp500_df["Adj Close"]
russell2000 = russell2000_df["Adj Close"]
nasdaq100 = nasdaq100_df["Adj Close"]
dia = dia_df["Adj Close"]
basket_df = pd.concat([sp500, russell2000, nasdaq100, dia], axis=1, join="inner")
basket_df.columns = ["sp500", "russell2000", "nasdaq100", "dia"]
basket_df

Unnamed: 0_level_0,sp500,russell2000,nasdaq100,dia
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-05-26 00:00:00-04:00,90.664688,33.982513,66.853607,62.454620
2000-05-30 00:00:00-04:00,93.621094,35.236824,73.571198,63.855137
2000-05-31 00:00:00-04:00,93.826454,35.364571,71.475304,63.779469
2000-06-01 00:00:00-04:00,95.468918,36.165916,75.129677,64.498558
2000-06-02 00:00:00-04:00,97.131943,38.047401,80.503754,65.113701
...,...,...,...,...
2019-12-23 00:00:00-05:00,306.215393,160.106949,208.067749,269.091522
2019-12-24 00:00:00-05:00,306.224976,160.472794,208.175812,268.789795
2019-12-26 00:00:00-05:00,307.855011,160.472794,210.012756,269.779938
2019-12-27 00:00:00-05:00,307.778748,159.683334,209.835938,270.034607


In [5]:
train_df = basket_df.loc[basket_df.index < "2017-01-01"].reset_index(drop=True)
test_df = basket_df.loc[basket_df.index >= "2017-01-01"].reset_index(drop=True)

In [6]:
train_df.corr()

Unnamed: 0,sp500,russell2000,nasdaq100,dia
sp500,1.0,0.973716,0.956378,0.989335
russell2000,0.973716,1.0,0.893333,0.987343
nasdaq100,0.956378,0.893333,1.0,0.927874
dia,0.989335,0.987343,0.927874,1.0


# VECM / VAR

$$\Delta y_t = \alpha \beta^T y_{t-1} + \Gamma_1 \Delta y_{t-1} + \dots + \Gamma_{p-1} \Delta y_{t-p+1} + u_t$$
where $\alpha, \beta \in \mathbb{R}^{K \times r}$ and $\Gamma_i \in \mathbb{R}^{K \times K}$ for $i = 1, \dots, p-1$ are the parameters and $u_t$ is $K$-dimensional white noise. Both $\alpha$ and $\beta$ have rank $r$ - then so called cointegration rank.

## Choose Deterministic Terms

In [7]:
deterministic = "colo" # "n", "ci", "co", "li", "lo"

## Select Lag Order

In [8]:
# without `.reset_index(drop=True)`: warning - not use date index
lag_order = select_order(data=train_df, maxlags=10, deterministic=deterministic)
lag_order.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-6.018,-5.976*,0.002434,-6.003
1.0,-6.032,-5.965,0.002400,-6.009
2.0,-6.056,-5.965,0.002343,-6.024*
3.0,-6.058,-5.943,0.002339,-6.017
4.0,-6.059,-5.919,0.002337,-6.010
5.0,-6.062,-5.898,0.002330,-6.004
6.0,-6.061,-5.872,0.002332,-5.994
7.0,-6.060,-5.847,0.002333,-5.985
8.0,-6.066*,-5.828,0.002321*,-5.982


In [9]:
print(lag_order)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 8, BIC -> 0, FPE -> 8, HQIC ->  2>


In [10]:
lag_order.aic, lag_order.bic, lag_order.fpe, lag_order.hqic

(8, 0, 8, 2)

## Cointegration Rank

In [11]:
# `det_order=-1`: no deterministic terms
rank_test = select_coint_rank(train_df, det_order=-1, k_ar_diff=lag_order.aic, signif=0.05)
rank_test.rank

1

In [12]:
rank_test.summary()

r_0,r_1,test statistic,critical value
0,4,51.69,40.17
1,4,18.02,24.28


## Model

In [13]:
vecm = VECM(train_df, 
            k_ar_diff=lag_order.aic, 
            coint_rank=rank_test.rank,
            deterministic=deterministic)

In [14]:
vecm_res = vecm.fit()
vecm_res.summary()

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0689,0.051,-1.355,0.175,-0.169,0.031
lin_trend,3.709e-05,1.53e-05,2.420,0.016,7.05e-06,6.71e-05
L1.sp500,-0.1921,0.086,-2.228,0.026,-0.361,-0.023
L1.russell2000,0.0319,0.045,0.703,0.482,-0.057,0.121
L1.nasdaq100,0.1249,0.041,3.057,0.002,0.045,0.205
L1.dia,0.0577,0.081,0.715,0.475,-0.100,0.216
L2.sp500,-0.2727,0.087,-3.147,0.002,-0.443,-0.103
L2.russell2000,0.1026,0.045,2.263,0.024,0.014,0.191
L2.nasdaq100,0.0495,0.041,1.201,0.230,-0.031,0.130

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0359,0.038,-0.946,0.344,-0.110,0.038
lin_trend,1.85e-05,1.14e-05,1.620,0.105,-3.88e-06,4.09e-05
L1.sp500,-0.1060,0.064,-1.650,0.099,-0.232,0.020
L1.russell2000,-0.0006,0.034,-0.018,0.986,-0.067,0.066
L1.nasdaq100,0.0533,0.030,1.752,0.080,-0.006,0.113
L1.dia,0.0503,0.060,0.836,0.403,-0.068,0.168
L2.sp500,-0.1022,0.065,-1.582,0.114,-0.229,0.024
L2.russell2000,0.0367,0.034,1.088,0.277,-0.029,0.103
L2.nasdaq100,0.0121,0.031,0.393,0.694,-0.048,0.072

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1573,0.036,-4.401,0.000,-0.227,-0.087
lin_trend,4.605e-05,1.08e-05,4.277,0.000,2.49e-05,6.72e-05
L1.sp500,0.0656,0.061,1.083,0.279,-0.053,0.184
L1.russell2000,-0.0112,0.032,-0.352,0.725,-0.074,0.051
L1.nasdaq100,-0.0181,0.029,-0.630,0.528,-0.074,0.038
L1.dia,-0.0893,0.057,-1.576,0.115,-0.200,0.022
L2.sp500,-0.0086,0.061,-0.142,0.887,-0.128,0.111
L2.russell2000,0.0393,0.032,1.234,0.217,-0.023,0.102
L2.nasdaq100,-0.1086,0.029,-3.755,0.000,-0.165,-0.052

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0117,0.041,-0.283,0.777,-0.093,0.069
lin_trend,2.208e-05,1.25e-05,1.772,0.076,-2.34e-06,4.65e-05
L1.sp500,-0.1055,0.070,-1.506,0.132,-0.243,0.032
L1.russell2000,0.0295,0.037,0.802,0.423,-0.043,0.102
L1.nasdaq100,0.0697,0.033,2.098,0.036,0.005,0.135
L1.dia,-0.0001,0.066,-0.002,0.999,-0.129,0.128
L2.sp500,-0.2094,0.070,-2.973,0.003,-0.347,-0.071
L2.russell2000,0.0899,0.037,2.442,0.015,0.018,0.162
L2.nasdaq100,0.0632,0.033,1.889,0.059,-0.002,0.129

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0005,0.001,0.674,0.500,-0.001,0.002

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0004,0.001,0.855,0.392,-0.001,0.001

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0017,0.000,3.473,0.001,0.001,0.003

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,-0.0001,0.001,-0.242,0.809,-0.001,0.001

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
beta.1,1.0000,0,0,0.000,1.000,1.000
beta.2,-1.9022,1.350,-1.409,0.159,-4.549,0.744
beta.3,-3.1687,0.584,-5.424,0.000,-4.314,-2.024
beta.4,2.3765,1.291,1.841,0.066,-0.154,4.907


## Forecast

In [15]:
vecm_res.predict(steps=5)

array([[201.35310482, 124.97275755, 113.54240664, 174.44709431],
       [201.58266053, 125.12488186, 113.80341958, 174.61102993],
       [201.88182794, 125.20962787, 113.94144945, 174.85806521],
       [201.92816963, 125.23686433, 113.92830854, 174.89118386],
       [201.97445986, 125.28717312, 113.98500178, 174.92332352]])

In [16]:
for text, nd in zip(
    ["forecast", "lower", "upper"], 
    vecm_res.predict(steps=5, alpha=0.05) # 95% confidence level
):
    print(f"{text}:\n{nd}")

forecast:
[[201.35310482 124.97275755 113.54240664 174.44709431]
 [201.58266053 125.12488186 113.80341958 174.61102993]
 [201.88182794 125.20962787 113.94144945 174.85806521]
 [201.92816963 125.23686433 113.92830854 174.89118386]
 [201.97445986 125.28717312 113.98500178 174.92332352]]
lower:
[[199.10968555 123.30123721 111.9664672  172.62374477]
 [198.4937807  122.81270875 111.60525361 172.106833  ]
 [198.19262598 122.40350651 111.33281839 171.85122657]
 [197.72920177 122.01326985 110.98478672 171.43906229]
 [197.34047919 121.71386416 110.73526202 171.09659641]]
upper:
[[203.59652408 126.64427789 115.11834607 176.27044386]
 [204.67154036 127.43705497 116.00158555 177.11522686]
 [205.5710299  128.01574923 116.55008051 177.86490386]
 [206.12713749 128.46045881 116.87183035 178.34330542]
 [206.60844053 128.86048209 117.23474155 178.75005063]]
