In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.vector_ar.vecm import VECM, select_order, select_coint_rank
from time_series_utils import TSA
import matplotlib.pyplot as plt

In [2]:
# Sample window requested from Yahoo Finance (tz-naive calendar dates).
# NOTE(review): yfinance documents `end` as exclusive, so 2019-12-31 itself
# would not be part of the download -- confirm that is intended.
start_date = pd.Timestamp(year=2000, month=1, day=1)
end_date = pd.Timestamp(year=2019, month=12, day=31)

In [3]:
# "SPY": SPDR S&P 500 ETF Trust
# "IWM": iShares Russell 2000 ETF
#
# `auto_adjust=False` is passed explicitly instead of relying on the library
# default: recent yfinance releases switched the default to True, in which case
# the "Adj Close" column selected later in this notebook is no longer returned.
# Older releases accept the argument too, so this stays backward-compatible.
sp500_df = yf.download("SPY", start=start_date, end=end_date, auto_adjust=False)
russell2000_df = yf.download("IWM", start=start_date, end=end_date, auto_adjust=False)

# Preview both raw frames, then report how many rows each download returned.
display(sp500_df.head())
display(russell2000_df.head())
sp500_df.shape[0], russell2000_df.shape[0]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03 00:00:00-05:00,148.25,148.25,143.875,145.4375,95.3088,8164300
2000-01-04 00:00:00-05:00,143.53125,144.0625,139.640625,139.75,91.581642,8089800
2000-01-05 00:00:00-05:00,139.9375,141.53125,137.25,140.0,91.745514,12177900
2000-01-06 00:00:00-05:00,139.625,141.5,137.75,137.75,90.270958,6227200
2000-01-07 00:00:00-05:00,140.3125,145.75,140.0625,145.75,95.513596,8066500


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-05-26 00:00:00-04:00,45.53125,45.71875,45.3125,45.71875,33.982506,74800
2000-05-30 00:00:00-04:00,46.375,47.40625,46.375,47.40625,35.236828,57600
2000-05-31 00:00:00-04:00,47.5625,48.1875,47.5625,47.578125,35.364574,36000
2000-06-01 00:00:00-04:00,48.554688,48.65625,48.554688,48.65625,36.165936,7000
2000-06-02 00:00:00-04:00,50.859375,51.1875,50.859375,51.1875,38.047409,29400


(5030, 4929)

In [4]:
# Keep only the adjusted close of each ETF and align them on common dates.
# The inner join drops the dates that only one series has (the IWM preview
# above starts on 2000-05-26, later than SPY).
sp500 = sp500_df["Adj Close"]
russell2000 = russell2000_df["Adj Close"]
pair_df = (
    pd.concat([sp500, russell2000], axis=1, join="inner")
    .set_axis(["sp500", "russell2000"], axis=1)
)
pair_df

Unnamed: 0_level_0,sp500,russell2000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-05-26 00:00:00-04:00,90.664650,33.982506
2000-05-30 00:00:00-04:00,93.621140,35.236828
2000-05-31 00:00:00-04:00,93.826416,35.364574
2000-06-01 00:00:00-04:00,95.468910,36.165936
2000-06-02 00:00:00-04:00,97.131927,38.047409
...,...,...
2019-12-23 00:00:00-05:00,306.215393,160.106979
2019-12-24 00:00:00-05:00,306.224945,160.472778
2019-12-26 00:00:00-05:00,307.855042,160.472778
2019-12-27 00:00:00-05:00,307.778778,159.683365


In [5]:
# Chronological split: everything before the cutoff is used for fitting, the
# rest is held out. The date index is replaced by a plain integer index.
split_date = "2017-01-01"
in_train = pair_df.index < split_date
in_test = pair_df.index >= split_date
train_df = pair_df.loc[in_train].reset_index(drop=True)
test_df = pair_df.loc[in_test].reset_index(drop=True)

In [6]:
# Pearson correlation of the two price levels on the training sample
# (pandas' default method, spelled out explicitly).
train_df.corr(method="pearson")

Unnamed: 0,sp500,russell2000
sp500,1.0,0.973716
russell2000,0.973716,1.0


# VECM / VAR

$$\Delta y_t = \alpha \beta^T y_{t-1} + \Gamma_1 \Delta y_{t-1} + \dots + \Gamma_{p-1} \Delta y_{t-p+1} + u_t$$
where $\alpha, \beta \in \mathbb{R}^{K \times r}$ and $\Gamma_i \in \mathbb{R}^{K \times K}$ for $i = 1, \dots, p-1$ are the parameters and $u_t$ is $K$-dimensional white noise. Both $\alpha$ and $\beta$ have rank $r$, the so-called cointegration rank.

## Choose Deterministic Terms

**Logic:**  
sp500 and russell2000 are stock indices, so their movements are both driven by the prices of their constituent stocks. With a regression coefficient $k$, the residual $sp500 - k \times russell2000$, which is also the error correction term, represents the excess profit of investing in sp500 over russell2000. (Here, $k$ converts sp500 and russell2000 to the same scale.) Therefore, there is no need to add a constant or trend term inside the cointegration relation.  

Outside the cointegration relation, the financial meaning of the VECM is clear. The first row of the VECM shows that the PnL of sp500 is influenced by the long-term excess profit of sp500 over russell2000 and by the recent performance of sp500 and russell2000. Over a longer time horizon, the overall trend of the stock market, and of its constituents, is upward due to various micro and macro factors, such as companies' growing profits and inflation. So, we need to add a trend term outside the cointegration relation. Without loss of generality, when the trend term is added, we also add a constant term.

This logic can also be applied to other stock indices.  

In [7]:
# Deterministic-term codes understood by statsmodels' VECM:
#   "n"         - no deterministic terms
#   "ci" / "co" - constant inside / outside the cointegration relation
#   "li" / "lo" - linear trend inside / outside the cointegration relation
# Codes may be concatenated; here: constant + linear trend, both outside.
deterministic = "co" + "lo"

## Select Lag Order

In [8]:
# `train_df` carries a plain integer index (the dates were dropped with
# `.reset_index(drop=True)`); keeping the date index makes statsmodels warn
# that the date index is not used.
lag_order = select_order(
    train_df,
    maxlags=10,
    deterministic=deterministic,
)
lag_order.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-1.552,-1.537*,0.2118,-1.547
1.0,-1.553,-1.532,0.2116,-1.545
2.0,-1.560,-1.532,0.2102,-1.550*
3.0,-1.560,-1.526,0.2102,-1.548
4.0,-1.560,-1.520,0.2102,-1.546
5.0,-1.560*,-1.514,0.2102*,-1.544
6.0,-1.559,-1.507,0.2103,-1.541
7.0,-1.557,-1.500,0.2107,-1.537
8.0,-1.557,-1.493,0.2108,-1.534


In [9]:
# print() shows the str() form of the result, which lists the lag order
# selected by each information criterion.
print(lag_order)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 5, BIC -> 0, FPE -> 5, HQIC ->  2>


In [10]:
# Selected lag order per criterion, in the order (AIC, BIC, FPE, HQIC).
tuple(getattr(lag_order, criterion) for criterion in ("aic", "bic", "fpe", "hqic"))

(5, 0, 5, 2)

## Cointegration Rank

In [11]:
# Johansen trace test for the cointegration rank at the 5% level, using the
# AIC-selected number of lagged differences.
# `det_order=-1`: no deterministic terms in the test.
# NOTE(review): the model fitted below uses deterministic="colo" (constant and
# trend outside the cointegration relation) while the test assumes no
# deterministic terms -- confirm this mismatch is intended.
rank_test = select_coint_rank(
    endog=train_df,
    det_order=-1,
    k_ar_diff=lag_order.aic,
    method="trace",
    signif=0.05,
)
rank_test.rank

1

In [12]:
# Each row compares the test statistic with its critical value for one
# hypothesised rank r_0. In the output below the statistic exceeds the critical
# value for r_0 = 0 but not for r_0 = 1, which matches the selected rank of 1.
rank_test.summary()

r_0,r_1,test statistic,critical value
0,2,17.87,12.32
1,2,3.588,4.13


## Model

In [13]:
# Specify the VECM on the training sample, reusing the AIC lag order, the
# rank chosen by the cointegration test and the deterministic terms set above.
vecm = VECM(
    endog=train_df,
    deterministic=deterministic,
    coint_rank=rank_test.rank,
    k_ar_diff=lag_order.aic,
)

In [14]:
# Estimate the model by maximum likelihood (statsmodels' default method,
# spelled out explicitly) and show the coefficient tables.
vecm_res = vecm.fit(method="ml")
vecm_res.summary()

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0555,0.038,-1.453,0.146,-0.130,0.019
lin_trend,5.737e-05,2.97e-05,1.933,0.053,-8.06e-07,0.000
L1.sp500,-0.0774,0.033,-2.318,0.020,-0.143,-0.012
L1.russell2000,0.0356,0.045,0.792,0.428,-0.052,0.124
L2.sp500,-0.1171,0.033,-3.509,0.000,-0.183,-0.052
L2.russell2000,0.0978,0.045,2.180,0.029,0.010,0.186
L3.sp500,-0.0140,0.033,-0.419,0.675,-0.080,0.052
L3.russell2000,0.0032,0.045,0.071,0.943,-0.085,0.091
L4.sp500,-0.0496,0.033,-1.487,0.137,-0.115,0.016

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.0330,0.028,-1.159,0.246,-0.089,0.023
lin_trend,6.12e-05,2.21e-05,2.772,0.006,1.79e-05,0.000
L1.sp500,-0.0384,0.025,-1.548,0.122,-0.087,0.010
L1.russell2000,0.0032,0.033,0.096,0.923,-0.062,0.069
L2.sp500,-0.0358,0.025,-1.444,0.149,-0.084,0.013
L2.russell2000,0.0379,0.033,1.136,0.256,-0.027,0.103
L3.sp500,0.0228,0.025,0.918,0.358,-0.026,0.072
L3.russell2000,-0.0291,0.033,-0.872,0.383,-0.094,0.036
L4.sp500,-0.0194,0.025,-0.783,0.434,-0.068,0.029

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0020,0.002,0.871,0.384,-0.002,0.006

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0040,0.002,2.392,0.017,0.001,0.007

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
beta.1,1.0000,0,0,0.000,1.000,1.000
beta.2,-1.9983,0.197,-10.168,0.000,-2.384,-1.613


## Forecast

In [15]:
# Point forecasts for the next 5 steps after the training sample:
# one row per step, one column per series (sp500, russell2000).
vecm_res.predict(steps=5)

array([[201.43786946, 124.92737465],
       [201.56075434, 124.9720525 ],
       [201.72529237, 125.04108969],
       [201.80566743, 125.07575178],
       [201.90113027, 125.11661179]])

In [16]:
# With `alpha` set, predict() returns the point forecast together with the
# lower and upper bounds of the interval; alpha=0.05 gives a 95% interval.
point, lower_bound, upper_bound = vecm_res.predict(steps=5, alpha=0.05)
labelled = {"forecast": point, "lower": lower_bound, "upper": upper_bound}
for label, values in labelled.items():
    print(f"{label}:\n{values}")

forecast:
[[201.43786946 124.92737465]
 [201.56075434 124.9720525 ]
 [201.72529237 125.04108969]
 [201.80566743 125.07575178]
 [201.90113027 125.11661179]]
lower:
[[199.18588059 123.2526077 ]
 [198.46150928 122.65688253]
 [198.02701947 122.23267804]
 [197.59963434 121.85178355]
 [197.26094353 121.54605198]]
upper:
[[203.68985832 126.60214159]
 [204.65999941 127.28722247]
 [205.42356528 127.84950134]
 [206.01170053 128.29972001]
 [206.54131702 128.68717159]]
