In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.vector_ar.vecm import VECM, select_order, select_coint_rank, coint_johansen
from time_series_utils import TSA
import matplotlib.pyplot as plt

In [2]:
# Sample window for the price download, built in one pass so both bounds are
# parsed the same way.
# NOTE(review): yfinance documents `end` as exclusive - confirm whether
# 2019-12-31 itself is meant to be part of the sample.
start_date, end_date = (
    pd.Timestamp(day) for day in ("2000-01-01", "2019-12-31")
)

In [3]:
# "SPY": SPDR S&P 500 ETF Trust
# "IWM": iShares Russell 2000 ETF 
# Request unadjusted OHLC data explicitly: the following cells read the
# "Adj Close" column, which yfinance only includes when auto-adjustment is off
# (recent yfinance releases switch auto-adjustment on by default).
download_options = dict(start=start_date, end=end_date, auto_adjust=False)
sp500_df = yf.download("SPY", **download_options)
russell2000_df = yf.download("IWM", **download_options)
display(sp500_df.head())
display(russell2000_df.head())
# Number of daily rows per ticker; the two histories need not be equally long.
sp500_df.shape[0], russell2000_df.shape[0]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03 00:00:00-05:00,148.25,148.25,143.875,145.4375,95.308784,8164300
2000-01-04 00:00:00-05:00,143.53125,144.0625,139.640625,139.75,91.581642,8089800
2000-01-05 00:00:00-05:00,139.9375,141.53125,137.25,140.0,91.745483,12177900
2000-01-06 00:00:00-05:00,139.625,141.5,137.75,137.75,90.271011,6227200
2000-01-07 00:00:00-05:00,140.3125,145.75,140.0625,145.75,95.513603,8066500


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-05-26 00:00:00-04:00,45.53125,45.71875,45.3125,45.71875,33.982517,74800
2000-05-30 00:00:00-04:00,46.375,47.40625,46.375,47.40625,35.236813,57600
2000-05-31 00:00:00-04:00,47.5625,48.1875,47.5625,47.578125,35.364586,36000
2000-06-01 00:00:00-04:00,48.554688,48.65625,48.554688,48.65625,36.165955,7000
2000-06-02 00:00:00-04:00,50.859375,51.1875,50.859375,51.1875,38.047401,29400


(5030, 4929)

In [4]:
# Adjusted closes of both ETFs, kept as standalone series as well.
sp500, russell2000 = sp500_df["Adj Close"], russell2000_df["Adj Close"]
# Inner join keeps only the dates present in both series; the two columns are
# then labelled after the index each ETF tracks.
pair_df = (
    pd.concat([sp500, russell2000], axis=1, join="inner")
    .set_axis(["sp500", "russell2000"], axis=1)
)
pair_df

Unnamed: 0_level_0,sp500,russell2000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-05-26 00:00:00-04:00,90.664650,33.982517
2000-05-30 00:00:00-04:00,93.621147,35.236813
2000-05-31 00:00:00-04:00,93.826439,35.364586
2000-06-01 00:00:00-04:00,95.468918,36.165955
2000-06-02 00:00:00-04:00,97.131912,38.047401
...,...,...
2019-12-23 00:00:00-05:00,306.215424,160.106979
2019-12-24 00:00:00-05:00,306.224884,160.472794
2019-12-26 00:00:00-05:00,307.855072,160.472794
2019-12-27 00:00:00-05:00,307.778778,159.683350


In [5]:
# Single cutoff for the chronological split: rows strictly before it are used
# for fitting, rows on/after it are held out.
split_date = "2017-01-01"
# The date index is dropped (`reset_index(drop=True)`) so the model cells
# receive a plain integer-indexed frame.
train_df = pair_df.loc[pair_df.index < split_date].reset_index(drop=True)
test_df = pair_df.loc[pair_df.index >= split_date].reset_index(drop=True)
# Fail fast if the download/alignment produced nothing to fit or evaluate on.
assert len(train_df) > 0 and len(test_df) > 0, "train/test split produced an empty frame"
assert len(train_df) + len(test_df) == len(pair_df), "train/test split lost rows"

# VECM / VAR (SP500 & Russell2000)

$$\Delta y_t = \alpha \beta^T y_{t-1} + \Gamma_1 \Delta y_{t-1} + \dots + \Gamma_{p-1} \Delta y_{t-p+1} + u_t$$
where $\alpha, \beta \in \mathbb{R}^{K \times r}$ and $\Gamma_i \in \mathbb{R}^{K \times K}$ for $i = 1, \dots, p-1$ are the parameters and $u_t$ is $K$-dimensional white noise. Both $\alpha$ and $\beta$ have rank $r$, the so-called cointegration rank.

## Choose Deterministic Terms

**Logic:**  
sp500 and russell2000 are stock indices, so their movements are both driven by their constituent stock prices. With a regression coefficient $k$, the residual $sp500 - k \times russell2000$, which is also the error correction term, represents the excess profit of investing in sp500 over russell2000. (Here, $k$ converts sp500 and russell2000 to the same scale.) Therefore, there is no need to add a constant or trend term inside the cointegration relation.  

Outside the cointegration relation, the financial meaning of the VECM is clear. The first row of the VECM shows that the PnL of sp500 is influenced by the long-term excess profit of sp500 over russell2000 and by the recent performance of sp500 and russell2000. So there is no need to add a constant or trend term outside the cointegration relation either.

This logic can also be applied to other stock indices.  

In [6]:
# Deterministic terms passed to the lag-order selection and to the VECM.
# statsmodels codes:
#   "n"  - no deterministic terms
#   "co" - constant outside the cointegration relation
#   "ci" - constant inside the cointegration relation
#   "lo" - linear trend outside the cointegration relation
#   "li" - linear trend inside the cointegration relation
deterministic = "n" # "n", "ci", "co", "li", "lo"

## Select Lag Order

In [7]:
# `train_df` carries a plain integer index (`.reset_index(drop=True)` at the
# split); without that reset, statsmodels warns about the date index.
# Candidate lag orders up to `maxlags=10` are compared under each criterion.
lag_order = select_order(data=train_df, maxlags=10, deterministic=deterministic)
# Table of AIC/BIC/FPE/HQIC per lag order; `*` marks each criterion's minimum.
lag_order.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-1.551,-1.542*,0.2119,-1.548
1.0,-1.553,-1.537,0.2117,-1.547
2.0,-1.559,-1.538,0.2103,-1.552*
3.0,-1.559,-1.532,0.2103,-1.550
4.0,-1.559,-1.526,0.2103,-1.548
5.0,-1.560*,-1.520,0.2102*,-1.546
6.0,-1.559,-1.513,0.2104,-1.543
7.0,-1.557,-1.506,0.2107,-1.539
8.0,-1.557,-1.499,0.2108,-1.536


In [8]:
# Printing the result object lists the lag order selected by each criterion.
print(lag_order)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 5, BIC -> 0, FPE -> 5, HQIC ->  2>


In [9]:
# Selected lag order per criterion, in the order (AIC, BIC, FPE, HQIC).
lag_order.aic, lag_order.bic, lag_order.fpe, lag_order.hqic

(5, 0, 5, 2)

## Cointegration Rank

In [10]:
# Cointegration rank selection at the 5% level, with no deterministic terms
# (`det_order=-1`) and the AIC-selected number of lagged differences.
# NOTE(review): `det_order` is hard-coded here and is not derived from
# `deterministic`; keep the two in sync if the deterministic terms change.
rank_test = select_coint_rank(
    endog=train_df,
    det_order=-1,
    k_ar_diff=lag_order.aic,
    method="trace",
    signif=0.05,
)
rank_test.rank

1

In [11]:
# Test table behind `rank_test.rank`: each row lists a test statistic next to
# the critical value it is compared with.
rank_test.summary()

r_0,r_1,test statistic,critical value
0,2,17.87,12.32
1,2,3.588,4.13


## Model

In [12]:
# Model settings come from the previous steps: lagged differences from the AIC
# lag selection, rank from the cointegration rank test, and the deterministic
# terms chosen above.
vecm_settings = {
    "k_ar_diff": lag_order.aic,
    "coint_rank": rank_test.rank,
    "deterministic": deterministic,
}
vecm = VECM(train_df, **vecm_settings)

In [13]:
# Estimate the VECM on the training sample and display the coefficient tables.
vecm_res = vecm.fit()
vecm_res.summary()

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
L1.sp500,-0.0739,0.033,-2.216,0.027,-0.139,-0.009
L1.russell2000,0.0314,0.045,0.699,0.484,-0.057,0.119
L2.sp500,-0.1137,0.033,-3.408,0.001,-0.179,-0.048
L2.russell2000,0.0937,0.045,2.091,0.037,0.006,0.182
L3.sp500,-0.0107,0.033,-0.319,0.750,-0.076,0.055
L3.russell2000,-0.0007,0.045,-0.016,0.987,-0.089,0.087
L4.sp500,-0.0466,0.033,-1.397,0.162,-0.112,0.019
L4.russell2000,0.0332,0.045,0.742,0.458,-0.054,0.121
L5.sp500,-0.0007,0.033,-0.020,0.984,-0.066,0.065

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
L1.sp500,-0.0354,0.025,-1.427,0.154,-0.084,0.013
L1.russell2000,-0.0008,0.033,-0.025,0.980,-0.066,0.065
L2.sp500,-0.0329,0.025,-1.323,0.186,-0.082,0.016
L2.russell2000,0.0340,0.033,1.019,0.308,-0.031,0.099
L3.sp500,0.0258,0.025,1.037,0.300,-0.023,0.075
L3.russell2000,-0.0329,0.033,-0.986,0.324,-0.098,0.032
L4.sp500,-0.0167,0.025,-0.675,0.500,-0.065,0.032
L4.russell2000,-0.0081,0.033,-0.243,0.808,-0.073,0.057
L5.sp500,0.0043,0.025,0.175,0.861,-0.044,0.053

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,-0.0026,0.002,-1.506,0.132,-0.006,0.001

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0003,0.001,0.272,0.785,-0.002,0.003

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
beta.1,1.0000,0,0,0.000,1.000,1.000
beta.2,-1.7116,0.041,-41.363,0.000,-1.793,-1.630


## Forecast

In [14]:
# Point forecasts for the 5 steps following the training sample: one row per
# step, one column per series in `train_df`.
vecm_res.predict(steps=5)

array([[201.37700974, 124.89139803],
       [201.4437947 , 124.90378792],
       [201.55596951, 124.9417114 ],
       [201.58487933, 124.94485203],
       [201.63053241, 124.95585159]])

In [15]:
# With `alpha` given, predict() yields three arrays: the point forecast and the
# lower/upper interval bounds. alpha=0.05 -> 95% confidence level.
interval_forecast = vecm_res.predict(steps=5, alpha=0.05)
band_labels = ("forecast", "lower", "upper")
for position, label in enumerate(band_labels):
    print(f"{label}:\n{interval_forecast[position]}")

forecast:
[[201.37700974 124.89139803]
 [201.4437947  124.90378792]
 [201.55596951 124.9417114 ]
 [201.58487933 124.94485203]
 [201.63053241 124.95585159]]
lower:
[[199.12320046 123.21447279]
 [198.33948421 122.5826116 ]
 [197.84844053 122.12231172]
 [197.36462885 121.70398766]
 [196.97056885 121.36182567]]
upper:
[[203.63081901 126.56832327]
 [204.5481052  127.22496423]
 [205.2634985  127.76111108]
 [205.80512982 128.1857164 ]
 [206.29049598 128.54987751]]
