In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.vector_ar.vecm import VECM, select_order, select_coint_rank
from time_series_utils import TSA
import matplotlib.pyplot as plt

In [2]:
# Sample window requested from the data provider (naive calendar dates).
# NOTE(review): yfinance documents `end` as exclusive, so 2019-12-31 itself is
# presumably not part of the download - confirm if the last day matters.
start_date = pd.Timestamp(year=2000, month=1, day=1)
end_date = pd.Timestamp(year=2019, month=12, day=31)

In [3]:
# Daily price history from Yahoo Finance for the two ETFs used as index proxies:
#   "SPY": SPDR S&P 500 ETF Trust
#   "IWM": iShares Russell 2000 ETF
# `auto_adjust=False` is passed explicitly so that the "Adj Close" column used
# later in this notebook is always returned; newer yfinance releases default to
# auto_adjust=True, which drops that column and would break a fresh re-run.
sp500_df = yf.download("SPY", start=start_date, end=end_date, auto_adjust=False)
russell2000_df = yf.download("IWM", start=start_date, end=end_date, auto_adjust=False)
display(sp500_df.head())
display(russell2000_df.head())
# Row counts of the two downloads; they differ when the histories do not start
# on the same date.
sp500_df.shape[0], russell2000_df.shape[0]

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03 00:00:00-05:00,148.25,148.25,143.875,145.4375,95.3088,8164300
2000-01-04 00:00:00-05:00,143.53125,144.0625,139.640625,139.75,91.58165,8089800
2000-01-05 00:00:00-05:00,139.9375,141.53125,137.25,140.0,91.745491,12177900
2000-01-06 00:00:00-05:00,139.625,141.5,137.75,137.75,90.271011,6227200
2000-01-07 00:00:00-05:00,140.3125,145.75,140.0625,145.75,95.513634,8066500


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-05-26 00:00:00-04:00,45.53125,45.71875,45.3125,45.71875,33.982517,74800
2000-05-30 00:00:00-04:00,46.375,47.40625,46.375,47.40625,35.236828,57600
2000-05-31 00:00:00-04:00,47.5625,48.1875,47.5625,47.578125,35.364574,36000
2000-06-01 00:00:00-04:00,48.554688,48.65625,48.554688,48.65625,36.165951,7000
2000-06-02 00:00:00-04:00,50.859375,51.1875,50.859375,51.1875,38.047409,29400


(5030, 4929)

In [4]:
# Pull the adjusted close of each ETF and line the two series up side by side.
# The inner join keeps only the dates that appear in both histories.
sp500 = sp500_df.loc[:, "Adj Close"]
russell2000 = russell2000_df.loc[:, "Adj Close"]
pair_df = pd.concat(objs=[sp500, russell2000], axis="columns", join="inner")
pair_df.columns = ["sp500", "russell2000"]
pair_df

Unnamed: 0_level_0,sp500,russell2000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-05-26 00:00:00-04:00,90.664680,33.982517
2000-05-30 00:00:00-04:00,93.621147,35.236828
2000-05-31 00:00:00-04:00,93.826431,35.364574
2000-06-01 00:00:00-04:00,95.468903,36.165951
2000-06-02 00:00:00-04:00,97.131958,38.047409
...,...,...
2019-12-23 00:00:00-05:00,306.215393,160.106918
2019-12-24 00:00:00-05:00,306.225006,160.472794
2019-12-26 00:00:00-05:00,307.855103,160.472794
2019-12-27 00:00:00-05:00,307.778778,159.683334


In [5]:
# Chronological hold-out split: rows dated before the split date are used for
# fitting, rows on or after it are set aside as the test period.
split_date = "2017-01-01"
train_df = pair_df[pair_df.index < split_date]
test_df = pair_df[pair_df.index >= split_date]

In [6]:
# Copy of the training frame with the date index replaced by a plain 0..n-1
# integer index; the statsmodels lag-order selection further down warns when it
# is given the date index directly.
train_df_ = train_df.reset_index(drop=True)

# Stationarity Test

In [7]:
# ADF unit-root test on the S&P 500 training prices (levels).
# `TSA` comes from the local `time_series_utils` module; judging by the names it
# is unpacked into, the result is (statistic, p-value, chosen lag, deterministic
# terms) - presumed from naming, confirm against the helper.
sp500_tsa = TSA(train_df_["sp500"].to_numpy())
ADF_stats, ADF_p_value, best_lag, terms = sp500_tsa.ADF_test_complete()
(ADF_stats, ADF_p_value, best_lag, terms)

(1.9040112761652597, 0.9873091437053468, 5, 'n')

In [8]:
# Same ADF unit-root test, now on the Russell 2000 training prices (levels).
# NOTE: the result names (ADF_stats, ADF_p_value, best_lag, terms) are re-bound
# here, overwriting whatever an earlier test cell stored in them.
russell2000_tsa = TSA(train_df_["russell2000"].to_numpy())
ADF_stats, ADF_p_value, best_lag, terms = russell2000_tsa.ADF_test_complete()
(ADF_stats, ADF_p_value, best_lag, terms)

(1.7672407460561346, 0.9820028300411496, 5, 'n')

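Both sp500 and russell2000 are non-stationary: the ADF test fails to reject the unit-root null hypothesis for either series (p ≈ 0.99 and p ≈ 0.98).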

# Cointegration Test

In [9]:
# Two-step residual-based cointegration check:
#   1) regress sp500 on russell2000 (no intercept is added to the regressors),
#   2) run the ADF test on the regression residual.
# NOTE(review): the residual comes from an estimated regression, so if the
# helper uses ordinary ADF critical values the reported p-value is only
# indicative - confirm, or cross-check with a dedicated cointegration test.
ols = sm.OLS(
    endog=train_df_["sp500"].to_numpy(),
    exog=train_df_["russell2000"].to_numpy(),
)
est = ols.fit()
residue = est.resid
residue_tsa = TSA(residue)
ADF_stats, ADF_p_value, best_lag, terms = residue_tsa.ADF_test_complete()
(ADF_stats, ADF_p_value, best_lag, terms)

(-3.2068729784290153, 0.0013460146676103292, 4, 'n')

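The residual is stationary (ADF p ≈ 0.001), so sp500 and russell2000 appear to be cointegrated. Because the residual comes from an estimated regression, an ordinary ADF p-value is only indicative here; the Johansen-type rank test in the "Cointegration Rank" section provides a formal check.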

In [10]:
# TODO (add-on): run a Johansen test here as a second cointegration check.

# VAR / VECM

$$\Delta y_t = \alpha \beta^T y_{t-1} + \Gamma_1 \Delta y_{t-1} + \dots + \Gamma_{p-1} \Delta y_{t-p+1} + u_t$$
where $\alpha, \beta \in \mathbb{R}^{K \times r}$ and $\Gamma_i \in \mathbb{R}^{K \times K}$ for $i = 1, \dots, p-1$ are the parameters and $u_t$ is $K$-dimensional white noise. Both $\alpha$ and $\beta$ have rank $r$, the so-called cointegration rank.

## Choose Deterministic Terms

The residual ADF test on the cointegration relation between sp500 and russell2000 selected `terms = 'n'`, so no deterministic terms need to be added inside the cointegration relation.

In [11]:
# Deterministic-term code passed to the VECM lag-order selection and model fit.
# statsmodels option codes:
#   "n"  - no deterministic terms
#   "co" - constant outside the cointegration relation
#   "ci" - constant inside the cointegration relation
#   "lo" - linear trend outside the cointegration relation
#   "li" - linear trend inside the cointegration relation
deterministic = "n"

## Select Lag Order

In [12]:
# Compare lag orders up to `maxlags` by information criterion (AIC/BIC/FPE/HQIC).
# The integer-indexed copy `train_df_` is used on purpose: passing the frame
# without `.reset_index(drop=True)` triggers a warning about the date index.
lag_order = select_order(
    train_df_,
    maxlags=10,
    deterministic=deterministic,
)
lag_order.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-1.552,-1.542*,0.2119,-1.548
1.0,-1.553,-1.537,0.2117,-1.547
2.0,-1.559,-1.538,0.2103,-1.552*
3.0,-1.559,-1.532,0.2103,-1.550
4.0,-1.559,-1.526,0.2103,-1.548
5.0,-1.560*,-1.520,0.2102*,-1.546
6.0,-1.559,-1.513,0.2104,-1.543
7.0,-1.557,-1.506,0.2107,-1.539
8.0,-1.557,-1.499,0.2108,-1.536


In [13]:
# The string form of the result lists the lag order chosen by each criterion.
print(lag_order)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 5, BIC -> 0, FPE -> 5, HQIC ->  2>


In [14]:
# Selected lag orders as plain integers, in the order (AIC, BIC, FPE, HQIC).
# The AIC choice is the one fed to `k_ar_diff` in the rank test and VECM below.
lag_order.aic, lag_order.bic, lag_order.fpe, lag_order.hqic

(5, 0, 5, 2)

## Cointegration Rank

In [16]:
# Cointegration-rank selection at the 5% level, using the AIC-selected number
# of lagged differences.
# `det_order=-1`: no deterministic terms. This is hard-coded separately from the
# `deterministic` setting used elsewhere, so the two must be kept in sync by hand.
rank_test = select_coint_rank(
    endog=train_df_,
    det_order=-1,
    k_ar_diff=lag_order.aic,
    signif=0.05,
)
rank_test.rank

1

In [17]:
# Sequential test table: each row compares the test statistic for a candidate
# rank r_0 with its critical value; the selected rank is the first r_0 whose
# statistic falls below the critical value.
rank_test.summary()

r_0,r_1,test statistic,critical value
0,2,17.87,12.32
1,2,3.588,4.13


## Model

In [18]:
# VECM specification built from the choices made above: lagged differences from
# the AIC lag selection, cointegration rank from the rank test, and the
# deterministic-term code chosen earlier.
vecm_spec = dict(
    k_ar_diff=lag_order.aic,
    coint_rank=rank_test.rank,
    deterministic=deterministic,
)
vecm = VECM(train_df_, **vecm_spec)

In [19]:
# Estimate the VECM on the training data and show the coefficient tables:
# lagged-difference terms for each equation, the error-correction loadings
# (`ec1`), and the cointegration vector (`beta`).
vecm_res = vecm.fit()
vecm_res.summary()

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
L1.sp500,-0.0739,0.033,-2.216,0.027,-0.139,-0.009
L1.russell2000,0.0314,0.045,0.699,0.484,-0.057,0.119
L2.sp500,-0.1137,0.033,-3.408,0.001,-0.179,-0.048
L2.russell2000,0.0937,0.045,2.091,0.037,0.006,0.182
L3.sp500,-0.0107,0.033,-0.319,0.750,-0.076,0.055
L3.russell2000,-0.0007,0.045,-0.016,0.987,-0.089,0.087
L4.sp500,-0.0465,0.033,-1.397,0.162,-0.112,0.019
L4.russell2000,0.0332,0.045,0.742,0.458,-0.054,0.121
L5.sp500,-0.0007,0.033,-0.020,0.984,-0.066,0.065

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
L1.sp500,-0.0354,0.025,-1.427,0.154,-0.084,0.013
L1.russell2000,-0.0008,0.033,-0.025,0.980,-0.066,0.065
L2.sp500,-0.0329,0.025,-1.323,0.186,-0.082,0.016
L2.russell2000,0.0340,0.033,1.019,0.308,-0.031,0.099
L3.sp500,0.0258,0.025,1.037,0.300,-0.023,0.075
L3.russell2000,-0.0329,0.033,-0.986,0.324,-0.098,0.032
L4.sp500,-0.0167,0.025,-0.675,0.500,-0.065,0.032
L4.russell2000,-0.0081,0.033,-0.243,0.808,-0.073,0.057
L5.sp500,0.0043,0.025,0.175,0.861,-0.044,0.053

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,-0.0026,0.002,-1.506,0.132,-0.006,0.001

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
ec1,0.0003,0.001,0.272,0.785,-0.002,0.003

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
beta.1,1.0000,0,0,0.000,1.000,1.000
beta.2,-1.7116,0.041,-41.363,0.000,-1.793,-1.630


## Forecast

In [20]:
# Point forecast for the next 5 steps after the end of the training sample.
# One row per step; columns presumably follow the order of `train_df_`
# (sp500, russell2000) - confirm against the statsmodels documentation.
vecm_res.predict(steps=5)

array([[201.3769892 , 124.89137856],
       [201.44377724, 124.90376934],
       [201.55594528, 124.94169181],
       [201.58485911, 124.94483592],
       [201.63051482, 124.95583711]])

In [21]:
# With `alpha` set, predict returns the point forecast together with the lower
# and upper interval bounds; alpha=0.05 corresponds to a 95% confidence level.
forecast, lower, upper = vecm_res.predict(steps=5, alpha=0.05)
labelled = (("forecast", forecast), ("lower", lower), ("upper", upper))
for text, nd in labelled:
    print(f"{text}:\n{nd}")

forecast:
[[201.3769892  124.89137856]
 [201.44377724 124.90376934]
 [201.55594528 124.94169181]
 [201.58485911 124.94483592]
 [201.63051482 124.95583711]]
lower:
[[199.12317948 123.21445385]
 [198.33946675 122.58259367]
 [197.84841587 122.12229291]
 [197.3646093  121.70397235]
 [196.97055175 121.36181156]]
upper:
[[203.63079892 126.56830327]
 [204.54808773 127.22494501]
 [205.2634747  127.76109071]
 [205.80510892 128.18569949]
 [206.29047789 128.54986266]]
