In [38]:
import numpy as np
import pandas as pd
import yfinance as yf
import statsmodels.api as sm
from statsmodels.tsa.vector_ar.vecm import VECM, VAR, select_order, select_coint_rank, coint_johansen
from time_series_utils import TSA
import matplotlib.pyplot as plt

In [2]:
# Calendar window requested from the data provider for both ETFs.
start_date, end_date = (pd.Timestamp(day) for day in ("2000-01-01", "2019-12-31"))

In [15]:
# "SPY": SPDR S&P 500 ETF Trust
# "IWB": iShares Russell 1000 ETF 
# Fetch the daily history of both tickers over the same window (SPY first, then IWB).
sp500_df, russell1000_df = (
    yf.download(ticker, start=start_date, end=end_date) for ticker in ("SPY", "IWB")
)
# Preview the first rows of each frame, then report how many rows each one has.
display(sp500_df.head(), russell1000_df.head())
len(sp500_df), len(russell1000_df)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-01-03 00:00:00-05:00,148.25,148.25,143.875,145.4375,95.308838,8164300
2000-01-04 00:00:00-05:00,143.53125,144.0625,139.640625,139.75,91.581627,8089800
2000-01-05 00:00:00-05:00,139.9375,141.53125,137.25,140.0,91.745483,12177900
2000-01-06 00:00:00-05:00,139.625,141.5,137.75,137.75,90.270981,6227200
2000-01-07 00:00:00-05:00,140.3125,145.75,140.0625,145.75,95.51358,8066500


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2000-05-19 00:00:00-04:00,75.40625,75.40625,74.3125,74.3125,49.981689,8100
2000-05-22 00:00:00-04:00,74.609375,74.609375,72.078125,74.09375,49.834568,751300
2000-05-23 00:00:00-04:00,74.125,74.125,72.84375,72.84375,48.993839,271200
2000-05-24 00:00:00-04:00,72.890625,73.46875,72.0,73.46875,49.414192,17900
2000-05-25 00:00:00-04:00,74.265625,74.359375,72.921875,72.921875,49.046394,2100


(5030, 4934)

In [4]:
# Keep only the adjusted close of each ETF.
sp500, russell1000 = sp500_df["Adj Close"], russell1000_df["Adj Close"]
# Align the two series on the dates present in both (inner join) and give the
# resulting columns short, index-based names.
pair_df = pd.concat([sp500, russell1000], axis=1, join="inner").set_axis(
    ["sp500", "russell1000"], axis=1
)
pair_df

Unnamed: 0_level_0,sp500,russell1000
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2000-05-19 00:00:00-04:00,92.717773,49.981716
2000-05-22 00:00:00-04:00,92.019707,49.834576
2000-05-23 00:00:00-04:00,90.664665,48.993855
2000-05-24 00:00:00-04:00,92.142891,49.414185
2000-05-25 00:00:00-04:00,90.562012,49.046368
...,...,...
2019-12-23 00:00:00-05:00,306.215424,170.501556
2019-12-24 00:00:00-05:00,306.224945,170.453659
2019-12-26 00:00:00-05:00,307.855072,171.295959
2019-12-27 00:00:00-05:00,307.778809,171.257645


In [5]:
# Chronological split: rows dated before the split date form the training set,
# rows on or after it form the test set. The date index is discarded in both
# (replaced by a 0..n-1 integer index).
split_date = "2017-01-01"
train_df = pair_df[pair_df.index < split_date].reset_index(drop=True)
test_df = pair_df[pair_df.index >= split_date].reset_index(drop=True)

In [19]:
# Pairwise correlation of the two adjusted-close price levels over the training window.
train_df.corr()

Unnamed: 0,sp500,russell1000
sp500,1.0,0.999666
russell1000,0.999666,1.0


# VECM / VAR

$$\Delta y_t = \alpha \beta^T y_{t-1} + \Gamma_1 \Delta y_{t-1} + \dots + \Gamma_{p-1} \Delta y_{t-p+1} + u_t$$
where $\alpha, \beta \in \mathbb{R}^{K \times r}$ and $\Gamma_i \in \mathbb{R}^{K \times K}$ for $i = 1, \dots, p-1$ are the parameters and $u_t$ is $K$-dimensional white noise. Both $\alpha$ and $\beta$ have rank $r$, the so-called cointegration rank.

## Choose Deterministic Terms

In [6]:
# Deterministic terms used for the lag-order selection below.
# Single codes are "n", "ci", "co", "li", "lo"; per the statsmodels VECM docs they
# mean: no terms / constant inside / constant outside / linear trend inside /
# linear trend outside the cointegration relation. Codes can be concatenated, so
# "colo" = constant and linear trend, both outside the cointegration relation.
deterministic = "colo"

## Select Lag Order For SP500 and Russell1000

In [7]:
# Lag-order selection for lags up to `maxlags=10`, using the deterministic terms chosen above.
# `train_df` carries a plain integer index (`.reset_index(drop=True)`) on purpose:
# passing the date-indexed frame made statsmodels emit a warning about the date index.
lag_order = select_order(data=train_df, maxlags=10, deterministic=deterministic)
lag_order.summary()

0,1,2,3,4
,AIC,BIC,FPE,HQIC
0.0,-3.971,-3.956,0.01885,-3.966
1.0,-4.163,-4.142,0.01556,-4.156
2.0,-4.237,-4.209,0.01445,-4.227
3.0,-4.275,-4.242,0.01391,-4.263
4.0,-4.281,-4.242,0.01383,-4.267
5.0,-4.289,-4.244,0.01372,-4.273
6.0,-4.296,-4.245*,0.01362,-4.278*
7.0,-4.295,-4.237,0.01364,-4.275
8.0,-4.294,-4.230,0.01365,-4.272


In [8]:
# Text form of the selection result: one selected lag order per information criterion.
print(lag_order)

<statsmodels.tsa.vector_ar.var_model.LagOrderResults object. Selected orders are: AIC -> 10, BIC -> 6, FPE -> 10, HQIC ->  6>


In [9]:
# Selected lag order per criterion, in the order (AIC, BIC, FPE, HQIC).
# NOTE(review): AIC and FPE select 10, which equals `maxlags` — the search bound may
# be binding; consider re-running with a larger `maxlags` to confirm.
lag_order.aic, lag_order.bic, lag_order.fpe, lag_order.hqic

(10, 6, 10, 6)

## Cointegration Rank

In [27]:
# Cointegration-rank test at the 5% significance level, using the AIC-selected lag
# order as the number of lagged differences.
# `det_order=-1`: no deterministic terms
# NOTE(review): the lag order was selected with deterministic="colo", but the rank
# test here assumes no deterministic terms — confirm this mismatch is intended, since
# the choice of deterministic terms changes the critical values.
rank_test = select_coint_rank(train_df, det_order=-1, k_ar_diff=lag_order.aic, signif=0.05)
rank_test.rank

0

In [29]:
# Table of test statistic vs. critical value for each hypothesis that was tested.
rank_test.summary()

r_0,r_1,test statistic,critical value
0,2,12.11,12.32


## Error Correction Term Not Statistically Significant -> VAR($\Delta y_t$)

In [55]:
# Simple (percentage) returns of each price series, computed separately per split.
# `pct_change` leaves a NaN first row which `dropna` removes, so each split loses its
# first observation (for the test split, that is the return relative to the last
# training day). Columns are renamed to "sp500_rtn" / "russell1000_rtn".
train_rtn_df, test_rtn_df = (
    prices.pct_change().dropna().reset_index(drop=True).add_suffix("_rtn")
    for prices in (train_df, test_df)
)

In [60]:
# Stationarity check on the S&P 500 training returns via the project-local `TSA` helper.
# NOTE(review): `ADF_test_complete` is defined outside this notebook; from the unpacking
# it presumably returns (test statistic, p-value, selected lag, deterministic terms).
sp500_rtn_tsa = TSA(train_rtn_df["sp500_rtn"].values)
(ADF_stats, ADF_p_value, best_lag, terms) = sp500_rtn_tsa.ADF_test_complete()
# Show the four results as the cell output.
(ADF_stats, ADF_p_value, best_lag, terms)

(-50.685135980170244, 0.0, 1, 'n')

In [61]:
# Stationarity check on the Russell 1000 training returns via the project-local `TSA` helper.
# NOTE(review): `ADF_test_complete` is defined outside this notebook; from the unpacking
# it presumably returns (test statistic, p-value, selected lag, deterministic terms).
# The result names below overwrite those of the S&P 500 check.
russell1000_rtn_tsa = TSA(train_rtn_df["russell1000_rtn"].values)
(ADF_stats, ADF_p_value, best_lag, terms) = russell1000_rtn_tsa.ADF_test_complete()
# Show the four results as the cell output.
(ADF_stats, ADF_p_value, best_lag, terms)

(-49.616865117337554, 0.0, 1, 'n')

Both sp500 return and russell1000 return are stationary.  
(Using difference sequences is also reasonable. Here, we just want our variables to have a clearer financial meaning.)  
Then we choose lag order for sp500 return and russell1000 return and build VAR model.

## Model

In [79]:
# Fit a VAR on the training returns. No lag order is passed explicitly: with
# `ic="aic"` the fit chooses it (up to `maxlags=10`) by the AIC criterion.
var = VAR(train_rtn_df)
var_res = var.fit(maxlags=10, ic="aic") # lag order is chosen implicitly according to `ic`

In [80]:
# Regression summary of the fitted VAR: information criteria and per-equation coefficients.
var_res.summary()

  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Thu, 02, Feb, 2023
Time:                     19:47:08
--------------------------------------------------------------------
No. of Equations:         2.00000    BIC:                   -21.0281
Nobs:                     4170.00    HQIC:                  -21.0693
Log likelihood:           32184.6    FPE:                6.91684e-10
AIC:                     -21.0919    Det(Omega_mle):     6.84770e-10
--------------------------------------------------------------------
Results for equation sp500_rtn
                         coefficient       std. error           t-stat            prob
--------------------------------------------------------------------------------------
const                       0.000298         0.000190            1.568           0.117
L1.sp500_rtn               -0.453767         0.086866           -5.224           0.000
L1.russell1000_rtn          0.38

## Forecast

In [81]:
# Lag order actually used by the fitted VAR.
# NOTE(review): this rebinds `lag_order`, which earlier held the `select_order` result
# (an object with `.aic`, `.bic`, ...). After this cell the name is a plain integer, so
# re-running the cointegration-rank cell would fail. Consider a distinct name here.
lag_order = var_res.k_ar
lag_order

10

In [84]:
# Forecast the next 5 daily returns for both series (columns follow `train_rtn_df`).
# The forecast is seeded with the last `k_ar` training observations. The lag count is
# read from the fitted model itself rather than from the notebook-level `lag_order`
# name, which is bound to different kinds of objects at different points of the
# notebook and would break this cell if cells are re-run out of order.
var_res.forecast(train_rtn_df.iloc[-var_res.k_ar:].values, steps=5)

array([[0.00061862, 0.00053873],
       [0.00051399, 0.00065487],
       [0.00071225, 0.00065491],
       [0.00027241, 0.00027417],
       [0.00058742, 0.00058941]])

**Question:**  
Why is the error correction term statistically significant for the sp500 and russell2000 pair, but not statistically significant for the sp500 and russell1000 pair?

**My Answer:**  
Because the error correction term captures the correlation between large-cap stocks, while there is a big overlap in constituent stocks between sp500 and russell1000, so the remaining information captured by the error correction term is insignificant for the indices' PnL.