In [2]:
# Hands-on Assignment: Advanced Statistical Reasoning in Quant Signals
# ----------------------------------------------------------------------
# This notebook is designed to help you APPLY the statistical tools you've learned.
# It covers cointegration, Granger causality, regime filtering, hypothesis testing,
# entropy, surprise detection, IC scoring, and risk-adjusted evaluation.
#
# You will use real financial data (from yfinance) to build, test, and validate alpha signals.
# This file contains comments only — you must implement all code.



In [48]:

# -------------------------------
# STEP 0: IMPORTS + SETUP
# -------------------------------
# Import all necessary libraries: pandas, numpy, yfinance, matplotlib, scipy.stats, statsmodels
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import yfinance as yf
import scipy.stats as sc
import statsmodels.api as sm
from statsmodels.tsa.stattools import coint, adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from itertools import combinations

In [100]:
# -------------------------------
# STEP 1: DATA ACQUISITION
# -------------------------------
# Download daily adjusted close price data for the following tickers over the last 5 years:
# ['SPY', 'QQQ', 'IWM', 'AAPL', 'MSFT', 'NVDA', 'VIX', 'XLK', 'XLF']
# Use yfinance. Align the data on dates. Drop missing values.
tickers = ['SPY', 'QQQ', 'IWM', 'AAPL', 'MSFT', 'NVDA', '^VIX', 'XLK', 'XLF']
data_dict = {}

for ticker in tickers:
  data = yf.Ticker(ticker)
  data_dict[ticker] = data.history(
    start="2010-01-01",
    end="2017-01-01",    # end is exclusive, so this gives you through 2016-12-31
    interval="1d",       # daily bars
    auto_adjust=True     # optional: get adjusted prices
    )


In [101]:
# -------------------------------
# STEP 2: COINTEGRATION TESTING
# -------------------------------
# Choose any two tickers (e.g., SPY and QQQ)
# 1. Test if they are non-stationary (ADF test)
# 2. Run the Engle-Granger cointegration test between them
# 3. If cointegrated, plot the residual spread over time
# 4. Mark Z-score ±2 levels on the spread plot (for future signal ideas)


In [102]:
def stationarity_summary(series):
  adf_stat, adf_p, _, _, _, _ = adfuller(series)
  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')
  return pd.Series({
      "ADF_p": adf_p,
      "ADF_stat": adf_stat,
      "KPSS_p": kpss_p,
      "KPSS_stat": kpss_stat
      })

In [103]:

Xt = data_dict['AAPL']['Close']
Yt = data_dict['SPY']['Close']
X_results = stationarity_summary(Xt)
Y_results = stationarity_summary(Yt)


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')


In [104]:
def compute_charts(Series):
  plt.figure(figsize=(10,6))
  plt.plot(Series, label="Level")
  plt.legend()
  plt.show()

  plot_acf(Series, lags=40); plt.title("ACF of Level"); plt.show()
  plot_pacf(Series, lags=40); plt.title("PACF of Level"); plt.show()

In [105]:
def compute_residuals(X, Y):
  X = sm.add_constant(X)
  model = sm.OLS(Y, X).fit()
  return model.resid

In [36]:
dXt = Xt.diff().dropna()
dYt = Yt.diff().dropna()

In [37]:
resid_dX_dY = compute_residuals(dXt, dYt)

In [106]:
coint_pairs = []

In [107]:
for ticker1, ticker2 in combinations(tickers, 2):
  Xt = data_dict[ticker1]['Close']
  Yt = data_dict[ticker2]['Close']
  stats, p_val, _ = coint(Xt, Yt)
  if p_val <= 0.05:
    coint_pairs.append((ticker1, ticker2))

In [108]:
data_dict['^VIX'].index = data_dict['^VIX'].index.tz_convert("America/New_York").normalize()

In [109]:
common = data_dict['XLK']['Close'].index.intersection(data_dict['^VIX']['Close'].index)

In [110]:
stat_err_pairs = []

In [111]:
for pair in coint_pairs:
  ticker1 = pair[0]
  ticker2 = pair[1]
  Xt = data_dict[ticker1]['Close']
  Yt = data_dict[ticker2]['Close']
  residuals = compute_residuals(Xt, Yt)
  summary_tests = stationarity_summary(residuals)
  if (summary_tests.ADF_p < 0.05) | (summary_tests.KPSS_p > 0.05):
    stat_err_pairs.append((ticker1, ticker2, residuals))


look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')
look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')


In [112]:
def stationarity_summary_trend(series):
  adf_stat, adf_p, _, _, _, _ = adfuller(series, regression="ct")
  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')
  return pd.Series({
      "ADF_p": adf_p,
      "ADF_stat": adf_stat,
      "KPSS_p": kpss_p,
      "KPSS_stat": kpss_stat
      })

In [115]:
print(coint_pairs)

[('^VIX', 'XLK'), ('^VIX', 'XLF')]


In [121]:
Xt = data_dict[coint_pairs[0][0]]['Close']
Yt = data_dict[coint_pairs[0][1]]['Close']

In [116]:
resid = compute_residuals(data_dict[coint_pairs[0][0]]['Close'], data_dict[coint_pairs[0][1]]['Close'])

In [117]:
stationarity_summary_trend(resid)

look-up table. The actual p-value is smaller than the p-value returned.

  kpss_stat, kpss_p, _, _ = kpss(series, regression='c')


Unnamed: 0,0
ADF_p,0.059524
ADF_stat,-3.342724
KPSS_p,0.01
KPSS_stat,5.356834


In [132]:
resid_lag = resid.shift().fillna(0)

In [133]:
dXt = Xt.diff().fillna(0)
dYt = Yt.diff().fillna(0)

In [135]:
ecm_df = pd.DataFrame({"dy": dYt, "dx": dXt, "resid_lag": resid_lag})

In [136]:
print("NaNs per column:\n", ecm_df.isna().sum())
print("Infinite values per column:\n", np.isinf(ecm_df.values).sum(axis=0))

NaNs per column:
 dy           0
dx           0
resid_lag    0
dtype: int64
Infinite values per column:
 [0 0 0]


In [137]:
ecm_model = sm.OLS(ecm_df["dy"], sm.add_constant(ecm_df[["dx", "resid_lag"]])).fit()
print("\nError Correction Model Summary:")
print(ecm_model.summary())


Error Correction Model Summary:
                            OLS Regression Results                            
Dep. Variable:                     dy   R-squared:                       0.533
Model:                            OLS   Adj. R-squared:                  0.532
Method:                 Least Squares   F-statistic:                     1003.
Date:                Sun, 18 May 2025   Prob (F-statistic):          1.88e-291
Time:                        03:13:15   Log-Likelihood:                 336.59
No. Observations:                1762   AIC:                            -667.2
Df Residuals:                    1759   BIC:                            -650.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.01

Almost 0 coef for resid_lag, no mean reverting behavior
But, strong coef of delta dx: There might be causalty
We will go ahead of testing for Grangers.