## Pair Selection: Cointegration Screening of Candidate Pairs

In [1]:
import os
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller
from IPython.display import display

In [2]:
# Screen the candidate pairs for cointegration: for each (ticker_i, ticker_j)
# fit a no-intercept hedge ratio, run an ADF test on the resulting spread and
# keep the pairs whose p-value falls below ADF_PVALUE_THRESHOLD.

# Paths (relative to project root)
CLEAN_CSV       = "Data/sp500_prices_clean.csv"
CANDIDATES_CSV  = "Data/pca_candidate_pairs_20pc.csv"
OUTPUT_DIR      = "Output"
OUTPUT_CSV      = os.path.join(OUTPUT_DIR, "cointegrated_pairs.csv")

# Create outputs folder if needed
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ADF p-value threshold for stationarity
ADF_PVALUE_THRESHOLD = 0.05

# Column layout of the saved result. Fixing it explicitly keeps the CSV header
# present (and readable by pd.read_csv) even when no pair passes the test.
RESULT_COLUMNS = ["ticker_i", "ticker_j", "beta", "adf_pvalue"]


# 1. Load cleaned price series (dates × tickers)
prices = pd.read_csv(CLEAN_CSV, index_col=0, parse_dates=True)

# 2. Load candidate pairs identified by PCA+NN
pairs_df = pd.read_csv(CANDIDATES_CSV)


# COINTEGRATION TEST

results = []
skipped = []  # (ticker_i, ticker_j) pairs that could not be tested

for ti, tj in zip(pairs_df["ticker_i"], pairs_df["ticker_j"]):
    # A candidate may reference a ticker that is absent from the price file;
    # skip it instead of aborting the whole screen with a KeyError.
    if ti not in prices.columns or tj not in prices.columns:
        skipped.append((ti, tj))
        continue

    # Keep only the dates on which BOTH legs have a price. NaNs in the
    # regression inputs would otherwise propagate into beta (or make the
    # least-squares solver fail) before the spread is ever cleaned.
    aligned = pd.concat([prices[ti], prices[tj]], axis=1, keys=["i", "j"]).dropna()
    if aligned.empty:
        skipped.append((ti, tj))
        continue
    pi = aligned["i"]
    pj = aligned["j"]

    # Estimate hedge ratio β via OLS (no intercept): pi ≈ β * pj
    beta = np.linalg.lstsq(pj.values.reshape(-1, 1), pi.values, rcond=None)[0][0]

    # Compute spread series
    spread = pi - beta * pj

    # ADF test on the spread (already NaN-free after the alignment above).
    # NOTE(review): `spread` is a residual from an estimated regression, so
    # the plain ADF p-value is presumably optimistic compared with
    # Engle-Granger critical values — consider statsmodels' `coint`; confirm
    # before relying on the 0.05 cut-off.
    adf_stat, pval, *_ = adfuller(spread)

    # Keep only if p-value indicates stationarity
    if pval < ADF_PVALUE_THRESHOLD:
        results.append({
            "ticker_i": ti,
            "ticker_j": tj,
            "beta": beta,
            "adf_pvalue": pval
        })


# SAVE & REPORT

coin_df = pd.DataFrame(results, columns=RESULT_COLUMNS)
coin_df.to_csv(OUTPUT_CSV, index=False)

print(f"Tested {len(pairs_df)} candidate pairs.")
if skipped:
    print(f"Skipped {len(skipped)} pairs with missing or non-overlapping price data.")
print(f"Found {len(coin_df)} cointegrated pairs (p < {ADF_PVALUE_THRESHOLD}).")
print(f"Saved results to '{OUTPUT_CSV}'.")


Tested 1568 candidate pairs.
Found 144 cointegrated pairs (p < 0.05).
Saved results to 'Output/cointegrated_pairs.csv'.


In [3]:


# Preview: hedge ratio and ADF p-value for the first few candidate pairs,
# shown WITHOUT applying the p-value threshold so rejected pairs are visible.

# Number of candidate pairs shown in the preview table
PREVIEW_N_PAIRS = 10

# Reuse the price matrix (`prices`) and candidate list (`pairs_df`) already
# loaded from CLEAN_CSV / CANDIDATES_CSV instead of re-parsing the same files
# through a second set of hard-coded paths.
pairs = pairs_df

# Take the first PREVIEW_N_PAIRS candidate pairs for preview
sample_pairs = pairs.head(PREVIEW_N_PAIRS)


# Compute beta and ADF p-values for the sample
preview_results = []
for ti, tj in zip(sample_pairs['ticker_i'], sample_pairs['ticker_j']):
    # Restrict to dates where both legs are observed so that NaNs cannot
    # leak into the least-squares fit.
    aligned = pd.concat([prices[ti], prices[tj]], axis=1, keys=['i', 'j']).dropna()
    pi = aligned['i']
    pj = aligned['j']

    # No-intercept OLS hedge ratio: pi ≈ beta_hat * pj
    beta_hat = np.linalg.lstsq(pj.values.reshape(-1, 1), pi.values, rcond=None)[0][0]
    spread = pi - beta_hat * pj

    # ADF test on the spread; only the p-value is reported
    adf_stat, pval, *_ = adfuller(spread)
    preview_results.append({
        'ticker_i': ti,
        'ticker_j': tj,
        'beta': beta_hat,
        'adf_pvalue': pval
    })

preview_df = pd.DataFrame(preview_results)
display(preview_df)


Unnamed: 0,ticker_i,ticker_j,beta,adf_pvalue
0,A,DHR,0.578182,0.01892
1,A,IQV,0.589075,0.005301
2,A,MTD,0.10073,0.000969
3,A,RVTY,0.923344,0.471161
4,A,TECH,1.515118,0.164544
5,A,TMO,0.249378,0.031101
6,A,WAT,0.392516,0.143174
7,AAPL,ADBE,0.301457,0.980295
8,AAPL,ANSS,0.447377,0.977637
9,AAPL,CDNS,0.796949,0.107438
