In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import NearestNeighbors
from IPython.display import display

## PCA with 20 components and candidate pair filtering


In [7]:
# 1) Read the cleaned price table (described in the original notes as 10-year
#    daily adjusted closes); the first CSV column becomes the parsed date index.
prices = pd.read_csv(
    "Data/sp500_prices_clean.csv",
    index_col=0,
    parse_dates=True,
)

# 2) Convert prices to simple period-over-period returns, discard any row that
#    still contains a NaN (the first row always does), then z-score the result.
#    StandardScaler works column-wise, so each column is scaled independently.
returns = prices.pct_change().dropna()
scaler = StandardScaler()
scaler.fit(returns)
returns_scaled = pd.DataFrame(
    data=scaler.transform(returns),
    columns=returns.columns,
    index=returns.index,
)

# 3) PCA embedding with `n_components` principal components.
#    The standardized returns are transposed before fitting, so each column of
#    `returns_scaled` becomes one sample (one row of the embedding).
n_components = 20
# Pin the seed: with the default svd_solver="auto", scikit-learn may select the
# randomized SVD solver for large inputs, in which case the embeddings (and the
# CSV / neighbour search built on them) would differ from run to run.
PCA_SEED = 42
pca = PCA(n_components=n_components, random_state=PCA_SEED)
embeddings = pca.fit_transform(returns_scaled.T)  # shape = (n_tickers, n_components)
embeddings_df = pd.DataFrame(
    embeddings,
    index=returns_scaled.columns,
    columns=[f"PC{i+1}" for i in range(n_components)]
)

# 4) Save PCA embeddings; the file name follows n_components so that changing
#    the component count cannot silently overwrite a differently sized result.
embeddings_df.to_csv(f"Data/pca_embeddings_{n_components}pc.csv")

# 5) Display cumulative explained variance (running sum of the per-component
#    explained-variance ratios, labelled by the range of components included)
explained = pd.Series(
    pca.explained_variance_ratio_.cumsum(),
    index=[f"PC1..PC{i+1}" for i in range(n_components)],
    name="Cumulative Explained Variance"
)
display(explained)

# 6) Candidate pairs via k-NN in PCA space
k_neighbors = 5
tickers = embeddings_df.index
# Request one extra neighbour because each point is normally returned as its own
# nearest neighbour; cap at the number of rows so a small universe does not
# make kneighbors() raise for asking for more neighbours than samples.
n_query = min(k_neighbors + 1, len(tickers))
nbrs = NearestNeighbors(n_neighbors=n_query, algorithm="auto").fit(embeddings_df.values)
distances, indices = nbrs.kneighbors(embeddings_df.values)

pairs = set()
for i, ticker in enumerate(tickers):
    # Remove the query point by index, not by position: when embeddings are
    # duplicated/tied the point itself is not guaranteed to come back first,
    # and slicing [1:] would then keep a (ticker, ticker) self-pair and drop
    # a genuine neighbour.
    neighbor_ids = [j for j in indices[i] if j != i][:k_neighbors]
    for neighbor_idx in neighbor_ids:
        neighbor_ticker = str(tickers[neighbor_idx])
        # Sort inside the pair so (A, B) and (B, A) collapse to one entry
        pair = tuple(sorted([str(ticker), neighbor_ticker]))
        pairs.add(pair)
pairs_list = sorted(pairs)

# 7) Save candidate pairs; the file name is derived from the embedding width
#    so it stays in step with the number of principal components actually used.
cand_df = pd.DataFrame(pairs_list, columns=["ticker_i", "ticker_j"])
cand_df.to_csv(f"Data/pca_candidate_pairs_{embeddings_df.shape[1]}pc.csv", index=False)

# 8) Display results
# Report the path the embeddings are actually written to ("Data/", capital D)
# and take the component count from the embedding itself instead of
# hard-coding it in the messages.
n_pcs = embeddings_df.shape[1]
print(f"PCA embeddings saved to Data/pca_embeddings_{n_pcs}pc.csv")
print(f"Total candidate pairs identified with {n_pcs} PCs: {len(pairs_list)}")
display(embeddings_df.head())


PC1..PC1     0.083563
PC1..PC2     0.147119
PC1..PC3     0.179245
PC1..PC4     0.208209
PC1..PC5     0.228547
PC1..PC6     0.247810
PC1..PC7     0.264004
PC1..PC8     0.278590
PC1..PC9     0.291552
PC1..PC10    0.302967
PC1..PC11    0.313818
PC1..PC12    0.324164
PC1..PC13    0.333284
PC1..PC14    0.341224
PC1..PC15    0.348714
PC1..PC16    0.356038
PC1..PC17    0.363242
PC1..PC18    0.370047
PC1..PC19    0.376465
PC1..PC20    0.382620
Name: Cumulative Explained Variance, dtype: float64

PCA embeddings saved to data/pca_embeddings_20pc.csv
Total candidate pairs identified with 20 PCs: 1568


Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
A,0.267814,13.454074,0.975077,-6.498644,2.211398,3.372235,2.514132,11.840944,2.965063,-4.041386,-3.155666,-4.274598,3.573032,-6.467953,-5.818936,-1.658727,3.286465,3.547739,-0.072888,-0.583184
AAPL,-0.584792,17.083443,-0.520222,-0.508664,1.388502,5.500516,-5.532746,-5.159201,1.122778,-1.21468,2.406796,-1.201884,-6.4199,3.667526,2.220747,0.108454,-0.675871,2.379666,-0.941851,-1.130634
ABBV,6.432043,2.668159,13.114579,1.819186,9.822688,-4.939985,3.831336,8.347855,6.654378,0.117289,5.003951,-1.589319,-2.664982,2.058828,2.220681,-7.258594,-3.39661,3.945699,5.871314,-1.550177
ABT,10.535195,11.135882,5.513221,-9.428304,8.355397,0.445629,1.752958,10.601081,4.361465,-3.876133,-4.062489,-3.799962,-0.353175,-0.492893,-2.791174,-3.549442,0.343701,-1.401815,3.199215,0.952113
ACGL,0.314608,-13.392712,-1.429299,-7.612791,10.62477,-0.963458,-1.540804,-7.701081,1.161206,5.680977,3.094275,3.773908,8.344616,0.690675,-9.042751,-4.347709,-4.021045,-6.056199,-5.336872,-1.739228
