In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import requests
from io import StringIO
from itertools import combinations
from statsmodels.tsa.stattools import coint
import warnings
warnings.filterwarnings("ignore")

In [2]:
def get_sp500_tickers():
    """Return the S&P 500 ticker symbols listed on Wikipedia.

    Reads the first HTML table of the Wikipedia "List of S&P 500 companies"
    page and takes its ``Symbol`` column, in table order.

    Returns:
        list: the symbols, with ``BF.B`` and ``BRK.B`` removed.
    """
    # NOTE(review): presumably excluded because the dotted share-class form is
    # not accepted by the price provider -- confirm.
    excluded = ('BF.B', 'BRK.B')

    constituents = pd.read_html('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')[0]

    symbols = []
    for symbol in constituents['Symbol'].tolist():
        if symbol in excluded:
            continue
        symbols.append(symbol)
    return symbols


def get_stock_prices(tickers, start_date, end_date):
    """Download adjusted close prices and keep only fully populated tickers.

    Args:
        tickers: ticker symbols, forwarded to ``yf.download``.
        start_date: forwarded to ``yf.download`` as ``start``.
        end_date: forwarded to ``yf.download`` as ``end``.

    Returns:
        pandas.DataFrame: the ``'Adj Close'`` table of the download, with
        every column that contains at least one missing value removed.
    """
    # Ask for unadjusted data explicitly: recent yfinance releases adjust
    # prices by default and then no longer return the 'Adj Close' column that
    # is read below.
    data = yf.download(tickers, start=start_date, end=end_date, auto_adjust=False)

    # Return a new frame rather than calling dropna(inplace=True) on a
    # selection of `data`, which is a chained-assignment hazard in pandas.
    return data['Adj Close'].dropna(axis=1)


def find_cointegrated_pairs(stock_prices):
    """Run a cointegration test on every pair of columns.

    Args:
        stock_prices: pandas.DataFrame with one price column per ticker.

    Returns:
        dict: maps each ``(column_a, column_b)`` tuple, in the order produced
        by ``itertools.combinations``, to
        ``{'score': <test statistic>, 'p_value': <p-value>}`` as returned by
        ``coint``.
    """
    cointegration_scores = {}
    for pair in combinations(stock_prices.columns, 2):
        first, second = pair
        # Drop missing rows jointly so both series cover exactly the same
        # observations. Dropping them per series could hand `coint` inputs of
        # different lengths or with misaligned dates.
        aligned = stock_prices[[first, second]].dropna()

        score, p_value, _ = coint(aligned[first], aligned[second])
        cointegration_scores[pair] = {'score': score, 'p_value': p_value}
    return cointegration_scores


def select_top_pairs(cointegration_scores, top_n):
    """Return the ``top_n`` pairs with the strongest evidence of cointegration.

    Args:
        cointegration_scores: dict mapping a pair to a dict holding at least
            a ``'score'`` entry (the cointegration test statistic).
        top_n: maximum number of pairs to return.

    Returns:
        list: ``(pair, stats)`` tuples ordered from the most negative score
        to the least negative, truncated to ``top_n`` entries.
    """
    # The cointegration test statistic rejects the "no cointegration" null
    # more strongly the more negative it is, so the best pairs come first in
    # ascending order (descending order would pick the weakest ones).
    ranked = sorted(cointegration_scores.items(), key=lambda item: item[1]['score'])
    return ranked[:top_n]

In [None]:
# Date window forwarded to get_stock_prices, and the number of pairs to report.
start_date = '2023-01-01'
end_date = '2024-01-01'
top_n = 10

# Universe of symbols to analyse.
sp500_tickers = get_sp500_tickers()

# Price table for the window above (one column per ticker that survives the
# missing-value filter in get_stock_prices).
stock_prices = get_stock_prices(sp500_tickers, start_date, end_date)

# One cointegration test is run per pair of columns, so the number of tests
# grows quadratically with the number of columns in stock_prices.
cointegration_scores = find_cointegrated_pairs(stock_prices)

# Keep the first top_n entries of the ranking built by select_top_pairs.
top_pairs = select_top_pairs(cointegration_scores, top_n)

# Each printed item is a (pair, {'score': ..., 'p_value': ...}) tuple.
print("Top", top_n, "cointegrated pairs:")
for pair in top_pairs:
    print(pair)

[*********************100%%**********************]  500 of 500 completed
