In [None]:
import pandas as pd
import yfinance as yf
from datetime import datetime
import numpy as np
import seaborn as sn
from matplotlib.pyplot import figure
from statsmodels.tsa.stattools import coint
from statsmodels.tsa.stattools import adfuller

In [None]:
def get_sp500_tickers():
    """Return the S&P 500 constituents as Yahoo Finance style ticker symbols.

    The symbols are scraped from the constituents table on Wikipedia.
    Wikipedia writes share classes with a dot (e.g. ``BRK.B``) while Yahoo
    Finance, and therefore ``yfinance``, expects a hyphen (``BRK-B``), so the
    symbols are normalised before being returned.

    Returns:
        list[str]: Ticker symbols in table order.

    Raises:
        KeyError: If no table on the page has a 'Symbol' column.
    """
    url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
    tables = pd.read_html(url)

    # Pick the table by content rather than position so that a notice or
    # navigation table added above the constituents list does not break us.
    table = next((t for t in tables if 'Symbol' in t.columns), None)
    if table is None:
        raise KeyError("No table with a 'Symbol' column found at " + url)

    symbols = table['Symbol'].dropna().astype(str).str.strip()
    # regex=False: the dot must be treated literally, not as "any character".
    return symbols.str.replace('.', '-', regex=False).tolist()

In [None]:
def get_historical_data(tickers, start="2022-01-01", end="2024-09-10"):
    """Download adjusted close prices for each ticker into one DataFrame.

    Args:
        tickers: Iterable of Yahoo Finance ticker symbols.
        start: First date to request (passed through to ``yf.download``).
        end: End date to request (passed through to ``yf.download``).

    Returns:
        pandas.DataFrame: One column per ticker that returned data, indexed
        by the union of all downloaded dates. Tickers whose download came
        back empty (delisted, unknown symbol, ...) are omitted. An empty
        DataFrame is returned when nothing could be downloaded.
    """
    prices = {}
    for ticker in tickers:
        # auto_adjust=False keeps the separate 'Adj Close' column; recent
        # yfinance releases default to auto_adjust=True and drop it.
        stock_data = yf.download(ticker, start=start, end=end, auto_adjust=False)
        if stock_data is None or stock_data.empty:
            continue

        # Works for both flat columns and the (field, ticker) MultiIndex that
        # newer yfinance versions return even for a single symbol.
        fields = stock_data.columns.get_level_values(0)
        if 'Adj Close' in fields:
            field = 'Adj Close'
        elif 'Close' in fields:
            field = 'Close'
        else:
            continue

        series = stock_data[field]
        if isinstance(series, pd.DataFrame):
            # MultiIndex case: selecting the field leaves a one-column frame.
            series = series.iloc[:, 0]
        prices[ticker] = series

    if not prices:
        return pd.DataFrame()

    # A single concat avoids the fragmentation caused by inserting hundreds
    # of columns one at a time, and keeps dates missing from the first ticker.
    return pd.concat(prices, axis=1).sort_index()

In [None]:
def adf_test(series):
    """Run the Augmented Dickey-Fuller test on ``series`` and return its p-value."""
    # Element 1 of the adfuller result is the p-value.
    return adfuller(series)[1]

In [None]:
def cointegration_test(data):
    """Run a pairwise cointegration test over every pair of columns in ``data``.

    For each pair of columns the two series are stripped of missing values,
    aligned on their common index, passed to ``coint`` and the raw spread
    ``x - y`` is passed to ``adf_test``.

    Args:
        data: DataFrame with one price series per column.

    Returns:
        pandas.DataFrame: One row per tested pair with the columns
        'Stock 1', 'Stock 2', 'Cointegration Score', 'Cointegration p-value'
        and 'ADF p-value'. Pairs with no overlapping observations, or for
        which ``coint`` raises ``ValueError``, are skipped. If only the ADF
        test on the spread raises ``ValueError`` the pair is kept and its
        'ADF p-value' is NaN.
    """
    result_columns = ['Stock 1', 'Stock 2', 'Cointegration Score',
                      'Cointegration p-value', 'ADF p-value']

    tickers = list(data.columns)
    # Drop missing prices once per ticker rather than once per pair; with n
    # tickers the pair loop would otherwise repeat this work O(n^2) times.
    cleaned = [data.iloc[:, k].dropna() for k in range(len(tickers))]

    results = []
    for i in range(len(tickers)):
        if cleaned[i].empty:
            # Nothing to align against; every pair with this ticker is empty.
            continue
        for j in range(i + 1, len(tickers)):
            # Align the two series on the dates they have in common.
            x, y = cleaned[i].align(cleaned[j], join='inner')
            if x.empty or y.empty:
                continue

            try:
                score, p_value, _ = coint(x, y)
            except ValueError:
                # Skip pairs the cointegration test cannot handle.
                continue

            # NOTE(review): the spread uses a fixed 1:1 hedge ratio, which is
            # not the regression residual that `coint` tests internally.
            try:
                adf_p_value = adf_test(x - y)
            except ValueError:
                # A constant or too-short spread must not abort the whole
                # scan; keep the cointegration result and mark ADF as missing.
                adf_p_value = float('nan')

            results.append((tickers[i], tickers[j], score, p_value, adf_p_value))

    return pd.DataFrame(results, columns=result_columns)

In [None]:
# Get the list of S&P 500 tickers
ticks = get_sp500_tickers()

# Fetch historical data for all tickers
historical_data = get_historical_data(ticks)

# Perform cointegration test on stock pairs
results_df = cointegration_test(historical_data)

# Keep only pairs whose spread looks stationary (ADF p-value below 5%).
# NOTE: with this many pairs tested, some will pass the threshold by chance.
filtered_results = results_df[results_df['ADF p-value'] < 0.05]

# The cointegration score is a t-statistic: the more negative it is, the
# stronger the evidence for cointegration, so the best pairs sort first
# in ascending order.
sorted_results = filtered_results.sort_values(by='Cointegration Score', ascending=True)

# Show the downloaded price matrix (pandas truncates the repr)
print(historical_data)

# Display the top results
print(sorted_results.head(10))