In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels as sm
from pathlib import Path
import yfinance as yf
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

from correlation_helper import *

In [6]:
# ---- Configuration: analysis window, benchmark symbol and input files ----
analysis_start = "2024-01-01"
analysis_end = "2025-01-01"
market_ticker = '^GSPC'  # symbol downloaded below as the market benchmark
data_path1 = Path('../data/pearson_matrix_2024-01-01_2025-01-01_top20.csv').resolve()
data_path2 = Path('../data/top_20_companies_by_sector.csv').resolve()

# ---- Load the CSV inputs ----
data1 = pd.read_csv(data_path1)  # Pearson correlation matrix
data2 = pd.read_csv(data_path2)  # Top 20 companies by sector (read below via 'Symbol' and 'Sector')

# ---- Download closing prices for every listed symbol and for the benchmark ----
# Both requests share the same date window, so the keyword arguments are built once.
download_window = {"start": analysis_start, "end": analysis_end}
stock_symbols = data2['Symbol'].tolist()
stock_data = yf.download(stock_symbols, **download_window)['Close']
market_data = yf.download(market_ticker, **download_window)['Close']
print(stock_data)

# ---- Benchmark returns: flatten closes to 1-D, then pct_change -> standardize ----
# pct_change and standardize come from the star import of correlation_helper.
market_close = market_data.to_numpy().flatten()
market_returns = standardize(pct_change(market_close))

# ---- Sector grouping used by the following cells ----
sector_groups = data2.groupby('Sector')
sector_results = {}


[*********************100%***********************]  233 of 233 completed

6 Failed downloads:
['MVSTW', 'USGOW', 'NIOBW', 'DHCNL', 'HYMCW']: YFPricesMissingError('possibly delisted; no price data found  (1d 2024-01-01 -> 2025-01-01)')
['SFD']: YFPricesMissingError('possibly delisted; no price data found  (1d 2024-01-01 -> 2025-01-01) (Yahoo error = "Data doesn\'t exist for startDate = 1704085200, endDate = 1735707600")')
[*********************100%***********************]  1 of 1 completed

Ticker            AAPL   ABAT        ABNB  ACTG        ADBE         ADP  ADTN  \
Date                                                                            
2024-01-02  184.532089  4.405  134.479996  3.92  580.070007  228.405411  7.43   
2024-01-03  183.150391  4.230  133.419998  3.89  571.789978  227.514969  7.14   
2024-01-04  180.824356  4.250  133.720001  3.82  567.049988  228.640259  7.22   
2024-01-05  180.098694  4.150  135.979996  3.84  564.599976  230.156967  7.16   
2024-01-08  184.452560  4.120  140.080002  3.89  580.549988  231.673645  7.39   
...                ...    ...         ...   ...         ...         ...   ...   
2024-12-24  257.916443  2.600  134.990005  4.51  447.940002  296.459991  8.40   
2024-12-26  258.735504  3.140  135.320007  4.44  450.160004  297.230011  8.49   
2024-12-27  255.309296  3.360  133.384995  4.37  446.480011  296.179993  8.21   
2024-12-30  251.923019  2.710  131.809998  4.33  445.799988  292.970001  8.30   
2024-12-31  250.144974  2.46




In [70]:
# Build one DataFrame of standardized returns per sector, keyed by sector name.
standardized_stock = {}

# Each per-sector frame has one row fewer than the price table
# (presumably because pct_change drops the first observation -- TODO confirm in correlation_helper).
n_return_rows = len(stock_data) - 1

for sector, group in sector_groups:
    sector_symbols = group['Symbol'].tolist()

    # Start from an empty frame whose columns are this sector's symbols;
    # every column is overwritten in the loop below.
    sector_returns = pd.DataFrame(index=range(n_return_rows), columns=group['Symbol'])

    for symbol in sector_symbols:
        closes = stock_data[symbol].to_numpy()
        sector_returns[symbol] = standardize(pct_change(closes))

    standardized_stock[sector] = sector_returns

print(standardized_stock)



    

{'Basic Materials': Symbol      UFPI      AMWD       IPX        EU      USGO      USAU        NB  \
0      -1.734524 -1.364095  0.156557 -0.787688 -0.934298  0.064911 -0.866952   
1      -0.262326 -0.098358 -0.298612  0.564268  0.219462 -0.239840 -0.034828   
2      -0.104754 -0.074218  0.551757 -0.793834 -0.289118 -0.731273 -0.349192   
3       0.970102  0.547350 -0.378645  0.487401 -0.036632 -0.875127  0.348360   
4      -0.752111 -0.203908 -0.575575  1.441092  0.137512  0.982056  0.153876   
..           ...       ...       ...       ...       ...       ...       ...   
246     0.300438  0.363428  0.444845  0.088795  0.315158 -0.298592  0.829893   
247     0.132880  0.339287  0.845696  0.625784 -0.009775  1.444438  0.027794   
248    -0.736501 -0.643153  1.164504 -0.264220 -0.181034 -0.172414 -0.741566   
249    -0.003244 -0.438085  3.801940 -0.532018  0.088148 -1.521543  0.896735   
250     0.203449 -0.203685 -2.723669  0.269205 -0.181202 -0.874232  0.986233   

Symbol      ABAT   

In [None]:
# For every sector frame, build three symmetric stock-by-stock matrices
# (pearson, beta, r_squared) and store each one in a dictionary keyed by sector name.
pearson_dict, beta_dict, r_squared_dict = {}, {}, {}

for name, sector in tqdm(standardized_stock.items()):
    n_stocks = len(sector.columns)

    # Three independent identity-initialised frames: the diagonal stays at 1,
    # off-diagonal cells are filled pair by pair below.
    pearson, beta, r_squared = (
        pd.DataFrame(np.eye(n_stocks), columns=sector.columns, index=sector.columns)
        for _ in range(3)
    )

    for i in tqdm(range(n_stocks)):
        # Only the upper triangle is visited; each result is mirrored to (j, i).
        for j in range(i+1, n_stocks):
            # Get clean paired returns
            clean_returns1, clean_returns2 = remove_outliers(sector.iloc[:, i], sector.iloc[:, j])

            if clean_returns1.shape[0] < 230:
                # Fewer than 230 paired observations remain: mark the pair as missing.
                # NOTE(review): 230 is an unexplained threshold -- confirm the intended minimum.
                p_corr = b_corr = r2_corr = np.nan
            else:
                # Calculate all metrics at once.
                # NOTE(review): the meaning of the trailing 60 is defined by
                # compute_correlation_metrics (not visible here). market_returns is not
                # passed through remove_outliers -- confirm the helper handles any
                # length/alignment difference.
                p_corr, b_corr, r2_corr = compute_correlation_metrics(
                    clean_returns1, clean_returns2, market_returns, 60)

            # Fill both sides of symmetric matrices
            pearson.iloc[i, j] = pearson.iloc[j, i] = p_corr
            beta.iloc[i, j] = beta.iloc[j, i] = b_corr
            r_squared.iloc[i, j] = r_squared.iloc[j, i] = r2_corr

    pearson_dict[name] = pearson
    beta_dict[name] = beta
    r_squared_dict[name] = r_squared




100%|██████████| 13/13 [00:01<00:00, 10.81it/s]
100%|██████████| 20/20 [00:04<00:00,  4.35it/s]
100%|██████████| 20/20 [00:04<00:00,  4.59it/s]
100%|██████████| 20/20 [00:04<00:00,  4.27it/s]
100%|██████████| 20/20 [00:04<00:00,  4.29it/s]
100%|██████████| 20/20 [00:04<00:00,  4.12it/s]
 50%|█████     | 6/12 [00:24<00:27,  4.50s/it]

In [None]:
# Print the entry stored under the 'Technology' key of pearson_dict.
# NOTE(review): the saved output for this cell is a single float, whereas the visible
# code that fills pearson_dict stores a DataFrame per key -- the output looks stale
# (possibly from an earlier kernel state); re-run after the matrix cell completes to confirm.
print(pearson_dict['Technology'])

0.4402197443899758
