In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels as sm
from pathlib import Path
import yfinance as yf
from tqdm import tqdm

from correlation_helper import *

In [2]:
num_top = 50
num_vote = 20
corr_type = 'beta'
target = 'AAPL'
interval = '3D'

company_path = Path(f'../data/top_{num_top}_companies_by_sector.csv').resolve()
corr_path = Path(f'../data/{corr_type}_matrix_2024-01-01_2025-01-01_top{num_top}.csv').resolve()

analysis_start = '2024-01-01'
analysis_end = '2024-12-31'

In [3]:
company = pd.read_csv(company_path)
corr = pd.read_csv(corr_path).set_index('Ticker')[[target]]
corr

Unnamed: 0_level_0,AAPL
Ticker,Unnamed: 1_level_1
UFPI,-0.034487
HYMC,0.269600
ABAT,0.849086
HYMCL,-0.385371
USAU,0.761102
...,...
INFN,0.690865
CDZIP,0.576359
NNE,
NWE,-0.184580


In [4]:
corr_data = (corr
             .assign(abs_corr = corr[target].abs())
             .dropna()
             .sort_values(by='abs_corr', ascending=False)
             )
corr_data = corr_data[corr_data[target] < 1][:num_vote]
print(f'Number of positive: {len(corr_data[corr_data[target] > 0])}')
print(f'Number of negative: {len(corr_data[corr_data[target] < 0])}')
corr_data

Number of positive: 20
Number of negative: 0


Unnamed: 0_level_0,AAPL,abs_corr
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
DDOG,0.980256,0.980256
GFS,0.980213,0.980213
OPCH,0.976043,0.976043
LRCX,0.975052,0.975052
HST,0.974054,0.974054
WING,0.970883,0.970883
AVGO,0.970879,0.970879
MKSI,0.968285,0.968285
MSFT,0.966605,0.966605
FFIV,0.965774,0.965774


In [5]:
corr_stock = yf.download(
    [target]+ corr_data.index.tolist(),
    start=analysis_end,
    ).dropna(axis=1, how='all')['Close']
corr_stock.index = pd.to_datetime(corr_stock.index)
corr_stock = corr_stock.resample(interval).last().dropna()
corr_stock

YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  21 of 21 completed


Ticker,AAPL,AVGO,DDOG,FFIV,GFS,HST,KLAC,LLYVK,LRCX,MCHP,...,MKSI,MSFT,OPCH,POOL,REG,ROAD,ROIC,SBRA,TXN,WING
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-31,243.582199,231.979996,143.630005,251.75,42.48,17.200001,635.185181,68.239998,72.222038,56.456219,...,105.793739,417.742371,22.700001,332.869995,72.209999,87.82,17.360001,16.743345,185.577667,292.059998
2025-01-03,243.092728,232.550003,144.830002,254.360001,43.18,17.450001,655.529236,68.040001,74.884003,57.24033,...,108.068878,422.502838,23.57,337.269989,73.029999,91.580002,17.379999,16.94957,189.111542,298.01001
2025-01-06,242.433441,229.309998,141.880005,258.839996,41.619999,16.959999,684.653503,67.5,76.808197,55.959946,...,110.922775,423.710419,24.139999,324.790009,70.150002,90.269997,17.389999,16.871008,190.253098,278.470001
2025-01-09,236.589874,224.309998,140.419998,254.320007,40.959999,17.08,681.520508,66.480003,74.425385,54.778809,...,108.837234,418.111633,23.99,326.399994,68.830002,87.120003,17.35,16.29162,188.754181,274.690002
2025-01-12,233.023788,224.699997,139.160004,258.109985,41.169998,17.280001,699.679504,68.519997,74.834152,56.456219,...,110.872879,414.838226,28.719999,339.679993,70.660004,86.980003,17.379999,16.497841,192.07959,273.440002
2025-01-15,229.727417,237.440002,138.399994,263.950012,43.099998,17.059999,755.762817,70.290001,80.058388,57.538097,...,119.853676,428.171478,29.16,350.26001,71.220001,90.230003,17.450001,16.635324,191.007523,272.920013
2025-01-21,223.414368,240.279999,140.199997,272.910004,41.669998,17.43,759.234985,71.339996,80.985588,59.126175,...,119.424591,445.816071,30.0,362.059998,71.07,82.580002,17.450001,16.448742,199.137405,288.529999
2025-01-24,222.535324,244.699997,140.990005,272.98999,41.669998,16.940001,748.130066,72.019997,79.440254,55.969868,...,117.249245,443.171387,30.33,353.73999,71.769997,84.459999,17.440001,16.615685,184.158173,279.829987
2025-01-27,239.097122,206.350006,145.720001,300.459991,41.380001,16.59,711.363098,73.040001,74.923882,55.612549,...,108.218559,441.444824,30.950001,344.700012,71.330002,79.309998,17.450001,16.281799,178.946716,303.700012
2025-01-30,235.740814,221.270004,142.710007,297.26001,41.470001,16.709999,736.576172,73.589996,80.806129,53.895439,...,113.038246,414.229431,30.92,344.25,71.839996,80.400002,17.469999,16.40946,184.610001,297.899994


In [6]:
stock_diff = corr_stock.pct_change(axis=0).apply(lambda x: x >= 0).dropna()
stock_diff.head()

Ticker,AAPL,AVGO,DDOG,FFIV,GFS,HST,KLAC,LLYVK,LRCX,MCHP,...,MKSI,MSFT,OPCH,POOL,REG,ROAD,ROIC,SBRA,TXN,WING
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-31,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2025-01-03,False,True,True,True,True,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True
2025-01-06,False,False,False,True,False,False,True,False,True,False,...,True,True,True,False,False,False,True,False,True,False
2025-01-09,False,False,False,False,False,True,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
2025-01-12,False,True,False,True,True,True,True,True,True,True,...,True,False,True,True,True,False,True,True,True,False


In [7]:
target_trend = stock_diff[target]
relate_vote = stock_diff.drop(columns=[target])
relate_vote = relate_vote.transform(lambda col: col if corr[target][col.name] > 0 else -col, axis=0)
relate_vote.head()

Ticker,AVGO,DDOG,FFIV,GFS,HST,KLAC,LLYVK,LRCX,MCHP,MIDD,MKSI,MSFT,OPCH,POOL,REG,ROAD,ROIC,SBRA,TXN,WING
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-12-31,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2025-01-03,True,True,True,True,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True
2025-01-06,False,False,True,False,False,True,False,True,False,True,True,True,True,False,False,False,True,False,True,False
2025-01-09,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False
2025-01-12,True,False,True,True,True,True,True,True,True,True,True,False,True,True,True,False,True,True,True,False


In [8]:
votes = (relate_vote.mean(axis=1) - 0.5) * 200
votes = votes.rename('votes')
votes

Date
2024-12-31   -100.0
2025-01-03     90.0
2025-01-06    -10.0
2025-01-09    -80.0
2025-01-12     60.0
2025-01-15     60.0
2025-01-21     50.0
2025-01-24      0.0
2025-01-27    -30.0
2025-01-30     30.0
2025-02-02    -10.0
2025-02-05    -10.0
2025-02-08     50.0
2025-02-11     30.0
Name: votes, dtype: float64

In [9]:
results = pd.concat([target_trend, votes], axis=1)
results = results.assign(power = votes.abs())
# for t in [20]:
for t in range(10, 91, 5):
    direction = results['votes'] > 0
    is_vote = (results[target] == direction) & (results['power'] >= t)
    # is_vote = (results['power'] > t)
    guess = results[target][is_vote]

    # results[f'direction_{t}'] = direction
    # results[f'is_vote_{t}'] = is_vote
    results[f'guess_{t}'] = guess
    results[f'shift_{t}'] = results[f'guess_{t}'].shift(1)
    results[f'out_{t}'] = results[f'shift_{t}'] == results[target]
    results[f'out_{t}'] = results[f'out_{t}'][~results[f'shift_{t}'].isna()]
    results[f'inc_{t}'] = results[f'out_{t}'][~results[f'shift_{t}'].isna()][results[f'shift_{t}'] == True]
    results[f'dec_{t}'] = results[f'out_{t}'][~results[f'shift_{t}'].isna()][results[f'shift_{t}'] == False]
    results = results.drop(columns = [f'guess_{t}', f'shift_{t}'])
results

Unnamed: 0_level_0,AAPL,votes,power,out_10,inc_10,dec_10,out_15,inc_15,dec_15,out_20,...,dec_75,out_80,inc_80,dec_80,out_85,inc_85,dec_85,out_90,inc_90,dec_90
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-31,False,-100.0,100.0,,,,,,,,...,,,,,,,,,,
2025-01-03,False,90.0,90.0,True,,True,True,,True,True,...,True,True,,True,True,,True,True,,True
2025-01-06,False,-10.0,10.0,,,,,,,,...,,,,,,,,,,
2025-01-09,False,-80.0,80.0,,,,,,,,...,,,,,,,,,,
2025-01-12,False,60.0,60.0,True,,True,True,,True,True,...,True,True,,True,,,,,,
2025-01-15,False,60.0,60.0,,,,,,,,...,,,,,,,,,,
2025-01-21,False,50.0,50.0,,,,,,,,...,,,,,,,,,,
2025-01-24,False,0.0,0.0,,,,,,,,...,,,,,,,,,,
2025-01-27,True,-30.0,30.0,,,,,,,,...,,,,,,,,,,
2025-01-30,False,30.0,30.0,,,,,,,,...,,,,,,,,,,


In [10]:
final = pd.concat([
    results.loc[:, results.columns.str.contains('out')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('inc')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('dec')].mean(axis=0).reset_index(drop=True),
], axis=1, ignore_index=True)
final.columns = ['out', 'inc', 'dec']
final.index = results.loc[:, results.columns.str.contains('out')].mean(axis=0).index.str.replace('out_', '')
final


Unnamed: 0,out,inc,dec
10,1.0,1.0,1.0
15,1.0,1.0,1.0
20,1.0,1.0,1.0
25,1.0,1.0,1.0
30,1.0,1.0,1.0
35,1.0,1.0,1.0
40,1.0,1.0,1.0
45,1.0,1.0,1.0
50,1.0,1.0,1.0
55,1.0,,1.0


In [11]:
decrease_power = -10
increase_power = 50

results = pd.concat([target_trend, votes], axis=1)
results = results.assign(power = votes.abs())

direction = results['votes'] > 0
is_vote = (
    (results[target] == False) & (results['votes'] <= decrease_power) |
    (results[target] == True) & (results['votes'] >= increase_power)
)
# is_vote = (results[target] == direction) & (results['power'] > t)
guess = results[target][is_vote]

# results[f'direction_{t}'] = direction
# results[f'is_vote_{t}'] = is_vote
results[f'guess'] = guess
results[f'shift'] = results[f'guess'].shift(1)
results[f'out'] = results[f'shift'] == results[target]
results[f'out'] = results[f'out'][~results[f'shift'].isna()]
results[f'inc'] = results[f'out'][results[f'shift'] == True]
results[f'dec'] = results[f'out'][results[f'shift'] == False]
results = results.drop(columns = [f'guess', f'shift'])
results

Unnamed: 0_level_0,AAPL,votes,power,out,inc,dec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-12-31,False,-100.0,100.0,,,
2025-01-03,False,90.0,90.0,True,,True
2025-01-06,False,-10.0,10.0,,,
2025-01-09,False,-80.0,80.0,,,
2025-01-12,False,60.0,60.0,True,,True
2025-01-15,False,60.0,60.0,,,
2025-01-21,False,50.0,50.0,,,
2025-01-24,False,0.0,0.0,,,
2025-01-27,True,-30.0,30.0,,,
2025-01-30,False,30.0,30.0,,,


In [12]:
final = pd.concat([
    results.loc[:, results.columns.str.contains('out')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('inc')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('dec')].mean(axis=0).reset_index(drop=True),
], axis=1, ignore_index=True)
final.columns = ['out', 'inc', 'dec']
final.index = results.loc[:, results.columns.str.contains('out')].mean(axis=0).index.str.replace('out_', '')
final


Unnamed: 0,out,inc,dec
out,1.0,1.0,1.0
