In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import statsmodels as sm
from pathlib import Path
import yfinance as yf
from tqdm import tqdm

from correlation_helper import *

In [2]:
num_top = 50
num_vote = 20
corr_type = 'beta'
target = 'USGO'
interval = '3D'

company_path = Path(f'../data/top_{num_top}_companies_by_sector.csv').resolve()
corr_path = Path(f'../data/{corr_type}_matrix_2024-01-01_2025-01-01_top{num_top}.csv').resolve()

analysis_start = '2024-01-01'
analysis_end = '2024-12-31'

In [3]:
company = pd.read_csv(company_path)
corr = pd.read_csv(corr_path).set_index('Ticker')[[target]]
corr

Unnamed: 0_level_0,USGO
Ticker,Unnamed: 1_level_1
UFPI,-0.464907
HYMC,0.602544
ABAT,-0.188233
HYMCL,-0.257207
USAU,0.559888
...,...
INFN,-0.379417
CDZIP,0.369749
NNE,
NWE,-0.687845


In [4]:
corr_data = (corr
             .assign(abs_corr = corr[target].abs())
             .dropna()
             .sort_values(by='abs_corr', ascending=False)
             )
corr_data = corr_data[corr_data[target] < 1][:num_vote]
print(f'Number of positive: {len(corr_data[corr_data[target] > 0])}')
print(f'Number of negative: {len(corr_data[corr_data[target] < 0])}')
corr_data

Number of positive: 19
Number of negative: 1


Unnamed: 0_level_0,USGO,abs_corr
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
HOLX,0.949133,0.949133
CHRD,0.940426,0.940426
PCVX,0.929393,0.929393
HALO,0.928119,0.928119
PRDO,0.913779,0.913779
WSC,0.908697,0.908697
EXAS,0.908288,0.908288
AFRM,0.907148,0.907148
ARLP,0.902322,0.902322
TPG,0.885059,0.885059


In [13]:
corr_stock = yf.download(
    [target]+ corr_data.index.tolist(),
    start=analysis_end,
    ).dropna(axis=1, how='all')['Close']
corr_stock.index = pd.to_datetime(corr_stock.index)
corr_stock = corr_stock.resample(interval).last().dropna()
corr_stock

[*********************100%***********************]  21 of 21 completed


Ticker,AFRM,ARLP,CDZI,CHRD,CPRT,CSX,EWBC,EXAS,FYBR,HALO,...,HPKEW,INCY,KLXE,LFUS,PCVX,PNFP,PRDO,TPG,USGO,WSC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-31,62.509998,25.944275,5.07,118.889999,56.32,32.029392,93.919533,56.93,34.830002,48.009998,...,4.48,69.529999,5.44,234.054718,83.300003,113.304024,25.995575,63.20216,8.83,33.252388
2025-01-03,66.5,25.905275,5.12,120.949997,56.91,32.20866,94.933586,57.25,35.060001,47.740002,...,4.7,69.629997,5.28,238.17337,84.860001,115.200409,25.925936,65.124588,9.06,33.870941
2025-01-06,58.669998,25.768778,4.67,122.599998,56.779999,32.069229,94.794395,56.68,35.310001,53.759998,...,3.806,73.129997,5.11,236.378311,81.059998,115.270279,26.244289,64.490387,7.34,33.940777
2025-01-09,55.310001,26.509766,5.12,123.779999,55.639999,31.650934,92.965126,56.5,35.099998,52.799999,...,3.8,70.540001,5.21,234.473557,81.080002,112.0065,25.736916,61.834667,8.25,34.90852
2025-01-12,55.349998,26.412266,4.96,126.699997,56.220001,32.009472,97.935989,54.200001,35.529999,53.830002,...,4.27,71.93,5.63,229.975967,78.139999,117.74556,26.642233,62.52832,8.2,34.93845
2025-01-15,57.990002,27.163004,5.17,126.309998,56.900002,32.597076,100.212646,51.740002,35.639999,54.799999,...,4.01,71.93,6.65,232.528931,85.300003,119.68187,27.527655,66.422714,9.69,36.554676
2025-01-21,56.759998,28.157484,4.89,120.739998,57.610001,33.533257,102.896912,55.130001,35.790001,55.630001,...,4.01,72.529999,6.56,236.049225,89.974998,122.107239,27.955442,69.137894,8.929,38.988995
2025-01-24,55.73,28.508478,4.54,117.459999,57.700001,32.557236,102.330231,55.98,35.799999,55.77,...,3.834,72.589996,6.0,234.363861,89.639999,123.045456,28.164362,68.672157,9.14,38.769508
2025-01-27,58.950001,28.069736,4.79,116.769997,57.32,32.846062,102.648369,54.889999,35.799999,55.830002,...,3.4,73.720001,5.67,234.593246,88.669998,125.26123,28.03503,66.308762,9.18,37.113373
2025-01-30,61.07,27.981989,4.94,112.449997,57.93,32.736507,102.370003,56.049999,35.759998,56.639999,...,3.93,74.160004,5.29,237.704651,88.32,124.532616,28.631945,66.640724,9.26,36.973701


In [6]:
stock_diff = corr_stock.pct_change(axis=0).apply(lambda x: x >= 0).dropna()
stock_diff.head()

Ticker,AFRM,ARLP,CDZI,CHRD,CPRT,CSX,EWBC,EXAS,FYBR,HALO,...,HPKEW,INCY,KLXE,LFUS,PCVX,PNFP,PRDO,TPG,USGO,WSC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-31,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2025-01-03,True,False,True,True,True,True,True,True,True,False,...,True,True,False,True,True,True,False,True,True,True
2025-01-06,False,False,False,True,False,False,False,False,True,True,...,False,True,False,False,False,True,True,False,False,True
2025-01-09,False,True,True,True,False,False,False,False,False,False,...,False,False,True,False,True,False,False,False,True,True
2025-01-12,True,False,False,True,True,True,True,False,True,True,...,True,True,True,False,False,True,True,True,False,True


In [7]:
target_trend = stock_diff[target]
relate_vote = stock_diff.drop(columns=[target])
relate_vote = relate_vote.transform(lambda col: col if corr[target][col.name] > 0 else -col, axis=0)
relate_vote.head()

Ticker,AFRM,ARLP,CDZI,CHRD,CPRT,CSX,EWBC,EXAS,FYBR,HALO,HOLX,HPKEW,INCY,KLXE,LFUS,PCVX,PNFP,PRDO,TPG,WSC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2024-12-31,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False
2025-01-03,True,False,True,True,True,True,True,True,True,False,True,False,True,False,True,True,True,False,True,True
2025-01-06,False,False,False,True,False,False,False,False,True,True,False,True,True,False,False,False,True,True,False,True
2025-01-09,False,True,True,True,False,False,False,False,False,False,False,True,False,True,False,True,False,False,False,True
2025-01-12,True,False,False,True,True,True,True,False,True,True,False,False,True,True,False,False,True,True,True,True


In [8]:
votes = (relate_vote.mean(axis=1) - 0.5) * 200
votes = votes.rename('votes')
votes

Date
2024-12-31   -90.0
2025-01-03    50.0
2025-01-06   -20.0
2025-01-09   -30.0
2025-01-12    30.0
2025-01-15    70.0
2025-01-21    50.0
2025-01-24     0.0
2025-01-27    10.0
2025-01-30     0.0
2025-02-02   -40.0
2025-02-05   -20.0
2025-02-08    20.0
2025-02-11   -20.0
2025-02-14     0.0
2025-02-17     0.0
2025-02-20   -60.0
2025-02-23   -30.0
2025-02-26    10.0
2025-03-01   -70.0
2025-03-04   -50.0
Name: votes, dtype: float64

In [9]:
results = pd.concat([target_trend, votes], axis=1)
results = results.assign(power = votes.abs())
# for t in [20]:
for t in range(10, 91, 5):
    direction = results['votes'] > 0
    is_vote = (results[target] == direction) & (results['power'] >= t)
    # is_vote = (results['power'] > t)
    guess = results[target][is_vote]

    # results[f'direction_{t}'] = direction
    # results[f'is_vote_{t}'] = is_vote
    results[f'guess_{t}'] = guess
    results[f'shift_{t}'] = results[f'guess_{t}'].shift(1)
    results[f'out_{t}'] = results[f'shift_{t}'] == results[target]
    results[f'out_{t}'] = results[f'out_{t}'][~results[f'shift_{t}'].isna()]
    results[f'inc_{t}'] = results[f'out_{t}'][~results[f'shift_{t}'].isna()][results[f'shift_{t}'] == True]
    results[f'dec_{t}'] = results[f'out_{t}'][~results[f'shift_{t}'].isna()][results[f'shift_{t}'] == False]
    results = results.drop(columns = [f'guess_{t}', f'shift_{t}'])
results

Unnamed: 0_level_0,USGO,votes,power,out_10,inc_10,dec_10,out_15,inc_15,dec_15,out_20,...,dec_75,out_80,inc_80,dec_80,out_85,inc_85,dec_85,out_90,inc_90,dec_90
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-12-31,False,-90.0,90.0,,,,,,,,...,,,,,,,,,,
2025-01-03,True,50.0,50.0,False,,False,False,,False,False,...,False,False,,False,False,,False,False,,False
2025-01-06,False,-20.0,20.0,False,False,,False,False,,False,...,,,,,,,,,,
2025-01-09,True,-30.0,30.0,False,,False,False,,False,,...,,,,,,,,,,
2025-01-12,False,30.0,30.0,,,,,,,,...,,,,,,,,,,
2025-01-15,True,70.0,70.0,,,,,,,,...,,,,,,,,,,
2025-01-21,False,50.0,50.0,False,False,,False,False,,False,...,,,,,,,,,,
2025-01-24,True,0.0,0.0,,,,,,,,...,,,,,,,,,,
2025-01-27,True,10.0,10.0,,,,,,,,...,,,,,,,,,,
2025-01-30,True,0.0,0.0,True,True,,,,,,...,,,,,,,,,,


In [10]:
final = pd.concat([
    results.loc[:, results.columns.str.contains('out')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('inc')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('dec')].mean(axis=0).reset_index(drop=True),
], axis=1, ignore_index=True)
final.columns = ['out', 'inc', 'dec']
final.index = results.loc[:, results.columns.str.contains('out')].mean(axis=0).index.str.replace('out_', '')
final


Unnamed: 0,out,inc,dec
10,0.5,0.5,0.5
15,0.444444,0.333333,0.5
20,0.5,0.0,0.75
25,0.5,0.0,0.75
30,0.5,0.0,0.75
35,0.4,0.0,0.666667
40,0.4,0.0,0.666667
45,0.4,0.0,0.666667
50,0.4,0.0,0.666667
55,0.5,0.0,0.666667


In [11]:
decrease_power = -10
increase_power = 50

results = pd.concat([target_trend, votes], axis=1)
results = results.assign(power = votes.abs())

direction = results['votes'] > 0
is_vote = (
    (results[target] == False) & (results['votes'] <= decrease_power) |
    (results[target] == True) & (results['votes'] >= increase_power)
)
# is_vote = (results[target] == direction) & (results['power'] > t)
guess = results[target][is_vote]

# results[f'direction_{t}'] = direction
# results[f'is_vote_{t}'] = is_vote
results[f'guess'] = guess
results[f'shift'] = results[f'guess'].shift(1)
results[f'out'] = results[f'shift'] == results[target]
results[f'out'] = results[f'out'][~results[f'shift'].isna()]
results[f'inc'] = results[f'out'][results[f'shift'] == True]
results[f'dec'] = results[f'out'][results[f'shift'] == False]
results = results.drop(columns = [f'guess', f'shift'])
results

Unnamed: 0_level_0,USGO,votes,power,out,inc,dec
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2024-12-31,False,-90.0,90.0,,,
2025-01-03,True,50.0,50.0,False,,False
2025-01-06,False,-20.0,20.0,False,False,
2025-01-09,True,-30.0,30.0,False,,False
2025-01-12,False,30.0,30.0,,,
2025-01-15,True,70.0,70.0,,,
2025-01-21,False,50.0,50.0,False,False,
2025-01-24,True,0.0,0.0,,,
2025-01-27,True,10.0,10.0,,,
2025-01-30,True,0.0,0.0,,,


In [12]:
final = pd.concat([
    results.loc[:, results.columns.str.contains('out')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('inc')].mean(axis=0).reset_index(drop=True),
    results.loc[:, results.columns.str.contains('dec')].mean(axis=0).reset_index(drop=True),
], axis=1, ignore_index=True)
final.columns = ['out', 'inc', 'dec']
final.index = results.loc[:, results.columns.str.contains('out')].mean(axis=0).index.str.replace('out_', '')
final


Unnamed: 0,out,inc,dec
out,0.375,0.0,0.5
