In [8]:
import pandas as pd
from itertools import combinations
import numpy as np
import statsmodels.api as sm

from quant.utils.cointegraion import do_adf_regression, do_aeg_regression

# Константы
correlation_threshold = 0.90
probability_threshold = 90

open_threshold = 0.75
close_threshold = 0.25

tickers = pd.read_parquet('../../../../../data/sp500_stocks.parquet')
tickers.set_index('ticker', inplace=True)
tickers

Unnamed: 0_level_0,company,sector,industry
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MMM,3M,Industrials,Industrial Conglomerates
AOS,A. O. Smith,Industrials,Building Products
ABT,Abbott Laboratories,Health Care,Health Care Equipment
ABBV,AbbVie,Health Care,Biotechnology
ACN,Accenture,Information Technology,IT Consulting & Other Services
...,...,...,...
XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components
YUM,Yum! Brands,Consumer Discretionary,Restaurants
ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments
ZBH,Zimmer Biomet,Health Care,Health Care Equipment


In [9]:
history = pd.read_parquet('../../../../../data/sp500_2023-01-01_2025-01-01.parquet')['Adj Close']
live = pd.read_parquet('../../../../../data/sp500_2025-01-01_2025-07-01.parquet')['Adj Close']

history = pd.concat([history, live], axis=0)

training_start_date = '2023-01-01'
training_end_date = '2024-01-01'
training_set = history.loc[training_start_date:training_end_date].copy()

sample_start_date = '2024-01-02'
sample_end_date = '2025-01-01'
sample_set = history.loc[sample_start_date:sample_end_date].copy()

out_of_sample_start_date = '2025-01-02'
out_of_sample_end_date = '2025-07-01'
out_of_sample_set = history.loc[out_of_sample_start_date:out_of_sample_end_date].copy()

In [10]:
training_combs = pd.DataFrame(combinations(training_set.columns, 2), columns=['ticker1', 'ticker2'])
sample_combs = pd.DataFrame(combinations(sample_set.columns, 2), columns=['ticker1', 'ticker2'])
out_of_sample_combs = pd.DataFrame(combinations(out_of_sample_set.columns, 2), columns=['ticker1', 'ticker2'])
print(len(training_combs))

126253


In [11]:
training_combs['correlation'] = training_combs.apply(lambda x: np.corrcoef(training_set[x['ticker1']], training_set[x['ticker2']])[0, 1], axis=1)
training_combs = training_combs[training_combs['correlation'] > correlation_threshold]

sample_combs['correlation'] = sample_combs.apply(lambda x: np.corrcoef(sample_set[x['ticker1']], sample_set[x['ticker2']])[0, 1], axis=1)
sample_combs = sample_combs[sample_combs['correlation'] > correlation_threshold]

out_of_sample_combs['correlation'] = out_of_sample_combs.apply(lambda x: np.corrcoef(out_of_sample_set[x['ticker1']], out_of_sample_set[x['ticker2']])[0, 1], axis=1)
out_of_sample_combs = out_of_sample_combs[out_of_sample_combs['correlation'] > correlation_threshold]

def enrich_data(combs):
    combs[['ticker1_company', 'ticker1_sector']] = combs['ticker1'].apply(lambda x: pd.Series(tickers[tickers.index == x][['company', 'sector']].values[0]))
    combs[['ticker2_company', 'ticker2_sector']] = combs['ticker2'].apply(lambda x: pd.Series(tickers[tickers.index == x][['company', 'sector']].values[0]))
    combs.sort_values(by='correlation', ascending=False, inplace=True)
    combs = combs[['ticker1', 'ticker1_company', 'ticker1_sector', 'ticker2', 'ticker2_company', 'ticker2_sector', 'correlation']]


enrich_data(training_combs)
enrich_data(sample_combs)
enrich_data(out_of_sample_combs)

training_combs

Unnamed: 0,ticker1,ticker2,correlation,ticker1_company,ticker1_sector,ticker2_company,ticker2_sector
82888,GOOG,GOOGL,0.999787,Alphabet Inc. (Class C),Communication Services,Alphabet Inc. (Class A),Communication Services
113692,NWS,NWSA,0.996551,News Corp (Class B),Communication Services,News Corp (Class A),Communication Services
58497,DHI,LEN,0.991272,D. R. Horton,Consumer Discretionary,Lennar,Consumer Discretionary
77425,FOX,FOXA,0.986705,Fox Corporation (Class B),Communication Services,Fox Corporation (Class A),Communication Services
12032,AMAT,LRCX,0.983236,Applied Materials,Information Technology,Lam Research,Information Technology
...,...,...,...,...,...,...,...
7707,AFL,MCK,0.900066,Aflac,Financials,McKesson Corporation,Health Care
125828,VST,WELL,0.900055,Vistra Corp.,Utilities,Welltower,Real Estate
856,AAPL,PANW,0.900055,Apple Inc.,Information Technology,Palo Alto Networks,Information Technology
30207,BMY,ENPH,0.900046,Bristol Myers Squibb,Health Care,Enphase Energy,Information Technology


## Определяем коинтеграция по Augmented Dickey-Fuller

In [12]:
training_combs['adf_probability'] = training_combs.apply(lambda x: do_adf_regression(training_set, x['ticker1'], x['ticker2']), axis=1)
sample_combs['adf_probability'] = sample_combs.apply(lambda x: do_adf_regression(sample_set, x['ticker1'], x['ticker2']), axis=1)
out_of_sample_combs['adf_probability'] = out_of_sample_combs.apply(lambda x: do_adf_regression(out_of_sample_set, x['ticker1'], x['ticker2']), axis=1)


training_combs = training_combs[training_combs['adf_probability'].notna()]
sample_combs = sample_combs[sample_combs['adf_probability'].notna()]
out_of_sample_combs = out_of_sample_combs[out_of_sample_combs['adf_probability'].notna()]

training_combs = training_combs[training_combs['adf_probability'] > probability_threshold]
sample_combs = sample_combs[sample_combs['adf_probability'] > probability_threshold]
out_of_sample_combs = out_of_sample_combs[out_of_sample_combs['adf_probability'] > probability_threshold]

training_combs

Unnamed: 0,ticker1,ticker2,correlation,ticker1_company,ticker1_sector,ticker2_company,ticker2_sector,adf_probability
58497,DHI,LEN,0.991272,D. R. Horton,Consumer Discretionary,Lennar,Consumer Discretionary,99.0
12032,AMAT,LRCX,0.983236,Applied Materials,Information Technology,Lam Research,Information Technology,95.0
105661,MCO,SPGI,0.981452,Moody's Corporation,Financials,S&P Global,Financials,99.0
48650,CPRT,MSFT,0.981109,Copart,Industrials,Microsoft,Information Technology,99.0
31664,BRO,MMC,0.979469,Brown & Brown,Financials,Marsh McLennan,Financials,95.0
...,...,...,...,...,...,...,...,...
67803,ENPH,PFE,0.900153,Enphase Energy,Information Technology,Pfizer,Health Care,95.0
113111,NTRS,SBAC,0.900141,Northern Trust,Financials,SBA Communications,Real Estate,95.0
119723,RCL,SPGI,0.900113,Royal Caribbean Group,Consumer Discretionary,S&P Global,Financials,95.0
125828,VST,WELL,0.900055,Vistra Corp.,Utilities,Welltower,Real Estate,99.0


## Определяем коинтеграция по Augmented Engle-Granger

In [13]:
training_combs['aeg_probability'] = training_combs.apply(lambda x: do_aeg_regression(training_set, x['ticker1'], x['ticker2']), axis=1)
sample_combs['aeg_probability'] = sample_combs.apply(lambda x: do_aeg_regression(sample_set, x['ticker1'], x['ticker2']), axis=1)
out_of_sample_combs['aeg_probability'] = out_of_sample_combs.apply(lambda x: do_aeg_regression(out_of_sample_set, x['ticker1'], x['ticker2']), axis=1)

training_combs = training_combs[training_combs['aeg_probability'].notna()]
sample_combs = sample_combs[sample_combs['aeg_probability'].notna()]
out_of_sample_combs = out_of_sample_combs[out_of_sample_combs['aeg_probability'].notna()]

training_combs = training_combs[training_combs['aeg_probability'] > probability_threshold]
sample_combs = sample_combs[sample_combs['aeg_probability'] > probability_threshold]
out_of_sample_combs = out_of_sample_combs[out_of_sample_combs['aeg_probability'] > probability_threshold]

training_combs.sort_values(by=['aeg_probability', 'adf_probability', 'correlation'], ascending=False, inplace=True)
sample_combs.sort_values(by=['aeg_probability', 'adf_probability', 'correlation'], ascending=False, inplace=True)
out_of_sample_combs.sort_values(by=['aeg_probability', 'adf_probability', 'correlation'], ascending=False, inplace=True)

training_combs = training_combs[['ticker1', 'ticker2', 'correlation', 'adf_probability', 'aeg_probability', 'ticker1_company', 'ticker1_sector','ticker2_company', 'ticker2_sector']]
sample_combs = sample_combs[['ticker1', 'ticker2', 'correlation', 'adf_probability', 'aeg_probability', 'ticker1_company', 'ticker1_sector','ticker2_company', 'ticker2_sector']]
out_of_sample_combs = out_of_sample_combs[['ticker1', 'ticker2', 'correlation', 'adf_probability', 'aeg_probability', 'ticker1_company', 'ticker1_sector','ticker2_company', 'ticker2_sector']]

sample_combs

Unnamed: 0,ticker1,ticker2,correlation,adf_probability,aeg_probability,ticker1_company,ticker1_sector,ticker2_company,ticker2_sector
118013,PNC,RF,0.991726,99.0,99.0,PNC Financial Services,Financials,Regions Financial Corporation,Financials
39448,CFG,TFC,0.990103,99.0,99.0,Citizens Financial Group,Financials,Truist Financial,Financials
41412,CINF,NDAQ,0.986448,99.0,99.0,Cincinnati Financial,Financials,"Nasdaq, Inc.",Financials
5323,ADP,PAYX,0.985586,99.0,99.0,Automatic Data Processing,Industrials,Paychex,Industrials
48886,CPT,EQR,0.984645,99.0,99.0,Camden Property Trust,Real Estate,Equity Residential,Real Estate
...,...,...,...,...,...,...,...,...,...
84762,GWW,NTRS,0.902308,95.0,95.0,W. W. Grainger,Industrials,Northern Trust,Financials
62897,DVA,NRG,0.902042,95.0,95.0,DaVita,Health Care,NRG Energy,Utilities
84683,GWW,KEY,0.901237,95.0,95.0,W. W. Grainger,Industrials,KeyCorp,Financials
104663,MAR,TPR,0.900544,95.0,95.0,Marriott International,Consumer Discretionary,"Tapestry, Inc.",Consumer Discretionary


In [14]:
training_combs['comb'] = training_combs.apply(lambda x: f"{x['ticker1']}_{x['ticker2']}", axis=1)
sample_combs['comb'] = sample_combs.apply(lambda x: f"{x['ticker1']}_{x['ticker2']}", axis=1)
out_of_sample_combs['comb'] = out_of_sample_combs.apply(lambda x: f"{x['ticker1']}_{x['ticker2']}", axis=1)


tr_combs = training_combs['comb'].to_numpy()
sa_combs = sample_combs['comb'].to_numpy()
oos_combs = out_of_sample_combs['comb'].to_numpy()

temp = np.intersect1d(tr_combs, sa_combs)
intersection = np.intersect1d(temp, oos_combs)
intersection

array([], dtype=object)