In [1]:
import numpy as np
from numpy.linalg import norm
import pandas as pd
import itertools
import random
from matplotlib import pyplot as plt
from pair_trading_foundations.data_generation import ExecutePairTrading, generate_training_data
random.seed(23)
import cProfile
import pstats
import pickle
import plotly.express as px
from time import time
import warnings
warnings.filterwarnings('ignore')

def chunker(seq, size):
    # split a list into chunks
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# data = pd.read_csv('Data/sp500_full_20181231_to_20231229.csv')
data = pd.read_csv('Data/sp500_full_19991201_to_20231231.csv')
value_count_tb = data[['Ticker']].groupby('Ticker').size().reset_index()
value_count_tb.columns = ['Ticker', 'Count']
stock_to_keep = value_count_tb['Ticker'][value_count_tb.Count==value_count_tb.Count.max()]
data = data[data.Ticker.isin(stock_to_keep)]

# Generate for all pairs

In [4]:
tickers = list(set(data.Ticker.values))
combinations = list(itertools.combinations(tickers, 2))
len(combinations)
batches = list(chunker(combinations, 1000))

In [5]:
len(data)

2169122

In [79]:
from importlib import reload
import pair_trading_foundations.data_generation
reload(pair_trading_foundations.data_generation)
from pair_trading_foundations.data_generation import ExecutePairTrading, generate_training_data

i = 0
for batch in batches:
    start_ts=time()
    print(f'Getting {i+1}th out of {len(batches)} batches')
    features_tb, labels_tb = generate_training_data(
        data=data,
        moving_average=50,
        training_len=300,
        test_len=500,
        entry_signal=3,
        exit_signal=1.5,
        calculate_label=True,
        verbose=False,
        combinations=batch
    )
    combined = pd.merge(features_tb, labels_tb, how='left', on=['Date', 'Ticker_P1','Ticker_P2']).reset_index(drop=True)
    combined = combined[combined.pnls.notnull()].reset_index(drop=True)
    combined.to_csv(f'Data/Training/pair_features{i+1}_300_120.csv', index=False)
    end_ts = time()
    print(f"Took {end_ts - start_ts} seconds")
    i+=1
    break
    
len(combined)
combined

Getting 1th out of 64 batches
1000 stock pairs detected
Took 0.4914987087249756 to initilize. Entering ticker pair loop
Max combination = 1
Took 16.087382793426514 to finish
Took 16.172106742858887 seconds


Unnamed: 0,Ticker_P1,Date,High_P1,Low_P1,Volume_P1,Close_P1,Ticker_P2,High_P2,Low_P2,Volume_P2,Close_P2,same_sector_flag,same_sub_industry_flag,abs_spread,abs_spread_mean,abs_spread_std,abs_spread_mean_MA,abs_spread_std_MA,pnls
0,WELL,2001-02-08 00:00:00,18.850000,18.709999,61000.0,5.180556,JBHT,4.796875,4.656250,348400.0,3.770645,False,False,1.409912,1.296872,0.508011,1.228968,0.342294,0.155197
1,WELL,2001-02-09 00:00:00,18.900000,18.620001,78500.0,5.128313,JBHT,4.781250,4.656250,1077600.0,3.859216,False,False,1.269097,1.297921,0.507626,1.217006,0.329757,0.088307
2,WELL,2001-02-12 00:00:00,18.910000,18.639999,76700.0,5.142061,JBHT,4.765625,4.515625,561600.0,3.859216,False,False,1.282845,1.299126,0.507158,1.204147,0.313696,0.055565
3,WELL,2001-02-13 00:00:00,18.980000,18.650000,58600.0,5.197054,JBHT,4.812500,4.722656,1248400.0,3.868706,False,False,1.328349,1.300767,0.506450,1.193864,0.300493,0.032560
4,WELL,2001-02-14 00:00:00,19.000000,18.870001,75600.0,5.221804,JBHT,4.812500,4.656250,446400.0,3.783297,False,False,1.438506,1.302190,0.506235,1.186129,0.288654,0.044420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5254,WELL,2021-12-28 00:00:00,84.849998,83.610001,1224800.0,79.186035,JBHT,204.380005,202.080002,260300.0,199.752716,False,False,120.566681,90.672100,13.757516,114.376366,3.095849,0.043831
5255,WELL,2021-12-29 00:00:00,85.330002,83.339996,1219600.0,79.541161,JBHT,206.839996,203.149994,243000.0,201.673889,False,False,122.132729,90.842301,13.829883,114.665619,3.131907,0.029754
5256,WELL,2021-12-30 00:00:00,86.349998,84.930000,1380000.0,80.279480,JBHT,205.789993,203.279999,203900.0,199.693909,False,False,119.414429,90.995508,13.891040,114.870229,3.100501,0.029754
5257,WELL,2021-12-31 00:00:00,86.690002,85.580002,1301100.0,80.157967,JBHT,205.470001,201.490005,239800.0,200.350616,False,False,120.192650,91.148429,13.959162,115.026271,3.168738,0.029754


In [69]:
labels_tb
labels_tb.pnls.describe()

count    5259.000000
mean        0.040240
std         0.045568
min        -0.270019
25%         0.018242
50%         0.031239
75%         0.051036
max         0.293395
Name: pnls, dtype: float64

In [57]:
len(labels_tb)
labels_tb = labels_tb.dropna()
labels_tb

Unnamed: 0,Date,Ticker_P1,Ticker_P2,pnls
300,2001-02-08 00:00:00,WELL,JBHT,0.214885
301,2001-02-09 00:00:00,WELL,JBHT,0.088307
302,2001-02-12 00:00:00,WELL,JBHT,0.055565
303,2001-02-13 00:00:00,WELL,JBHT,0.032560
304,2001-02-14 00:00:00,WELL,JBHT,0.044420
...,...,...,...,...
5554,2021-12-28 00:00:00,WELL,JBHT,0.029754
5555,2021-12-29 00:00:00,WELL,JBHT,0.029754
5556,2021-12-30 00:00:00,WELL,JBHT,0.029754
5557,2021-12-31 00:00:00,WELL,JBHT,0.029754


In [None]:
data.head(5)

# Generate data for sampled pairs

In [None]:
sampled_tickers = random.sample(list(stock_to_keep.values), 50)
# data_tech = data[data['GICS Sector'].isin(['Information Technology'])]
data_sampled = data[data['Ticker'].isin(sampled_tickers)]

In [None]:
features_tb, labels_tb, pnl_metadata_tb = generate_training_data(
        data=data_sampled,
        training_len=300,
        test_len=20,
        calculate_label=True,
        verbose=False
    )

# Write data out

In [None]:
spy_df = pd.read_csv('Data/Training/1999-12-01-2023-12-31_SPY.csv')
spy_df = spy_df[['Date','Adj Close']]
spy_df.columns = ['Date','SPY_Close']

look_forward_d = 20
# Define a variable to calculate the return if we just buy SPY and sell in the next 60 days
spy_return = []
for i in range(spy_df.shape[0]):
    if (i + look_forward_d) < spy_df.shape[0]:
        spy_return.append(
            100*(spy_df.loc[i+look_forward_d]['SPY_Close'] - spy_df.loc[i]['SPY_Close'])/spy_df.loc[i]['SPY_Close']
        )
    else:
        spy_return.append(
            np.nan
        )
spy_df['SPY_return'] = spy_return

In [None]:
combined = pd.merge(features_tb, labels_tb, how='left', on=['Date', 'Ticker_P1','Ticker_P2']).reset_index(drop=True)
combined = pd.merge(combined, spy_df[['Date','SPY_return']], how='left', on='Date')
combined['pnls'] = combined.pnls * 100
combined['successful_pair_trading'] = (combined.both_legs_profited) & (combined.pnls > combined.SPY_return)
combined.both_legs_profited = combined.both_legs_profited.astype(int, errors='ignore')

In [None]:
combined.to_csv(f'Data/Training/pair_features_updated_300_20.csv', index=False)

In [None]:
# combined = pd.merge(combined, pnl_metadata_tb[['Date', 'Ticker_P1','Ticker_P2', 'trade_executions']], how='left', on=['Date', 'Ticker_P1','Ticker_P2'])

In [None]:
combined.columns

In [None]:
1539825/1225

In [None]:
with open('Data/spotcheckout_output.pkl','wb') as file:
    pickle.dump(combined, file)