In [2]:
from typing import List, Tuple, Optional, Mapping
import pandas as pd
import numpy as np

In [20]:
class IndicesSelector():
    def __init__(self):
        pass

    def select(self, df: pd.DataFrame) -> List[str]:
        pass

In [221]:
class PairsSelector():
    def __init__(self):
        pass

    def select(self, df: pd.DataFrame) -> List[List[str]]:
        training_df = pd.read_csv('TrainingSet.csv')
        pairs = [[p.split("_")[0], p.split("_")[1]] for p in training_df["Ticker_Pair"].unique()]
        return pairs

In [222]:
class Predictor():
    def __init__(self):
        pass
    
    def train(self, data: pd.DataFrame, params: Optional[Mapping] = None):
        pass

    def predict(self, data: pd.DataFrame, params: Optional[Mapping] = None) -> pd.DataFrame:
        pass

    def periodic_train_predict(self, data: pd.DataFrame, params: Optional[Mapping] = None) -> pd.DataFrame:
        pass

In [223]:
class MockPredictor(Predictor):
    def __init__(self):
        pass

    def train(self, data, params: Optional[Mapping] = None):
        pass

    def predict(self, data: pd.DataFrame, params: Optional[Mapping] = None) -> pd.DataFrame:
        return pd.read_csv("mock_evaluation_df.csv")

    def periodic_train_predict(self, data: pd.DataFrame, params: Optional[Mapping] = None) -> pd.DataFrame:
        return self.predict(data, params)
    

In [224]:
class SignalGenerator():
    def __init__(self):
        pass

    def pair_sig_to_asset_sig(self, price_df: pd.DataFrame, signal_df: pd.DataFrame) -> pd.DataFrame:
        assets = price_df.columns
        strategy_asset = [[None] * len(assets)]

        def process_signal(x):
            buy_symbol = x['pair'].split(',')[0]
            sell_symbol = x['pair'].split(',')[1]
            if x['side']:
                strategy_asset[-1][assets.get_loc(buy_symbol)] = True 
                strategy_asset[-1][assets.get_loc(sell_symbol)] = False 
                strategy_asset.append([None] * len(assets))
                strategy_asset[-1][assets.get_loc(buy_symbol)] = False 
                strategy_asset[-1][assets.get_loc(sell_symbol)] = True 
            else:
                strategy_asset[-1][assets.get_loc(buy_symbol)] = False 
                strategy_asset[-1][assets.get_loc(sell_symbol)] = True 
                strategy_asset.append([None] * len(assets))
                strategy_asset[-1][assets.get_loc(buy_symbol)] = True 
                strategy_asset[-1][assets.get_loc(sell_symbol)] = False

        signal_df.iloc[:-1].apply(process_signal, axis=1) 
        return pd.DataFrame(strategy_asset, columns=price_df.columns, index=signal_df.index)


    def generate(self, pairs: List[str], price_df: pd.DataFrame, \
                 predict_df: pd.DataFrame, params: Optional[Mapping] = None)\
                 -> Tuple[pd.DataFrame, pd.DataFrame]:
        pass

In [225]:
class PercentileCurrent(SignalGenerator):
    def __init__(self):
        pass

    def generate(self, pairs: List[str], price_df: pd.DataFrame, \
                 predict_df: pd.DataFrame, params: Optional[Mapping] = None)\
                 -> Tuple[pd.DataFrame, pd.DataFrame]:
        price_df = price_df.pivot_table("ETF Price", ["Date"], columns="ETF_Ticker")
        price_df.index =pd.to_datetime(price_df.index)
        if params is None:
            params = {
                'holding_period': '1M',
                'distribution_period': '60D'
            }
        holding_period = params["holding_period"]
        distribution_period = params["distribution_period"]
        signal_df = pd.DataFrame()
        for pair in pairs:
            col = ','.join(pair)
            signal_df[col] = price_df[pair[0]] - price_df[pair[1]]
        signal_df = signal_df.rolling(distribution_period).apply(lambda x: pd.Series(x).rank(pct=True).iloc[-1]) - 0.5
        signal_df.resample(holding_period).first()
        signal_df['pair'] = np.abs(signal_df).idxmax(axis=1)   
        signal_df['side'] = signal_df.apply(lambda x: x[x["pair"]] < 0, axis=1)
        pair_sig = self.pair_sig_to_asset_sig(price_df, signal_df)
        asset_sig = signal_df[['pair', 'side']]
        return (pair_sig, asset_sig)
        

In [231]:
class MostSpreadReturnPredict(SignalGenerator):
    def __init__(self):
        pass

    def generate(self, pairs: List[str], price_df: pd.DataFrame, \
                 predict_df: pd.DataFrame, params: Optional[Mapping] = None)\
                 -> Tuple[pd.DataFrame, pd.DataFrame]:

        price_df = price_df.pivot_table("ETF Price", ["Date"], columns="ETF_Ticker")
        if params is None:
            params = {
                'holding_period': '1M'
            }
        holding_period = params["holding_period"]
        signal_df = pd.DataFrame()
        predict_df = predict_df.reset_index()
        signal_df = predict_df.iloc[predict_df.groupby('Date')['Predicted Returns'].agg(lambda x: np.abs(x).idxmax())]
        signal_df.set_index("Date", inplace=True)
        signal_df.index = pd.to_datetime(signal_df.index)
        signal_df = signal_df.resample(holding_period).first()
        signal_df.rename(columns={'Pair':'pair'}, inplace=True)
        signal_df['side'] = signal_df['Predicted Returns'] > 0
        return self.pair_sig_to_asset_sig(price_df, signal_df), signal_df[['pair', 'side']]

In [232]:
class PairTradingPipeline():
    """
    @param price_df DataFrame with symbol columns and price values
    @param training_ratio ratio of the subset of the price_df to be used to select paris
    """
    def __init__(self, price_df: pd.DataFrame, training_ratio: float=0.5):
        self.price_df = price_df.copy()
        training_idx = int(len(price_df) * training_ratio)
        self.index_selection_df = self.price_df[:training_idx]
        self.pair_selection_df = self.price_df[training_idx:]
        self.selected_indices = None
        self.indicies_selector = IndicesSelector()
        self.pairs_selector = PairsSelector()
        self.predictors = {
            'naive': MockPredictor()
        }
        self.predict_result = {
            'naive': None
        }
        self.signal_generator = {
            'percentile_current': PercentileCurrent()
        }

    def select_indicies(self) -> List[str]:
        self.selected_indices = self.indicies_selector.select(self.index_selection_df)
        return self.selected_indices

    def select_pairs(self) -> List[List[str]]:
        if self.selected_indices is None:
            self.select_indicies()
        self.selected_pairs = self.pairs_selector.select(self.index_selection_df[[self.selected_indices]])
        return self.selected_pairs 

    def predict(self, predictor: str, pair: List[str], params: Optional[Mapping] = None, periodic: Optional[bool] = False):
        if periodic:
            self.predict_result[predictor] = self.predictors[predictor].periodic_train_predict(self.pair_selection_df[pair])
        else:
            self.predictors[predictor].train(self.pair_selection_df)
            self.predict_result[predictor] = self.predictors[predictor].predict(self.pair_selection_df[pair])
    
    def create_signal(self, signalGenerator: str, predict_df: pd.DataFrame, params: Optional[Mapping] = None):
        if self.selected_pairs is None:
            self.select_pairs()
        self.predictors[signalGenerator].generate(self.selected_pairs, self.price_df, predict_df, params)
        


In [11]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout



In [30]:
training_df = pd.read_csv('TrainingSet.csv')
training_df

Unnamed: 0.1,Unnamed: 0,Date,Div Yield,Price to Book,Price to Earnings,Total_Ret_Pct_1,Total_Ret_Pct_5,Total_Ret_Pct_21,Total_Ret_Pct_63,Total_Ret_Pct_252,Y_Fwd_Total_Ret_Pct_1,Y_Fwd_Total_Ret_Pct_5,Y_Fwd_Total_Ret_Pct_21,Ticker_Pair
0,0,2006-01-03,,,,,,,,,,,,EWZ_ACWI
1,1,2006-01-04,,,,,,,,,,,,EWZ_ACWI
2,2,2006-01-05,,,,,,,,,,,,EWZ_ACWI
3,3,2006-01-06,,,,,,,,,,,,EWZ_ACWI
4,4,2006-01-09,,,,,,,,,,,,EWZ_ACWI
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85604,4298,2023-01-31,0.669840,3.377479,1.396734,0.0155,0.0094,0.0373,-0.0656,0.0561,-0.0292,,,IWB_B2QWDY
85605,4299,2023-02-01,0.644737,3.508506,1.451001,0.0290,0.0290,0.0931,0.0048,0.0628,-0.0313,,,IWB_B2QWDY
85606,4300,2023-02-02,0.624899,3.619865,1.497047,0.0313,0.0522,0.1179,0.0398,0.0860,0.0224,,,IWB_B2QWDY
85607,4301,2023-02-03,0.638793,3.540841,1.463975,-0.0224,0.0293,0.0768,0.0479,0.0599,0.0067,,,IWB_B2QWDY


In [45]:
price_df_raw = pd.read_csv('price_df.csv')
price_df_raw

Unnamed: 0,ETF_Ticker,Date,ETF Price
0,B2QWCY,2009-01-02,18.029589
1,B2QWCY,2013-01-02,31.022942
2,B2QWCY,2014-01-02,42.477444
3,B2QWCY,2015-01-02,44.404617
4,B2QWCY,2018-01-02,62.109604
...,...,...,...
105850,ITA,2015-12-31,59.110000
105851,ITA,2018-12-31,86.430000
105852,ITA,2019-12-31,111.040000
105853,ITA,2020-12-31,94.720000


In [65]:
price_df = price_df_raw.pivot_table("ETF Price", ["Date"], columns="ETF_Ticker")
price_df.index =pd.to_datetime(price_df.index)
pairs = [[p.split("_")[0], p.split("_")[1]] for p in training_df["Ticker_Pair"].unique()]

In [111]:
PercentileCurrent().generate(pairs, price_df, None)

(ETF_Ticker  *XEG  ACWI  ACWX B04X7G B0791H B0M635 B1Y9MZ B2422T B2QWCY B2QWDY  \
 Date                                                                            
 2006-01-03  None  None  None  False   None   None   None   None   None   None   
 2006-01-04  None  None  None  False   None   None   None   None   None   None   
 2006-01-05  None  None  None   True   None   None   None   None   None   None   
 2006-01-06  None  None  None  False   None   None   None   None   None   None   
 2006-01-09  None  None  None  False   None   None   None   None   None   None   
 ...          ...   ...   ...    ...    ...    ...    ...    ...    ...    ...   
 2023-01-31  None  None  None   True   None   None   None   None   None   True   
 2023-02-01  None  None  None   None   None   None   None   None   None   True   
 2023-02-02  None  None  None   None   None   None   None   None   None   True   
 2023-02-03  None  None  None   None   None   None   None   None   None   True   
 2023-02-06  Non

In [253]:
mock_ev = pd.read_csv("mock_evaluation_df.csv")
mock_ev["Pair"] = mock_ev["Pair"].map(lambda x: f'{x.split("_")[0]},{x.split("_")[1]}')
mock_price = mock_ev.copy()
tickers = ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "GGG", "HHHH", "QQQ", "PPP"]
mock_price["ETF_Ticker"] = mock_price.index % len(tickers)
mock_price["ETF_Ticker"] = mock_price["ETF_Ticker"].map(lambda x: tickers[x])
mock_price["ETF Price"] = 0.01
mock_ev.set_index("Date", inplace=True)
mock_ev.index = pd.to_datetime(mock_ev.index)

In [254]:
mock_price

Unnamed: 0.1,Unnamed: 0,Date,Pair,Predicted Returns,Actual Returns,ETF_Ticker,ETF Price
0,0,2015-01-02,"AAA,BBB",-0.000441,-0.002764,AAA,0.01
1,1,2015-01-03,"AAA,BBB",-0.017459,-0.028117,BBB,0.01
2,2,2015-01-04,"AAA,BBB",-0.002401,-0.009652,CCC,0.01
3,3,2015-01-05,"AAA,BBB",-0.012099,-0.003490,DDD,0.01
4,4,2015-01-06,"AAA,BBB",-0.002275,-0.008374,EEE,0.01
...,...,...,...,...,...,...,...
15331,15331,2021-12-27,"QQQ,PPP",-0.009146,0.021598,BBB,0.01
15332,15332,2021-12-28,"QQQ,PPP",0.004890,0.029214,CCC,0.01
15333,15333,2021-12-29,"QQQ,PPP",0.006402,-0.010940,DDD,0.01
15334,15334,2021-12-30,"QQQ,PPP",-0.011812,-0.027799,EEE,0.01


In [256]:
MostSpreadReturnPredict().generate(mock_ev["Pair"].unique(), mock_price, mock_ev)

(ETF_Ticker    AAA    BBB    CCC    DDD    EEE    FFF    GGG   HHHH    PPP  \
 Date                                                                        
 2015-01-31   None   None   None   None   None   None   None   None   True   
 2015-02-28   None   None   None   None   None   None  False   True  False   
 2015-03-31  False   True   None   None   None   None   True  False   None   
 2015-04-30   True  False   None   None   None   None   None   None   None   
 2015-05-31  False   True   None   None   None   None   None   None   True   
 ...           ...    ...    ...    ...    ...    ...    ...    ...    ...   
 2021-08-31   None   True  False   None   True  False   None   None   None   
 2021-09-30   None   None  False   True  False   True   None   None   None   
 2021-10-31   None   None   True  False   None   None   None   None   None   
 2021-11-30   None   None  False   True   True  False   None   None   None   
 2021-12-31   None   None   None   None  False   True   None   N