In [1]:
import os

def get_file_pairs():
    """Pair up CSV files that share a name suffix across exchange folders.

    Scans ``data/<directory>/`` for ``.csv`` files. A file from one
    directory is paired with a file from a later directory when the first
    file's path, with the leading ``data/<directory>/<directory>`` removed,
    occurs inside the second file's path (e.g. ``data/a/a_btc_usd.csv``
    matches ``data/b/b_btc_usd.csv`` through the shared ``_btc_usd.csv``).

    Returns:
        list: ``[path_1, path_2]`` lists, one per matched pair. Paths are
        built with ``/`` separators relative to the working directory.
        Directories and files are visited in sorted order, so the result
        is the same on every run.

    Raises:
        FileNotFoundError: if there is no ``data`` directory.
    """
    files_by_directory = {}
    # os.listdir returns entries in arbitrary order; sorting keeps both the
    # pair order and the choice of "first" exchange stable between runs.
    for directory in sorted(os.listdir('data')):
        directory_path = 'data/' + directory
        # Stray files sitting next to the exchange folders (README,
        # .DS_Store, ...) cannot be listed and would raise.
        if not os.path.isdir(directory_path):
            continue
        files_by_directory[directory] = [
            directory_path + '/' + filename
            for filename in sorted(os.listdir(directory_path))
            if filename.endswith('.csv')
        ]

    file_pairs = []
    directories = list(files_by_directory)
    for position, directory_1 in enumerate(directories):
        prefix = 'data/' + directory_1 + '/' + directory_1
        # Only look at later directories so each unordered pair appears once.
        for directory_2 in directories[position + 1:]:
            for filename_1 in files_by_directory[directory_1]:
                suffix = filename_1.replace(prefix, '')
                for filename_2 in files_by_directory[directory_2]:
                    if suffix in filename_2:
                        file_pairs.append([filename_1, filename_2])
    return file_pairs

In [2]:
import pandas as pd

def get_df(filename):
    """Read a CSV into a DataFrame, using its first column as the index.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.

    Returns:
        pandas.DataFrame: the parsed contents.
    """
    return pd.read_csv(filename, index_col=0)

In [3]:
from ta import add_all_ta_features

def engineer_features(df):
    """Add technical-analysis features and a time-gap feature to candle data.

    Args:
        df: DataFrame with 'open', 'high', 'low', 'close', 'base_volume'
            and 'closing_time' columns.

    Returns:
        pandas.DataFrame: the frame returned by ``add_all_ta_features``
        (missing values filled via ``fillna=True``), minus the raw
        open/high/low columns and the listed indicator columns, plus
        'time_since_last'.

    Note:
        'time_since_last' is each row's 'closing_time' minus the previous
        row's, so the first row is NaN. This assumes rows are ordered
        oldest-first -- TODO confirm against the source CSVs.
    """
    df = add_all_ta_features(df, 'open', 'high', 'low', 'close',
                             'base_volume', fillna=True)
    df = df.drop(columns=['open', 'high', 'low', 'momentum_kama',
                          'momentum_stoch', 'others_cr', 'others_dlr',
                          'trend_ema_fast', 'trend_ema_slow',
                          'trend_ichimoku_a', 'trend_ichimoku_b', 'trend_kst',
                          'trend_macd', 'trend_visual_ichimoku_a',
                          'trend_visual_ichimoku_b', 'volatility_bbh',
                          'volatility_bbl', 'volatility_bbm',
                          'volatility_dch', 'volatility_dcl',
                          'volatility_kcc', 'volatility_kch',
                          'volatility_kcl'])
    # diff() looks back one row. The previous diff(-1) subtracted the NEXT
    # row's closing_time, producing a negative, forward-looking value.
    df['time_since_last'] = df['closing_time'].diff()
    return df

In [4]:
def get_higher_closing_price(df, exchange_1, exchange_2):
    """Name the exchange with the higher closing price in one row.

    Args:
        df: A single row, indexable by column name, holding
            'close_<exchange_1>' and 'close_<exchange_2>'.
        exchange_1: Name of the first exchange.
        exchange_2: Name of the second exchange.

    Returns:
        ``exchange_1`` if its close is greater, ``exchange_2`` if its close
        is greater, otherwise 'equivalent' (which also covers a NaN
        difference, since neither comparison holds).
    """
    spread = df['close_' + exchange_1] - df['close_' + exchange_2]
    if spread > 0:
        return exchange_1
    if spread < 0:
        return exchange_2
    return 'equivalent'

def get_pct_higher(df, exchange_1, exchange_2):
    """Percentage by which the higher close exceeds the lower one in a row.

    Args:
        df: A single row, indexable by column name, holding
            'higher_closing_price', 'close_<exchange_1>' and
            'close_<exchange_2>'.
        exchange_1: Name of the first exchange.
        exchange_2: Name of the second exchange.

    Returns:
        ``(higher / lower - 1) * 100`` when 'higher_closing_price' names one
        of the two exchanges, otherwise 0.
    """
    leader = df['higher_closing_price']
    if leader == exchange_1:
        numerator, denominator = exchange_1, exchange_2
    elif leader == exchange_2:
        numerator, denominator = exchange_2, exchange_1
    else:
        return 0
    ratio = df['close_' + numerator] / df['close_' + denominator]
    return (ratio - 1) * 100
    
def get_target(df, exchange_1, exchange_2):
    """Build the classification label for one row.

    Args:
        df: A single row, indexable by column name, holding 'pct_higher'
            and 'higher_closing_price'.
        exchange_1: Name of the first exchange.
        exchange_2: Name of the second exchange.

    Returns:
        'no_arbitrage' when 'pct_higher' is below 0.55; otherwise
        'arbitrage_<lower>_to_<higher>', where <higher> is the exchange
        named by 'higher_closing_price'. None if that column names neither
        exchange.
    """
    if df['pct_higher'] < .55:
        return 'no_arbitrage'

    leader = df['higher_closing_price']
    if leader == exchange_1:
        origin, destination = exchange_2, exchange_1
    elif leader == exchange_2:
        origin, destination = exchange_1, exchange_2
    else:
        return None
    return 'arbitrage_' + origin + '_to_' + destination
        
def merge_dfs(df1, df2, exchange_1, exchange_2):
    """Join two exchanges' frames on 'closing_time' and label the next row.

    Args:
        df1: Frame for ``exchange_1``; needs 'closing_time' and 'close'.
        df2: Frame for ``exchange_2``; needs 'closing_time' and 'close'.
        exchange_1: Name used to suffix overlapping columns from ``df1``.
        exchange_2: Name used to suffix overlapping columns from ``df2``.

    Returns:
        pandas.DataFrame: rows whose 'closing_time' is present in both
        inputs, with a 'target' column holding the label of the FOLLOWING
        row: 'no_arbitrage', or 'arbitrage_<lower>_to_<higher>' when the
        higher close exceeds the lower by at least 0.55 percent. The final
        merged row has no following row and is dropped. If the inputs share
        no 'closing_time', an empty frame with a 'target' column is
        returned.
    """
    df = pd.merge(df1, df2, on='closing_time',
                  suffixes=('_' + exchange_1, '_' + exchange_2))

    # Work on whole columns instead of DataFrame.apply(axis=1): it is much
    # faster, and apply() on an empty frame returns a DataFrame that cannot
    # be assigned to a single column.
    close_1 = df['close_' + exchange_1]
    close_2 = df['close_' + exchange_2]
    spread = close_1 - close_2
    first_higher = spread > 0
    second_higher = spread < 0

    # Percent by which the higher close exceeds the lower; 0 where the
    # closes are equal or not comparable (NaN).
    pct_higher = pd.Series(0.0, index=df.index)
    pct_higher = pct_higher.mask(first_higher, (close_1 / close_2 - 1) * 100)
    pct_higher = pct_higher.mask(second_higher, (close_2 / close_1 - 1) * 100)

    worthwhile = pct_higher >= .55
    target = pd.Series('no_arbitrage', index=df.index, dtype=object)
    target = target.mask(worthwhile & first_higher,
                         'arbitrage_' + exchange_2 + '_to_' + exchange_1)
    target = target.mask(worthwhile & second_higher,
                         'arbitrage_' + exchange_1 + '_to_' + exchange_2)

    # Each row is labelled with what happens on the next row, so the last
    # row ends up without a label and is removed.
    df['target'] = target.shift(-1)
    return df.iloc[:-1]

In [None]:
def create_all_arbitrage_dfs():
    """Build and save one merged dataset per matched pair of exchange files.

    For every ``[path_1, path_2]`` returned by ``get_file_pairs()``, both
    files are loaded, run through ``engineer_features`` and combined with
    ``merge_dfs``. The result is written to
    ``arbitrage_data/<exchange_1>_<file name of path_2>``.

    Returns:
        None. The output is the CSV files written to ``arbitrage_data/``.
    """
    # DataFrame.to_csv does not create missing parent directories.
    os.makedirs('arbitrage_data', exist_ok=True)

    for path_1, path_2 in get_file_pairs():
        # Paths have the form 'data/<exchange>/<file name>'.
        exchange_1 = path_1.split('data/')[1].split('/')[0]
        exchange_2, filename_2 = path_2.split('data/')[1].split('/')[:2]

        df1 = engineer_features(get_df(path_1))
        df2 = engineer_features(get_df(path_2))
        df = merge_dfs(df1, df2, exchange_1, exchange_2)

        df.to_csv('arbitrage_data/' + exchange_1 + '_' + filename_2)

# Run the full pipeline: writes one merged CSV per matched pair of exchange
# files into arbitrage_data/.
create_all_arbitrage_dfs()