In [181]:
import os

def get_file_pairs(exchanges):
    filenames = []
    for directory in os.listdir('ohlcv_data'):
        if directory != '.DS_Store':
            for filename in os.listdir('ohlcv_data/' + directory):
                if filename.endswith('300.csv'):
                     filenames.append(filename)
    file_pairs = []
    for filename_1 in filenames:
        remaining_filenames = filenames[filenames.index(filename_1)+1:]
        for filename_2 in remaining_filenames:
            for exchange in exchanges:
                if filename_1.replace(exchange, '') in filename_2:
                    file_pairs.append([filename_1, filename_2])
    return file_pairs

exchanges = ['bitfinex', 'coinbase_pro', 'hitbtc']
get_file_pairs(exchanges)

[['bitfinex_eos_usdt_300.csv', 'hitbtc_eos_usdt_300.csv'],
 ['bitfinex_bch_btc_300.csv', 'coinbase_pro_bch_btc_300.csv'],
 ['bitfinex_bch_btc_300.csv', 'hitbtc_bch_btc_300.csv'],
 ['bitfinex_etc_usd_300.csv', 'coinbase_pro_etc_usd_300.csv'],
 ['bitfinex_btc_usd_300.csv', 'coinbase_pro_btc_usd_300.csv'],
 ['bitfinex_ltc_btc_300.csv', 'coinbase_pro_ltc_btc_300.csv'],
 ['bitfinex_ltc_btc_300.csv', 'hitbtc_ltc_btc_300.csv'],
 ['bitfinex_dash_usd_300.csv', 'coinbase_pro_dash_usd_300.csv'],
 ['bitfinex_dash_btc_300.csv', 'coinbase_pro_dash_btc_300.csv'],
 ['bitfinex_dash_btc_300.csv', 'hitbtc_dash_btc_300.csv'],
 ['bitfinex_ltc_usd_300.csv', 'coinbase_pro_ltc_usd_300.csv'],
 ['bitfinex_bch_usdt_300.csv', 'hitbtc_bch_usdt_300.csv'],
 ['bitfinex_bch_usd_300.csv', 'coinbase_pro_bch_usd_300.csv'],
 ['bitfinex_eos_usd_300.csv', 'coinbase_pro_eos_usd_300.csv'],
 ['bitfinex_xrp_usd_300.csv', 'coinbase_pro_xrp_usd_300.csv'],
 ['bitfinex_eth_btc_300.csv', 'coinbase_pro_eth_btc_300.csv'],
 ['bitfinex_

In [182]:
import pandas as pd

def get_df(filename):
    df = pd.read_csv(filename, index_col=0)
    return df

In [183]:
def resample_ohlcv(df, period='5T'):

    # Set date as the index. This is needed for the function to run
    df = df.set_index(['date'])

    # Aggregation function
    ohlc_dict = {                                                                                                             
    'open':'first',                                                                                                    
    'high':'max',                                                                                                       
    'low':'min',                                                                                                        
    'close': 'last',                                                                                                    
    'base_volume': 'sum'
    }

    # Apply resampling.
    df = df.resample(period, how=ohlc_dict, closed='left', label='left')
    
    return df

In [184]:
from ta import add_all_ta_features

def fill_nan(df):
    
    df['close'] = df['close'].ffill()
    df = df.bfill(axis=1)

    return df

def engineer_features(df):
    
    df['date'] = pd.to_datetime(df['closing_time'], unit='s')
    df = resample_ohlcv(df)
    df = df.reset_index()
    
    df['date'] = df['date'].astype('int64')//1e9
    df = df.rename(columns={'date': 'closing_time'})
        
    df['nan_ohlcv'] = df['close'].apply(lambda x: 1 if pd.isnull(x) else 0)
    df = fill_nan(df)
        
    df = add_all_ta_features(df, 'open', 'high', 'low', 'close',
                             'base_volume', fillna=True)
    
    df['closing_time'] = df['closing_time'].astype('int64')
    df['nan_ohlcv'] = df['nan_ohlcv'].astype('int64')
    
    df = df.drop(columns=['open', 'high', 'low', 'momentum_kama',
                          'momentum_stoch', 'others_cr', 'others_dlr',
                          'trend_ema_fast', 'trend_ema_slow', 
                          'trend_ichimoku_a', 'trend_ichimoku_b', 'trend_kst',
                          'trend_macd', 'trend_visual_ichimoku_a',
                          'trend_visual_ichimoku_b', 'volatility_bbh',
                          'volatility_bbl', 'volatility_bbm',
                          'volatility_dch', 'volatility_dcl',
                          'volatility_kcc', 'volatility_kch',
                          'volatility_kcl'])
       
    return df

In [185]:
def get_higher_closing_price(df):
    if (df['close_exchange_1'] - df['close_exchange_2']) > 0:
        return 'exchange_1'
    elif (df['close_exchange_1'] - df['close_exchange_2']) < 0:
        return 'exchange_2'
    else:
        return 'equivalent'

def get_pct_higher(df):
    if df['higher_closing_price'] == 'exchange_1':
        return ((df['close_exchange_1'] / 
                 df['close_exchange_2'])-1)*100
    elif df['higher_closing_price'] == 'exchange_2':
        return ((df['close_exchange_2'] / 
                 df['close_exchange_1'])-1)*100
    else:
        return 0
    
def get_arbitrage_opportunity(df):
    if df['pct_higher'] < .55:
        return 0 # no arbitrage
    elif df['higher_closing_price'] == 'exchange_1':
        return -1 # arbitrage exchange 2 to exchange 1
    elif df['higher_closing_price'] == 'exchange_2':
        return 1 # arbitrage exchange 1 to exchange 2
    
def get_window_length(df):
    target_list = df['arbitrage_opportunity'].to_list()
    window_length = 5
    window_lengths = []
    for i in range(len(target_list)):
            if target_list[i] == target_list[i-1]:
                window_length += 5
                window_lengths.append(window_length)
            else:
                window_length = 5
                window_lengths.append(window_length)      
    df['window_length'] = window_lengths
    return df
        
def merge_dfs(df1, df2):
    df = pd.merge(df1, df2, on='closing_time',
                  suffixes=('_exchange_1', '_exchange_2'))
        
    df['year'] = pd.to_datetime(df['closing_time'], unit='s').dt.year
    df['month'] = pd.to_datetime(df['closing_time'], unit='s').dt.month
    df['day'] = pd.to_datetime(df['closing_time'], unit='s').dt.day

    df['higher_closing_price'] = df.apply(get_higher_closing_price, axis=1)
    df['pct_higher'] = df.apply(get_pct_higher, axis=1)
    df['arbitrage_opportunity'] = df.apply(get_arbitrage_opportunity, axis=1)
    df = get_window_length(df)
    df = df.drop(columns=['higher_closing_price', 'pct_higher'])
    df = df[:-8]
    return df

In [186]:
def create_all_arbitrage_csvs(exchanges):
    for pair in get_file_pairs(exchanges):
        for exchange in exchanges:
            if exchange in pair[0]:
                exchange_1 = exchange
            if exchange in pair[1]:
                exchange_2 = exchange
        
        df1 = get_df('ohlcv_data/' + exchange_1 + '_300/' + pair[0])
        print('engineering df1...')
        df1 = engineer_features(df1)
        print('success!')

        df2 = get_df('ohlcv_data/' + exchange_2 + '_300/' + pair[1])
        print('engineering df2...')
        df2 = engineer_features(df2)
        print('success!')

        print('merging df1 and df2...')
        df = merge_dfs(df1, df2)
        print('success!')
        
        end_of_file_extension = '_' + pair[1].replace('_300', '')
        print('saving...')
        df.to_csv('arbitrage_data_new/' + exchange_1 + end_of_file_extension)
        print('saved ' + exchange_1 + end_of_file_extension + '!')

exchanges = ['bitfinex', 'coinbase_pro', 'hitbtc']
create_all_arbitrage_csvs(exchanges)

the new syntax is .resample(...)..apply(<func>)
  app.launch_new_instance()
