In [None]:
import os

# the exchange names used below to match ohlcv csv filenames to each other
# and to build the 'ohlcv_data/<exchange>_300/' paths the csvs are read from
exchanges = ['bitfinex', 'coinbase_pro', 'hitbtc']

# function to get pairs of ohlcv csvs from which to create arbitrage data
def get_file_pairs(exchanges):
    """Find pairs of ohlcv csvs from which to create arbitrage data.

    Every subdirectory of 'ohlcv_data' is scanned for files ending in
    '300.csv'. Two files form a pair when removing an exchange name from the
    first filename leaves a string that is contained in the second filename.

    Parameters
    ----------
    exchanges : list of str
        exchange names as they appear inside the csv filenames

    Returns
    -------
    list of [str, str]
        [filename_1, filename_2] pairs; bare filenames, without directories
    """
    # empty list to fill with filenames of all matching ohlcv csvs
    filenames = []
    # sorted, because os.listdir returns entries in arbitrary order and that
    # order decides which file ends up first in each pair
    for directory in sorted(os.listdir('ohlcv_data')):
        subdirectory = os.path.join('ohlcv_data', directory)
        # skip everything that is not a directory (.DS_Store, stray files);
        # calling os.listdir on a plain file would raise
        if not os.path.isdir(subdirectory):
            continue
        for filename in sorted(os.listdir(subdirectory)):
            if filename.endswith('300.csv'):
                filenames.append(filename)

    # empty list to fill with pairs of csvs from which to make arbitrage data
    file_pairs = []
    # enumerate gives the position directly; list.index would return the
    # first occurrence, which is wrong if a filename appears more than once
    for position, filename_1 in enumerate(filenames):
        # only compare against the filenames not yet looped through
        for filename_2 in filenames[position + 1:]:
            for exchange in exchanges:
                # an exchange that is not in the first filename cannot be
                # dropped from it, so there is nothing to compare
                if exchange not in filename_1:
                    continue
                # drop the exchange from the first filename, see if the
                # remaining string is contained in the second filename
                if filename_1.replace(exchange, '') in filename_2:
                    file_pairs.append([filename_1, filename_2])
                    # one match is enough; never add the same pair twice
                    break
    return file_pairs

# previewing the pairs of ohlcv csv filenames that will later be turned into
# arbitrage data; each entry is a [filename_1, filename_2] list
get_file_pairs(exchanges)

In [None]:
import pandas as pd

# simple function to turn a csv into a dataframe
def get_df(filename):
    """Read a csv into a dataframe, using the csv's first column as index.

    Parameters
    ----------
    filename : str
        path of the csv to load

    Returns
    -------
    pandas.DataFrame
    """
    # the first column of the csv holds the saved index, hence index_col=0
    return pd.read_csv(filename, index_col=0)

In [None]:
# this function resamples ohlcv csvs for a specified candle interval; while 
# this can be used to change the candle interval for the data, it can also be
# used to fill in gaps in the ohlcv data without changing the candle interval
def resample_ohlcv(df, period='5min'):
    """Resample ohlcv data to a regular candle interval.

    Resampling to the interval the data already has does not change the
    candles, but inserts a row for every missing interval.

    Parameters
    ----------
    df : pandas.DataFrame
        must contain a datetime 'date' column and the columns 'open',
        'high', 'low', 'close' and 'base_volume'
    period : str, default '5min'
        pandas offset alias for the candle interval; '5min' is the same
        frequency as the older '5T' spelling, which newer pandas deprecates

    Returns
    -------
    pandas.DataFrame
        indexed by 'date', holding only the five ohlcv columns; intervals
        without data have NaN prices and a base_volume of 0
    """
    # set the date as the index; resample needs a datetime-like index
    df = df.set_index('date')
    # how each column is aggregated within one candle
    ohlc_dict = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'base_volume': 'sum',
    }
    # resample(..., how=...) no longer exists in pandas; the aggregation is
    # applied to the resampler object instead
    resampler = df.resample(period, closed='left', label='left')
    return resampler.agg(ohlc_dict)

In [None]:
from ta import add_all_ta_features

# function to handle nans in the data introduced by resampling
def fill_nan(df):
    """Fill the nans that resampling introduced into an ohlcv dataframe.

    The 'close' column of the dataframe passed in is forward filled in
    place; the returned dataframe additionally has every remaining nan
    replaced by the next non-nan value to its right in the same row.
    """
    # carry the last known closing price forward through the gaps
    carried_close = df['close'].ffill()
    df['close'] = carried_close
    # fill what is still missing row-wise, from the columns to the right
    return df.bfill(axis=1)

# function to engineer features that can be engineered pre-merge...
def engineer_features(df):
    """Engineer the features that can be created before merging exchanges.

    The ohlcv data is resampled to fill gaps, gap-filling rows are flagged
    in a 'nan_ohlcv' column and then filled, technical analysis features
    are added, and a fixed list of columns is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        ohlcv data with 'closing_time' (Unix time in seconds), 'open',
        'high', 'low', 'close' and 'base_volume' columns; not modified

    Returns
    -------
    pandas.DataFrame
        one row per candle interval, with integer 'closing_time' and
        'nan_ohlcv' columns plus the remaining engineered features
    """
    # turn the closing_time, which is in Unix time, to datetime, which is
    # needed for resampling; assign returns a new dataframe, so the caller's
    # dataframe does not gain a 'date' column as a side effect
    df = df.assign(date=pd.to_datetime(df['closing_time'], unit='s'))
    # resampling fills gaps in the data
    df = resample_ohlcv(df)
    # resetting the index, which turns 'date' back into a column
    df = df.reset_index()

    # now that df has been resampled, converting back to Unix time in whole
    # seconds; subtracting the epoch and floor dividing by one second does
    # not depend on the resolution the datetimes are stored at
    df['date'] = (df['date'] - pd.Timestamp(0)) // pd.Timedelta(seconds=1)
    # also changing name back to closing_time, to be more precise
    df = df.rename(columns={'date': 'closing_time'})

    # adding feature to indicate where rows are just filling gaps in data;
    # must happen before the nans are filled
    df['nan_ohlcv'] = df['close'].isnull().astype('int64')
    # now filling in the nan values in those gap-filling rows...
    df = fill_nan(df)

    # adding all the technical analysis features...
    df = add_all_ta_features(df, 'open', 'high', 'low', 'close',
                             'base_volume', fillna=True)

    # earlier steps can leave these columns as floats; changing back to ints
    df['closing_time'] = df['closing_time'].astype('int64')
    df['nan_ohlcv'] = df['nan_ohlcv'].astype('int64')

    # dropping features that are highly correlated with other features
    df = df.drop(columns=['open', 'high', 'low', 'momentum_kama',
                          'momentum_stoch', 'others_cr', 'others_dlr',
                          'trend_ema_fast', 'trend_ema_slow',
                          'trend_ichimoku_a', 'trend_ichimoku_b', 'trend_kst',
                          'trend_macd', 'trend_visual_ichimoku_a',
                          'trend_visual_ichimoku_b', 'volatility_bbh',
                          'volatility_bbl', 'volatility_bbm',
                          'volatility_dch', 'volatility_dcl',
                          'volatility_kcc', 'volatility_kch',
                          'volatility_kcl'])

    # returning resulting dataframe
    return df

In [None]:
# the following functions are used in engineering features post-merge...

# function to create column showing which exchange has a higher closing price
def get_higher_closing_price(df):
    """Tell which exchange has the higher closing price in one row.

    Meant to be applied row-wise; `df` is a single row providing
    'close_exchange_1' and 'close_exchange_2'.

    Returns 'exchange_1', 'exchange_2', or 'equivalent' when neither
    closing price is greater than the other.
    """
    # positive when exchange 1 closed higher, negative when exchange 2 did
    price_gap = df['close_exchange_1'] - df['close_exchange_2']
    if price_gap > 0:
        return 'exchange_1'
    if price_gap < 0:
        return 'exchange_2'
    # neither price is higher (this is also where a nan gap ends up)
    return 'equivalent'

# function to create column showing percentage by which higher price is higher
def get_pct_higher(df):
    """Return the percentage by which the higher closing price is higher.

    Meant to be applied row-wise; `df` is a single row providing
    'higher_closing_price', 'close_exchange_1' and 'close_exchange_2'.
    Returns 0 when 'higher_closing_price' names neither exchange.
    """
    leader = df['higher_closing_price']
    # pick the higher price as numerator and the lower one as denominator
    if leader == 'exchange_1':
        higher_close = df['close_exchange_1']
        lower_close = df['close_exchange_2']
    elif leader == 'exchange_2':
        higher_close = df['close_exchange_2']
        lower_close = df['close_exchange_1']
    else:
        # the closing prices are equivalent
        return 0
    return ((higher_close / lower_close) - 1) * 100

# function to create column showing available arbitrage opportunities
def get_arbitrage_opportunity(df, fee_pct=.55):
    """Classify the arbitrage opportunity in one row.

    Meant to be applied row-wise; `df` is a single row providing
    'pct_higher' and 'higher_closing_price'.

    Parameters
    ----------
    df : row-like
        a single row of the merged arbitrage dataframe
    fee_pct : float, default .55
        assumed total fees in percent; the higher closing price has to be
        at least this much higher for an opportunity to exist

    Returns
    -------
    int
        0 for no arbitrage, -1 for arbitrage from exchange 2 to exchange 1,
        1 for arbitrage from exchange 1 to exchange 2
    """
    # the price difference does not cover the fees: no arbitrage
    if df['pct_higher'] < fee_pct:
        return 0
    # exchange 1 is higher: arbitrage from exchange 2 to exchange 1
    if df['higher_closing_price'] == 'exchange_1':
        return -1
    # exchange 2 is higher: arbitrage from exchange 1 to exchange 2
    if df['higher_closing_price'] == 'exchange_2':
        return 1
    # neither exchange is higher; returning 0 explicitly rather than falling
    # through to None, which would not be a valid class in the column
    return 0
    
# function to create column showing how long arbitrage opportunity has lasted
def get_window_length(df, interval_minutes=5):
    """Add a column showing how long the current window has lasted.

    A window is a run of consecutive rows with the same value in the
    'arbitrage_opportunity' column, so this covers both arbitrage and
    no-arbitrage windows.

    Parameters
    ----------
    df : pandas.DataFrame
        must contain an 'arbitrage_opportunity' column; a 'window_length'
        column is added to this dataframe in place
    interval_minutes : int, default 5
        minutes represented by one row

    Returns
    -------
    pandas.DataFrame
        the same dataframe, with the new 'window_length' column
    """
    # converting arbitrage_opportunity column to a list...
    target_list = df['arbitrage_opportunity'].to_list()
    # creating empty list to fill with values and ultimately convert to column
    window_lengths = []
    window_length = 0
    for i, target in enumerate(target_list):
        # the first row has no previous row; without the i > 0 check it
        # would be compared to the last row, because index -1 wraps around
        if i > 0 and target == target_list[i - 1]:
            # same value as the previous row: the window keeps growing
            window_length += interval_minutes
        else:
            # the value changed (or this is the first row): new window
            window_length = interval_minutes
        window_lengths.append(window_length)
    # convert the window lengths list to a column, showing how long arbitrage
    # window / no_arbitrage window has lasted.
    df['window_length'] = window_lengths
    # return the dataframe with the new window length column
    return df
        
# function to merge dataframes and create final features for arbitrage data
def merge_dfs(df1, df2):
    """Merge two engineered ohlcv dataframes into one arbitrage dataframe.

    The dataframes are joined on 'closing_time'; columns present in both
    get the suffixes '_exchange_1' (df1) and '_exchange_2' (df2). Date
    features, the 'arbitrage_opportunity' column and the 'window_length'
    column are then added.
    """
    # merging two modified ohlcv dfs on closing time to create arbitrage df
    merged = pd.merge(df1, df2, on='closing_time',
                      suffixes=('_exchange_1', '_exchange_2'))

    # converting the Unix closing time once, then reading the date parts
    closing_datetimes = pd.to_datetime(merged['closing_time'], unit='s')
    merged['year'] = closing_datetimes.dt.year
    merged['month'] = closing_datetimes.dt.month
    merged['day'] = closing_datetimes.dt.day

    # each of these columns is computed from the one created before it
    merged['higher_closing_price'] = merged.apply(
        get_higher_closing_price, axis=1)
    merged['pct_higher'] = merged.apply(get_pct_higher, axis=1)
    merged['arbitrage_opportunity'] = merged.apply(
        get_arbitrage_opportunity, axis=1)
    # getting window_length feature
    merged = get_window_length(merged)

    # the two helper columns were only needed to engineer
    # arbitrage_opportunity and window_length, so they are dropped
    helper_columns = ['higher_closing_price', 'pct_higher']
    return merged.drop(columns=helper_columns)

In [None]:
# now the function that creates arbitrage data csvs from ohlcv csvs...
def create_all_arbitrage_csvs(exchanges):
    """Create one arbitrage data csv for every pair of ohlcv csvs.

    For each pair returned by get_file_pairs, both csvs are loaded from
    'ohlcv_data/<exchange>_300/', their features are engineered, the two
    dataframes are merged, and the result is written to 'arbitrage_data/'.

    Parameters
    ----------
    exchanges : list of str
        exchange names as they appear in the csv filenames and directories
    """
    # to_csv does not create missing directories, so make sure it exists
    os.makedirs('arbitrage_data', exist_ok=True)

    # looping through the file pairs used to generate the arbitrage data...
    for pair in get_file_pairs(exchanges):
        # reset for every pair, so that a failed lookup can never silently
        # reuse the exchanges that were found for the previous pair
        exchange_1 = None
        exchange_2 = None
        # looping through the specified exchanges...
        for exchange in exchanges:
            # if one of the specified exchanges is in the first filename...
            if exchange in pair[0]:
                # that is the first exchange;
                exchange_1 = exchange
            # if one of the specified exchanges is in the second filename...
            if exchange in pair[1]:
                # that is the second exchange.
                exchange_2 = exchange

        # without both exchanges the csvs cannot be located; skip the pair
        # instead of stopping the whole run
        if exchange_1 is None or exchange_2 is None:
            print('skipping ' + pair[0] + ' / ' + pair[1]
                  + ': could not identify both exchanges')
            continue

        # loading first ohlcv csv in pair...
        df1 = get_df(os.path.join('ohlcv_data', exchange_1 + '_300', pair[0]))
        # engineering features for first ohlcv csv...
        print('engineering df1...')
        df1 = engineer_features(df1)
        print('success!')

        # loading second ohlcv csv in pair...
        df2 = get_df(os.path.join('ohlcv_data', exchange_2 + '_300', pair[1]))
        # engineering features for second ohlcv csv...
        print('engineering df2...')
        df2 = engineer_features(df2)
        print('success!')

        # merging two ohlcv dataframes with their engineered features
        print('merging df1 and df2...')
        df = merge_dfs(df1, df2)
        print('success!')

        # the output name is the first exchange followed by the second
        # filename without its '_300' part
        end_of_filename = '_' + pair[1].replace('_300', '')
        filename = exchange_1 + end_of_filename
        print('saving...')
        # saving csv
        df.to_csv(os.path.join('arbitrage_data', filename))
        print('saved ' + filename + '!')

# running the whole pipeline: one arbitrage csv is written to 'arbitrage_data/'
# for every pair of ohlcv csvs returned by get_file_pairs
create_all_arbitrage_csvs(exchanges)