In [None]:
import os

# the five exchanges we are using...
exchanges = ['bitfinex', 'coinbase_pro', 'gemini', 'hitbtc', 'kraken']

# function to get pairs of ohlcv csvs from which to create arbitrage data
def get_file_pairs(exchanges):
    """Find pairs of ohlcv csvs that describe the same market on two exchanges.

    Every subdirectory of ``ohlcv_data`` is scanned for files whose name ends
    in ``300.csv``. Two filenames form a pair when the first filename, with
    one of the given exchange names removed, is contained in the second.

    Parameters
    ----------
    exchanges : list of str
        Exchange names to strip from filenames when looking for a match.

    Returns
    -------
    list of [str, str]
        Pairs of bare filenames (no directory part). Each pair appears once,
        and the order is deterministic because listings are sorted.
    """
    # collecting the filenames of all matching ohlcv csvs
    filenames = []
    # sorted, because os.listdir returns entries in arbitrary order
    for directory in sorted(os.listdir('ohlcv_data')):
        subdirectory = os.path.join('ohlcv_data', directory)
        # skipping anything that is not a directory (.DS_Store, stray files);
        # calling os.listdir on a plain file would raise
        if not os.path.isdir(subdirectory):
            continue
        for filename in sorted(os.listdir(subdirectory)):
            if filename.endswith('300.csv'):
                filenames.append(filename)

    # comparing each filename with every filename that comes after it
    file_pairs = []
    for position, filename_1 in enumerate(filenames):
        # slicing by position rather than filenames.index(...), which is
        # quadratic and picks the wrong offset if a name occurs twice
        for filename_2 in filenames[position + 1:]:
            # any(...) so a pair is added once even if several exchanges match
            if any(filename_1.replace(exchange, '') in filename_2
                   for exchange in exchanges):
                file_pairs.append([filename_1, filename_2])
    return file_pairs

# getting the list of pairs of ohlcv csvs from which to create arbitrage
# data; as the last expression in the cell, the list is shown as cell output
get_file_pairs(exchanges)

In [None]:
import pandas as pd

# simple function to turn a csv into a dataframe
def get_df(filename):
    """Load the csv at `filename` into a dataframe.

    The first csv column is used as the dataframe index, since the csv was
    written with its index included.
    """
    return pd.read_csv(filename, index_col=0)

In [None]:
# this function resamples ohlcv csvs for a specified candle interval; while 
# this can be used to change the candle interval for the data, it can also be
# used to fill in gaps in the ohlcv data without changing the candle interval
def resample_ohlcv(df, period='5min'):
    """Resample ohlcv candles onto a regular grid of `period`-long bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime column named ``date`` plus the columns
        ``open``, ``high``, ``low``, ``close`` and ``base_volume``.
    period : str, default '5min'
        Pandas offset alias for the candle length. ``'5min'`` is the same
        five-minute offset as the legacy ``'5T'`` alias, which newer pandas
        versions deprecate.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``date`` with one row per bin. Bins without source rows
        are kept, so gaps in the data show up as rows with missing prices.
    """
    # set the date as the index; resampling needs a datetime-like index
    df = df.set_index(['date'])
    # how each column is aggregated within a bin
    ohlc_dict = {
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'base_volume': 'sum',
    }
    # the `how=` keyword of resample() was removed from pandas; aggregating
    # the resampler with .agg() is the supported equivalent
    df = df.resample(period, closed='left', label='left').agg(ohlc_dict)
    return df

In [None]:
from ta import add_all_ta_features

# function to handle nans in the data introduced by resampling
def fill_nan(df):
    """Fill the nans that resampling leaves in gap rows.

    The ``close`` column of the given dataframe is forward filled in place;
    every remaining nan is then filled, row by row, from the next non-null
    value in the columns to its right. The filled dataframe is returned.
    """
    # carry the last known closing price forward through the gaps
    carried_close = df['close'].ffill()
    df['close'] = carried_close
    # fill whatever is still missing from the columns to the right
    return df.bfill(axis=1)

# function to engineer features that can be engineered pre-merge...
def engineer_features(df):
    """Engineer the per-exchange features that can be built before merging.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw ohlcv data with a ``closing_time`` column in Unix seconds and
        ``open``, ``high``, ``low``, ``close`` and ``base_volume`` columns.
        The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        Resampled data with gaps filled, a ``nan_ohlcv`` flag marking the
        gap-filling rows, and the selected technical analysis features.
    """
    # turn the closing_time, which is in Unix time, into a datetime column;
    # assign() builds a new frame, so the caller's frame is left untouched
    df = df.assign(date=pd.to_datetime(df['closing_time'], unit='s'))
    # resampling needs the datetime column, and fills gaps in the data
    df = resample_ohlcv(df)
    # resetting the index
    df = df.reset_index()

    # now that df has been resampled, converting back to Unix seconds;
    # dividing the offset from the epoch by one second gives whole seconds
    # regardless of the resolution the datetimes are stored in
    df['date'] = ((df['date'] - pd.Timestamp('1970-01-01'))
                  // pd.Timedelta(seconds=1))
    # also changing name back to closing_time, to be more precise
    df = df.rename(columns={'date': 'closing_time'})

    # flagging rows that only exist to fill gaps in the data; must be done
    # before the nans are filled in
    df['nan_ohlcv'] = df['close'].isnull().astype('int64')
    # now filling in the nan values in those gap-filling rows...
    df = fill_nan(df)

    # adding all the technical analysis features...
    df = add_all_ta_features(df, 'open', 'high', 'low', 'close',
                             'base_volume', fillna=True)

    # earlier steps may have turned these columns into floats; changing back
    df['closing_time'] = df['closing_time'].astype('int64')
    df['nan_ohlcv'] = df['nan_ohlcv'].astype('int64')

    # specifying features to keep; here we are dropping features that are
    # highly correlated with other features
    df = df[['closing_time', 'open', 'high', 'low', 'close', 'base_volume',
             'nan_ohlcv', 'volume_adi', 'volume_obv', 'volume_cmf',
             'volume_fi', 'volume_em', 'volume_vpt', 'volume_nvi',
             'volatility_atr', 'volatility_bbhi', 'volatility_bbli',
             'volatility_kchi', 'volatility_kcli', 'volatility_dchi',
             'volatility_dcli', 'trend_macd_signal', 'trend_macd_diff',
             'trend_adx', 'trend_adx_pos', 'trend_adx_neg',
             'trend_vortex_ind_pos', 'trend_vortex_ind_neg',
             'trend_vortex_diff', 'trend_trix', 'trend_mass_index',
             'trend_cci', 'trend_dpo', 'trend_kst_sig', 'trend_kst_diff',
             'trend_aroon_up', 'trend_aroon_down', 'trend_aroon_ind',
             'momentum_rsi', 'momentum_mfi', 'momentum_tsi', 'momentum_uo',
             'momentum_stoch_signal', 'momentum_wr', 'momentum_ao',
             'others_dr']]

    # returning resulting dataframe
    return df

In [None]:
# the following functions are used in engineering features post-merge...

# function to create column showing which exchange has a higher closing price
def get_higher_closing_price(df):
    """Say which exchange has the higher closing price in a row.

    `df` is a single row (this function is applied with axis=1). Returns 1
    if exchange 1 closed higher, 2 if exchange 2 closed higher, and 0 when
    neither comparison holds.
    """
    # positive when exchange 1 is higher, negative when exchange 2 is higher
    price_gap = df['close_exchange_1'] - df['close_exchange_2']
    if price_gap > 0:
        return 1
    if price_gap < 0:
        return 2
    # neither exchange has the higher closing price
    return 0

# function to create column showing percentage by which higher price is higher
def get_pct_higher(df):
    """Return the percentage by which the higher closing price is higher.

    `df` is a single row (this function is applied with axis=1) holding
    ``higher_closing_price`` and both closing prices. Returns 0 when
    ``higher_closing_price`` is neither 1 nor 2.
    """
    leader = df['higher_closing_price']
    if leader == 1:
        # exchange 1 is higher, so measure it against exchange 2
        higher_price = df['close_exchange_1']
        lower_price = df['close_exchange_2']
    elif leader == 2:
        # exchange 2 is higher, so measure it against exchange 1
        higher_price = df['close_exchange_2']
        lower_price = df['close_exchange_1']
    else:
        # no exchange has the higher closing price
        return 0
    return ((higher_price / lower_price)-1)*100

# function to create column showing available arbitrage opportunities
def get_arbitrage_opportunity(df):
    """Classify the arbitrage opportunity in a row, assuming 0.55% fees.

    `df` is a single row (this function is applied with axis=1). Returns 0
    when ``pct_higher`` is below 0.55, -1 for arbitrage from exchange 2 to
    exchange 1, and 1 for arbitrage from exchange 1 to exchange 2. When none
    of those cases applies the result is None.
    """
    # the price difference does not cover the fees: no arbitrage
    if df['pct_higher'] < .55:
        return 0
    leader = df['higher_closing_price']
    # exchange 1 is higher: buy on exchange 2, sell on exchange 1
    if leader == 1:
        return -1
    # exchange 2 is higher: buy on exchange 1, sell on exchange 2
    if leader == 2:
        return 1
    return None
    
# function to create column showing how long arbitrage opportunity has lasted
def get_window_length(df, candle_minutes=5):
    """Add a column showing how long the current arbitrage state has lasted.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``arbitrage_opportunity`` column. A
        ``window_length`` column is added to this frame in place.
    candle_minutes : int, default 5
        Length of one row (candle) in minutes.

    Returns
    -------
    pandas.DataFrame
        The same dataframe, with ``window_length`` holding the number of
        minutes for which ``arbitrage_opportunity`` has kept its current
        value, counting the current row.
    """
    # converting arbitrage_opportunity column to a list...
    target_list = df['arbitrage_opportunity'].tolist()
    # list to fill with values and ultimately convert to column
    window_lengths = []
    window_length = candle_minutes
    for i, value in enumerate(target_list):
        # the first row has no previous row; comparing it with
        # target_list[-1] would wrap around to the last row of the data
        if i > 0 and value == target_list[i - 1]:
            # same state as the previous row: the window keeps growing
            window_length += candle_minutes
        else:
            # state changed (or first row): the window starts over
            window_length = candle_minutes
        window_lengths.append(window_length)
    # convert the window lengths list to a column
    df['window_length'] = window_lengths
    # return the dataframe with the new window length column
    return df
        
# function to merge dataframes and create final features for arbitrage data
def merge_dfs(df1, df2):
    """Merge two engineered ohlcv dataframes into one arbitrage dataframe.

    The frames are joined on ``closing_time``; columns they share get the
    suffixes ``_exchange_1`` and ``_exchange_2``. Date parts and the
    arbitrage features derived from the two closing prices are then added.
    """
    merged = pd.merge(df1, df2, on='closing_time',
                      suffixes=('_exchange_1', '_exchange_2'))

    # year, month and day all come from the same datetime conversion
    closing_datetimes = pd.to_datetime(merged['closing_time'], unit='s')
    merged['year'] = closing_datetimes.dt.year
    merged['month'] = closing_datetimes.dt.month
    merged['day'] = closing_datetimes.dt.day

    # each row-wise feature reads the one before it, so the order matters
    row_features = (
        ('higher_closing_price', get_higher_closing_price),
        ('pct_higher', get_pct_higher),
        ('arbitrage_opportunity', get_arbitrage_opportunity),
    )
    for column, row_function in row_features:
        merged[column] = merged.apply(row_function, axis=1)

    # finally the window_length feature, built from arbitrage_opportunity
    return get_window_length(merged)

In [None]:
# creating target column...

# specifying arbitrage window length to target, in minutes; functions that
# declare `interval=interval` capture this value as their default when they
# are defined, so re-run those definitions after changing it
interval=30

# function to get target values; takes df and window length to target
def get_target_value(df, interval=interval):
    """Return the target label for one row.

    `df` is a single row (this function is applied with axis=1). If the
    shifted window length reaches `interval`, the shifted arbitrage value is
    returned when it is 1, -1 or 0 (None for anything else). Otherwise the
    result is 0, meaning no arbitrage opportunity.
    """
    # the coming window is shorter than the targeted interval
    if not (df['window_length_shift'] >= interval):
        return 0
    upcoming = df['arbitrage_opportunity_shift']
    # 1: arbitrage from exchange 1 to 2; -1: from exchange 2 to 1; 0: none
    for label in (1, -1, 0):
        if upcoming == label:
            return label
    return None
    
# function to create target column
def get_target(df, interval=interval):
    """Create the target column for an arbitrage dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``arbitrage_opportunity`` and ``window_length``. The
        columns ``arbitrage_opportunity_shift``, ``window_length_shift`` and
        ``target`` are added to this frame.
    interval : int
        Arbitrage window length to target, in minutes; assumes candles are
        five minutes long.

    Returns
    -------
    pandas.DataFrame
        The dataframe without its final rows, for which no target can be
        calculated because of the shift.
    """
    # number of rows covered by the interval, negative to look ahead; one
    # more row is added to predict ten minutes in advance rather than five
    rows_to_shift = int(-1*(interval/5))
    shift_by = rows_to_shift - 1
    # arbitrage_opportunity feature, shifted by length of targeted interval
    df['arbitrage_opportunity_shift'] = df['arbitrage_opportunity'].shift(
        shift_by)
    # window_length feature, shifted by length of targeted interval
    df['window_length_shift'] = df['window_length'].shift(shift_by)
    # creating target column; the interval is passed on explicitly so the
    # threshold always matches the shift computed above (otherwise
    # get_target_value would silently fall back to its own default)
    df['target'] = df.apply(get_target_value, axis=1, args=(interval,))
    # dropping rows where target could not be calculated due to shift
    df = df[:shift_by]
    # returning resulting dataframe
    return df

In [None]:
# defining functions needed to calculate profit...
    
# function to create new features out of closing prices, shifting those
# prices by the targeted interval, minus one to predict ten minutes in advance
# rather than five
def get_close_shift(df, interval=interval):
    """Add closing prices shifted ahead by the targeted interval.

    Each shifted column holds the closing price from ``interval/5 + 1`` rows
    later. The columns are added to `df`, which is also returned.
    """
    # negative offset looks ahead; the extra row predicts ten minutes in
    # advance rather than five
    offset = int(-1*(interval/5)) - 1
    shifted_columns = {
        'close_exchange_1_shift': 'close_exchange_1',
        'close_exchange_2_shift': 'close_exchange_2',
    }
    for shifted_name, source_name in shifted_columns.items():
        df[shifted_name] = df[source_name].shift(offset)
    return df

# function to create profit feature
def get_profit(df):
    """Return the percent profit of the trade suggested by a row.

    `df` is a single row (this function is applied with axis=1). The trade
    buys at the current close on the lower-priced exchange and sells at the
    shifted close on the higher-priced one, less 0.55% fees. Returns 0 when
    neither exchange has the higher closing price.
    """
    leader = df['higher_closing_price']
    if leader == 1:
        # buy on exchange 2 now, sell on exchange 1 later
        sell_price = df['close_exchange_1_shift']
        buy_price = df['close_exchange_2']
    elif leader == 2:
        # buy on exchange 1 now, sell on exchange 2 later
        sell_price = df['close_exchange_2_shift']
        buy_price = df['close_exchange_1']
    else:
        # same closing price on both exchanges: no trade
        return 0
    return (((sell_price / buy_price)-1)*100)-.55

In [None]:
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# helper: the ordered list of columns the models are trained on
def _arbitrage_model_features():
    """Return the model's input columns, in training order.

    open, high and low are left out: they are highly correlated with close
    and do not improve model performance.
    """
    per_exchange = [
        'close', 'base_volume', 'nan_ohlcv', 'volume_adi', 'volume_obv',
        'volume_cmf', 'volume_fi', 'volume_em', 'volume_vpt', 'volume_nvi',
        'volatility_atr', 'volatility_bbhi', 'volatility_bbli',
        'volatility_kchi', 'volatility_kcli', 'volatility_dchi',
        'volatility_dcli', 'trend_macd_signal', 'trend_macd_diff',
        'trend_adx', 'trend_adx_pos', 'trend_adx_neg',
        'trend_vortex_ind_pos', 'trend_vortex_ind_neg', 'trend_vortex_diff',
        'trend_trix', 'trend_mass_index', 'trend_cci', 'trend_dpo',
        'trend_kst_sig', 'trend_kst_diff', 'trend_aroon_up',
        'trend_aroon_down', 'trend_aroon_ind', 'momentum_rsi',
        'momentum_mfi', 'momentum_tsi', 'momentum_uo',
        'momentum_stoch_signal', 'momentum_wr', 'momentum_ao', 'others_dr',
    ]
    features = ['closing_time']
    # all exchange 1 columns first, then all exchange 2 columns
    for suffix in ('_exchange_1', '_exchange_2'):
        features.extend(name + suffix for name in per_exchange)
    features.extend(['year', 'month', 'day', 'higher_closing_price',
                     'pct_higher', 'arbitrage_opportunity', 'window_length'])
    return features


# helper: work out which exchange a filename belongs to
def _match_exchange(filename, exchanges):
    """Return the last exchange whose name occurs in `filename`, or None."""
    matched = None
    for exchange in exchanges:
        if exchange in filename:
            matched = exchange
    return matched


# now the master function that creates models from ohlcv csvs...
def create_all_arbitrage_dfs_and_models(exchanges):
    """Build arbitrage data and train a model for every pair of ohlcv csvs.

    For each pair returned by get_file_pairs(exchanges), the two csvs are
    loaded and engineered, merged, given a target, split into train and test
    sets with a two week gap between them, and used to fit a random forest.
    Accuracy, a confusion matrix and the profit of predicted trades are
    printed, and the fitted model is pickled into the ``pickles`` directory.

    Parameters
    ----------
    exchanges : list of str
        Exchange names; used to find file pairs and the directory of each
        file (``ohlcv_data/<exchange>_300/``).

    Returns
    -------
    None
    """
    # features and target are the same for every pair
    features = _arbitrage_model_features()
    target = 'target'

    # looping through the file pairs used to generate the arbitrage data...
    for pair in get_file_pairs(exchanges):
        # resolving the exchanges afresh for every pair, so a value from a
        # previous pair can never be reused by accident
        exchange_1 = _match_exchange(pair[0], exchanges)
        exchange_2 = _match_exchange(pair[1], exchanges)
        if exchange_1 is None or exchange_2 is None:
            print('skipping pair with unrecognised exchange:', pair)
            continue

        # loading first ohlcv csv in pair...
        df1 = get_df('ohlcv_data/' + exchange_1 + '_300/' + pair[0])
        # engineering features for first ohlcv csv...
        print('engineering df1...')
        df1 = engineer_features(df1)
        print('success!')

        # loading second ohlcv csv in pair...
        df2 = get_df('ohlcv_data/' + exchange_2 + '_300/' + pair[1])
        # engineering features for second ohlcv csv...
        print('engineering df2...')
        df2 = engineer_features(df2)
        print('success!')

        # merging two ohlcv dataframes with their engineered features
        print('merging df1 and df2...')
        df = merge_dfs(df1, df2)
        print('success!')

        # getting the second half of the filename for the eventual model...
        end_of_model_name = '_' + pair[1].replace('_300.csv', '.pkl')
        # assembling whole of the filename for the eventual model...
        model_name = exchange_1 + end_of_model_name
        # printing the model name to track progress...
        print(model_name.replace('.pkl', '').upper())

        # getting the target feature
        df = get_target(df)

        # nothing to split if no rows are left after creating the target
        if len(df) == 0:
            print('not enough data!')
            continue

        # where to split df for 70/30 train/test split; clamped so that the
        # row always exists, and looked up by position rather than by label
        test_train_split_row = min(round(len(df)*.7), len(df) - 1)
        # getting closing time for row at which test/train split is made...
        test_train_split_time = df['closing_time'].iloc[test_train_split_row]

        # subtracting one week (in seconds) for the training cutoff and
        # adding one week for the test cutoff, which leaves a two week gap
        # between train and test data
        train_cutoff_time = test_train_split_time - 604800
        test_cutoff_time = test_train_split_time + 604800

        # training set ends one week before the split row
        train = df[df['closing_time'] < train_cutoff_time]
        # test set begins one week after the split row
        test = df[df['closing_time'] > test_cutoff_time]
        # printing shapes to track progress
        print('train and test shape:', train.shape, test.shape)

        # separating features from target
        X_train = train[features]
        X_test = test[features]
        y_train = train[target]
        y_test = test[target]

        # i.e., provided we have enough data to train on, and for testing...
        if (X_train.shape[0] > 1000) and (X_test.shape[0] > 0):
            # defining model
            model = RandomForestClassifier(max_depth=75, n_estimators=100,
                                           n_jobs=-1, random_state=42)
            # fitting the model...
            model.fit(X_train, y_train)
            print('model fitted!')
            # getting accuracy score for train set...
            train_score = model.score(X_train, y_train)
            print('train accuracy:', train_score)
            # making predictions...
            y_preds = model.predict(X_test)
            print('predictions made!')
            # getting accuracy score for test set...
            score = accuracy_score(y_test, y_preds)
            print('test accuracy:', score)

            # saving the model; the directory is created if needed and the
            # file is closed as soon as the model has been written
            # NOTE(review): model_name already ends in '.pkl', so the file
            # is written as '<name>.pkl.pkl'; kept as is in case other code
            # loads the models by that name -- confirm before renaming
            os.makedirs('pickles', exist_ok=True)
            with open('pickles/{model}.pkl'.format(model=model_name),
                      'wb') as pickle_file:
                pickle.dump(model, pickle_file)
            print('pickle saved!' + '\n')

            # getting labels for confusion matrix...
            labels = sorted(set(y_test.unique().tolist()) | set(y_preds))
            columns = [f'Predicted {label}' for label in labels]
            index = [f'Actual {label}' for label in labels]
            # creating and printing confusion matrix; labels are passed so
            # rows and columns are guaranteed to line up with the names
            confusion = pd.DataFrame(
                confusion_matrix(y_test, y_preds, labels=labels),
                columns=columns, index=index)
            print(model_name + ' confusion matrix:')
            print(confusion, '\n')

            # creating dataframe from test set to calculate profitability;
            # a copy, so that adding columns does not write into X_test
            test_with_preds = X_test.copy()
            # adding column with higher closing price...
            test_with_preds['higher_closing_price'] = test_with_preds.apply(
                get_higher_closing_price, axis=1)
            # adding column with shifted closing prices...
            test_with_preds = get_close_shift(test_with_preds)
            # adding column with predictions
            test_with_preds['pred'] = y_preds
            # adding column with profitability of predictions
            test_with_preds['pct_profit'] = test_with_preds.apply(
                get_profit, axis=1).shift(-1)
            # filtering out rows where no arbitrage is predicted
            test_with_preds = test_with_preds[test_with_preds['pred'] != 0]
            # calculating mean profit where arbitrage predicted...
            pct_profit_mean = test_with_preds['pct_profit'].mean()
            # calculating median profit where arbitrage predicted...
            pct_profit_median = test_with_preds['pct_profit'].median()
            print('percent profit mean:', pct_profit_mean)
            print('percent profit median:', pct_profit_median, '\n\n')

        # i.e., if there are too few rows on which to train or test...
        else:
            print('not enough data!')

# creating all the arbitrage dfs and models from the ohlcv data; this reads
# the csvs under ohlcv_data, prints progress and scores for every pair, and
# writes the fitted models to the pickles directory
create_all_arbitrage_dfs_and_models(exchanges)