In [1]:
import pandas as pd

# the following is used to create target values from arbitrage data csvs...

# Default arbitrage window length to target, in minutes.
# The functions in this notebook that take an `interval` parameter use this
# name as their default, and Python evaluates defaults when the `def` runs,
# so reassigning `interval` afterwards does not change those defaults unless
# the defining cells are re-run.
interval=30

# function to get target values; takes df and window length to target
def get_target_value(df, interval=interval):
    """Label a single row with the direction of the upcoming arbitrage window.

    Written to be used row-wise, e.g. ``frame.apply(get_target_value, axis=1)``.

    Parameters
    ----------
    df : row-like (e.g. a pandas Series)
        Must provide ``'window_length_shift'`` and
        ``'arbitrage_opportunity_shift'``.
    interval : number
        Minimum ``'window_length_shift'`` for the upcoming window to count.

    Returns
    -------
    int or None
        ``1`` (exchange 1 to 2), ``-1`` (exchange 2 to 1) or ``0`` (no
        opportunity, or a window shorter than ``interval``). ``None`` when
        the window is long enough but the shifted opportunity value is not
        one of 1, -1 or 0.
    """
    # a window that is too short counts as "no arbitrage"; a NaN length also
    # lands here because NaN fails the >= comparison
    if not (df['window_length_shift'] >= interval):
        return 0

    upcoming = df['arbitrage_opportunity_shift']
    # echo back whichever known direction the upcoming window has
    for direction in (1, -1, 0):
        if upcoming == direction:
            return direction

    # unrecognised value: same implicit result the if/elif chain would give
    return None
    
# function to create target column
def get_target(df, interval=interval):
    """Return a new frame with a ``'target'`` column describing the future.

    ``'arbitrage_opportunity'`` and ``'window_length'`` are shifted up by the
    number of rows that corresponds to ``interval`` minutes, and every row is
    labelled with ``get_target_value`` using that same ``interval``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'arbitrage_opportunity'`` and ``'window_length'``.
        It is not modified; the work is done on a copy.
    interval : number
        Look-ahead in minutes. It sets both the row shift and the minimum
        window length passed to ``get_target_value``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` plus ``'target'``, without the temporary shifted
        columns, and without the trailing rows that have no future data.
    """
    # used to shift rows; assumes candle length is five minutes, interval is
    # in minutes (negative so that future rows are pulled up to the present)
    rows_to_shift = int(-1*(interval/5))

    # work on a copy so the caller's frame does not silently gain the helper
    # columns and a partially meaningless 'target' column
    df = df.copy()

    # both features as they will look `interval` minutes from each row
    df['arbitrage_opportunity_shift'] = df['arbitrage_opportunity'].shift(
        rows_to_shift)
    df['window_length_shift'] = df['window_length'].shift(rows_to_shift)

    # forward `interval` explicitly: without it the row function falls back to
    # its own default and the threshold would disagree with the shift above
    df['target'] = df.apply(get_target_value, axis=1, interval=interval)

    # dropping unnecessary columns, which were only needed to engineer target
    df = df.drop(columns=['window_length_shift',
                          'arbitrage_opportunity_shift'])

    # drop the rows whose target could not be calculated due to the shift;
    # skipped when nothing was shifted, because a `[:0]` slice would throw
    # away every row instead of none
    if rows_to_shift:
        df = df.iloc[:rows_to_shift]

    return df

In [2]:
# defining functions needed to calculate profit...

# function to create column showing which exchange has a higher closing price
def get_higher_closing_price(df):
    """Name the exchange with the higher closing price for one row.

    Parameters
    ----------
    df : row-like (e.g. a pandas Series or dict)
        Must provide ``'close_exchange_1'`` and ``'close_exchange_2'``.

    Returns
    -------
    str
        ``'exchange_1'`` or ``'exchange_2'`` for the higher close, otherwise
        ``'equivalent'`` (equal prices, or a difference that compares as
        neither positive nor negative, such as NaN).
    """
    # positive spread -> exchange 1 is higher, negative -> exchange 2
    spread = df['close_exchange_1'] - df['close_exchange_2']
    if spread > 0:
        return 'exchange_1'
    if spread < 0:
        return 'exchange_2'
    return 'equivalent'
        
# function to create new features out of closing prices, shifting those
# prices by the targeted interval
def get_close_shift(df, interval=interval):
    """Add future closing prices for both exchanges as new columns.

    Writes ``'close_exchange_1_shift'`` and ``'close_exchange_2_shift'``
    directly onto ``df`` (the frame is modified in place) and returns it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'close_exchange_1'`` and ``'close_exchange_2'``.
    interval : number
        Look-ahead in minutes; converted to rows assuming five-minute candles.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the two shifted columns added.
    """
    # negative offset pulls the price from `interval` minutes ahead up to
    # the current row
    offset = int(-(interval / 5))
    column_pairs = (
        ('close_exchange_1', 'close_exchange_1_shift'),
        ('close_exchange_2', 'close_exchange_2_shift'),
    )
    for source, shifted in column_pairs:
        df[shifted] = df[source].shift(offset)
    return df

# function to create profit feature
def get_profit(df):
    """Percent profit of the trade implied by one row, net of fees.

    The trade buys at the current close on the cheaper exchange and sells at
    the shifted close on the exchange named by ``'higher_closing_price'``,
    then subtracts 0.55 (described by the original author as 0.55% fees).

    Parameters
    ----------
    df : row-like (e.g. a pandas Series or dict)
        Must provide ``'higher_closing_price'`` and, when that names an
        exchange, the matching ``close_exchange_*`` and
        ``close_exchange_*_shift`` values.

    Returns
    -------
    number
        Percent profit after fees, or ``0`` when ``'higher_closing_price'``
        is neither ``'exchange_1'`` nor ``'exchange_2'`` (no trade).
    """
    leader = df['higher_closing_price']

    # sell where the price is higher (at the shifted close), buy on the
    # other exchange (at the current close)
    if leader == 'exchange_1':
        sell_column, buy_column = 'close_exchange_1_shift', 'close_exchange_2'
    elif leader == 'exchange_2':
        sell_column, buy_column = 'close_exchange_2_shift', 'close_exchange_1'
    else:
        # same closing prices: no trade should be made
        return 0

    return (((df[sell_column] / df[buy_column])-1)*100)-.55

In [None]:
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def create_all_arbitrage_models():
    """Train, evaluate and pickle one random forest per arbitrage CSV.

    For every ``.csv`` file in ``arbitrage_data/`` (processed in sorted
    order) this function:

    1. loads the file and builds the ``'target'`` column with ``get_target``;
    2. splits on ``'closing_time'`` around the row 70% of the way through,
       leaving a gap of 604800 on each side of that row's closing time;
    3. fits a ``RandomForestClassifier`` on every column except ``'target'``;
    4. prints train/test accuracy and a confusion matrix;
    5. saves the model to ``pickles/<csv name>_rfc.pkl``;
    6. prints the mean and median ``'pct_profit'`` over the test rows where
       the model predicts an arbitrage opportunity (NaN if there are none).

    A file is skipped with ``'not enough data!'`` when it yields no rows,
    1000 or fewer training rows, or no test rows.

    Returns
    -------
    None
        Results are printed and models are written to disk.
    """
    # NOTE(review): presumably 'closing_time' is a Unix timestamp in seconds,
    # which would make this one week -- confirm against the CSVs
    gap = 604800
    # specifying name of target column
    target = 'target'

    # sorted so the processing (and output) order is reproducible
    for filename in sorted(os.listdir('arbitrage_data')):
        if not filename.endswith('.csv'):
            continue

        # getting the filename for the eventual model...
        model_name = filename.replace('.csv', '_rfc')
        print(model_name.upper())

        # loading arbitrage data csv and getting the target feature
        df = pd.read_csv(os.path.join('arbitrage_data', filename),
                         index_col=0)
        df = get_target(df)

        # nothing left after the target shift: no row to split on
        if df.empty:
            print('not enough data!')
            continue

        # row at which the train/test split is made (70% of the way through),
        # clamped so that it is always a valid position
        split_row = min(round(len(df)*.7), len(df) - 1)
        # positional lookup: the frame's index labels come from the CSV and
        # are not guaranteed to be 0..n-1
        split_time = df['closing_time'].iloc[split_row]

        # training set ends one gap before the split row, test set begins one
        # gap after it, leaving a two-gap buffer between them
        train = df[df['closing_time'] < split_time - gap]
        test = df[df['closing_time'] > split_time + gap]
        # printing shapes to track progress
        print('train and test shape:', train.shape, test.shape)

        # model uses all features; only dropping target
        features = df.drop(columns=[target]).columns.tolist()

        # separating features from target
        X_train = train[features]
        X_test = test[features]
        y_train = train[target]
        y_test = test[target]

        # need enough rows to train on and at least one row to evaluate
        if X_train.shape[0] <= 1000 or X_test.empty:
            print('not enough data!')
            continue

        # defining and fitting the model...
        model = RandomForestClassifier(max_depth=75, n_estimators=100,
                                       n_jobs=-1, random_state=42)
        model.fit(X_train, y_train)
        print('model fitted!')
        # getting accuracy score for train set...
        train_score = model.score(X_train, y_train)
        print('train accuracy:', train_score)
        # making predictions...
        y_preds = model.predict(X_test)
        print('predictions made!')
        # getting accuracy score for test set...
        score = accuracy_score(y_test, y_preds)
        print('test accuracy:', score)

        # saving the model; the context manager closes the file even if
        # pickling fails
        os.makedirs('pickles', exist_ok=True)
        with open('pickles/{model}.pkl'.format(model=model_name),
                  'wb') as pickle_file:
            pickle.dump(model, pickle_file)
        print('pickle saved!' + '\n')

        # getting labels for confusion matrix...
        unique_y_test = y_test.unique().tolist()
        unique_y_preds = list(set(y_preds))
        labels = sorted(set(unique_y_test + unique_y_preds))
        columns = [f'Predicted {label}' for label in labels]
        index = [f'Actual {label}' for label in labels]
        # passing labels explicitly keeps the matrix rows/columns in the same
        # order as the headers built above
        confusion = pd.DataFrame(
            confusion_matrix(y_test, y_preds, labels=labels),
            columns=columns, index=index)
        print(model_name + ' confusion matrix:')
        print(confusion, '\n')

        # separate copy of the test features for the profit calculation, so
        # new columns are not written onto a slice of `test`
        test_with_preds = X_test.copy()
        # adding column with higher closing price...
        test_with_preds['higher_closing_price'] = test_with_preds.apply(
            get_higher_closing_price, axis=1)
        # adding column with shifted closing prices...
        test_with_preds = get_close_shift(test_with_preds)
        # adding column with predictions
        test_with_preds['pred'] = y_preds
        # adding column with profitability of predictions
        # NOTE(review): the extra shift(-1) assigns each row the profit
        # computed for the following row -- presumably to model entering the
        # trade one candle after the prediction; confirm this is intended
        test_with_preds['pct_profit'] = test_with_preds.apply(
            get_profit, axis=1).shift(-1)
        # filtering out rows where no arbitrage is predicted
        test_with_preds = test_with_preds[test_with_preds['pred'] != 0]
        # mean and median profit where arbitrage is predicted...
        pct_profit_mean = test_with_preds['pct_profit'].mean()
        pct_profit_median = test_with_preds['pct_profit'].median()
        print('percent profit mean:', pct_profit_mean)
        print('percent profit median:', pct_profit_median, '\n\n')

# Creating all the arbitrage models from the arbitrage data csvs.
# Running this cell reads every CSV under arbitrage_data/, trains a model for
# each one, writes the pickles under pickles/, and prints the per-model
# report captured in the output below.
create_all_arbitrage_models()

BITFINEX_HITBTC_BCH_BTC_RFC
train and test shape: (69788, 91) (28756, 91)
model fitted!
train accuracy: 1.0
predictions made!
test accuracy: 0.9983655584921408
pickle saved!

bitfinex_hitbtc_bch_btc_rfc confusion matrix:
           Predicted -1  Predicted 0  Predicted 1
Actual -1             0            4            0
Actual 0              0        28709            0
Actual 1              0           43            0 

percent profit mean: nan
percent profit median: nan 


BITFINEX_COINBASE_PRO_ZRX_USD_RFC
train and test shape: (76224, 91) (31514, 91)
model fitted!
train accuracy: 0.9999868807724601
predictions made!
test accuracy: 0.9655073935393793
pickle saved!

bitfinex_coinbase_pro_zrx_usd_rfc confusion matrix:
           Predicted -1  Predicted 0  Predicted 1
Actual -1             4          483            0
Actual 0            409        30423            0
Actual 1              5          190            0 

percent profit mean: -0.18817104210296223
percent profit median: -0.2802