In [1]:
import pandas as pd

interval=30

def get_target_value(df, interval=interval):

    if df['window_length_shift'] >= interval:
        if df['arbitrage_opportunity_shift'] == 1:
            return 1
        elif df['arbitrage_opportunity_shift'] == -1:
            return -1
        elif df['arbitrage_opportunity_shift'] == 0:
            return 0
    else:
        return 0
    
def get_target(df, interval=interval):
    
    rows_to_shift = int(-1*(interval/5))
    
    df['arbitrage_opportunity_shift'] = df['arbitrage_opportunity'].shift(
        rows_to_shift)
    df['window_length_shift'] = df['window_length'].shift(rows_to_shift)
    
    df['target'] = df.apply(get_target_value, axis=1)
    
    df = df.drop(columns=['window_length_shift',
                          'arbitrage_opportunity_shift'])
    
    return df

In [2]:
def get_higher_closing_price(df):
    if (df['close_exchange_1'] - df['close_exchange_2']) > 0:
        return 'exchange_1'
    elif (df['close_exchange_1'] - df['close_exchange_2']) < 0:
        return 'exchange_2'
    else:
        return 'equivalent'
    
def get_close_shift(df, interval=interval):
    rows_to_shift = int(-1*(interval/5))
    df['close_exchange_1_shift'] = df['close_exchange_1'].shift(rows_to_shift)
    df['close_exchange_2_shift'] = df['close_exchange_2'].shift(rows_to_shift)
    return df

def get_profit(df):
    if df['higher_closing_price'] == 'exchange_1':
        return (((df['close_exchange_1_shift'] / 
                 df['close_exchange_2'])-1)*100)-.55
    elif df['higher_closing_price'] == 'exchange_2':
        return (((df['close_exchange_2_shift'] / 
                 df['close_exchange_1'])-1)*100)-.55
    else:
        return 0

In [3]:
import os
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def generate_all_models():
    for filename in os.listdir('arbitrage_data'):
        if filename.endswith('.csv'):
        
            name = filename.replace('.csv', '_rfc')
            print(name.upper())

            df = pd.read_csv('arbitrage_data/' + filename, index_col=0)

            df = get_target(df)

            test_train_split_row = round(len(df)*.7)
            test_train_split_time = df['closing_time'][test_train_split_row]

            train_cutoff_time = test_train_split_time - 604800
            test_cutoff_time = test_train_split_time + 604800

            train = df[df['closing_time'] < train_cutoff_time]
            test = df[df['closing_time'] > test_cutoff_time]
            print('train and test shape:'.format(model=name), 
                  train.shape, test.shape)

            features = df.drop(columns=['target']).columns.tolist()
            target = 'target'

            X_train = train[features]
            X_test = test[features]
            y_train = train[target]
            y_test = test[target]

            model = RandomForestClassifier(max_depth=75, n_estimators=100, 
                                           n_jobs=-1, random_state=42)

            if X_train.shape[0] > 1000:
                model.fit(X_train, y_train)
                print('model fitted!')

                train_score = model.score(X_train, y_train)
                print('train accuracy:', train_score)

                y_preds = model.predict(X_test)
                print('predictions made!')

                score = accuracy_score(y_test, y_preds)
                print('test accuracy:', score)

                pickle.dump(model, open('pickles/{model}.pkl'.format(
                    model=name), 'wb'))
                print('pickle saved!'.format(model=model) + '\n')
                
                unique_y_test = y_test.unique().tolist()
                unique_y_preds = list(set(y_preds))
                labels = list(set(unique_y_test + unique_y_preds))
                labels.sort()
                columns = [f'Predicted {label}' for label in labels]
                index = [f'Actual {label}'  for label in labels]
                
                confusion = pd.DataFrame(confusion_matrix(y_test, y_preds),
                                         columns=columns, index=index)
                
                print(name + ' confusion matrix:')
                print(confusion, '\n')
                
                test_with_preds = X_test
                test_with_preds['higher_closing_price'
                               ] = test_with_preds.apply(
                    get_higher_closing_price, axis=1)
                test_with_preds = get_close_shift(test_with_preds)
                test_with_preds['pred'] = y_preds
                test_with_preds['pct_profit'] = test_with_preds.apply(
                    get_profit, axis=1).shift(-1)
                
                pct_profit_mean = test_with_preds[test_with_preds[
                    'pred'] != 0]['pct_profit'].mean()
                pct_profit_median = test_with_preds[test_with_preds[
                    'pred'] != 0]['pct_profit'].median()
                
                print('percent profit mean:', pct_profit_mean)
                print('percent profit median:', pct_profit_median, '\n\n')

            else:
                print('not enough data!'.format(model=name))
            
generate_all_models()

BITFINEX_HITBTC_BCH_BTC_RFC
train and test shape: (69786, 91) (28756, 91)
model fitted!
train accuracy: 1.0
predictions made!
test accuracy: 0.9983655584921408
pickle saved!

bitfinex_hitbtc_bch_btc_rfc confusion matrix:
           Predicted -1  Predicted 0  Predicted 1
Actual -1             0            4            0
Actual 0              0        28709            0
Actual 1              0           43            0 

percent profit mean: nan
percent profit median: nan 


BITFINEX_COINBASE_PRO_ZRX_USD_RFC
train and test shape: (76222, 91) (31514, 91)
model fitted!
train accuracy: 0.9999868804282228
predictions made!
test accuracy: 0.9560512787967252
pickle saved!

bitfinex_coinbase_pro_zrx_usd_rfc confusion matrix:
           Predicted -1  Predicted 0  Predicted 1
Actual -1            10          477            0
Actual 0            713        30119            0
Actual 1              7          188            0 

percent profit mean: -0.17667761029114532
percent profit median: -0.2822

model fitted!
train accuracy: 1.0
predictions made!
test accuracy: 0.9977557508883486
pickle saved!

bitfinex_coinbase_pro_eos_usd_rfc confusion matrix:
           Predicted -1  Predicted 0  Predicted 1
Actual -1             0           14            0
Actual 0              0        16005            0
Actual 1              0           22            0 

percent profit mean: nan
percent profit median: nan 


COINBASE_PRO_HITBTC_ETH_BTC_RFC
train and test shape: (253138, 91) (107335, 91)
model fitted!
train accuracy: 0.9999960495856015
predictions made!
test accuracy: 1.0
pickle saved!

coinbase_pro_hitbtc_eth_btc_rfc confusion matrix:
          Predicted 0
Actual 0       107335 

percent profit mean: nan
percent profit median: nan 


BITFINEX_COINBASE_PRO_ETC_USD_RFC
train and test shape: (89286, 91) (37113, 91)
model fitted!
train accuracy: 1.0
predictions made!
test accuracy: 0.9682860453210466
pickle saved!

bitfinex_coinbase_pro_etc_usd_rfc confusion matrix:
           Predicted -1  