# Imports

In [5]:
!pip install scikit-learn==0.21.3



In [None]:
!pip install ta==0.4.7

In [None]:
!pip install pandas==0.25.1

In [20]:
import glob
import os
import pickle
import json
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import datetime as dt

from ta import add_all_ta_features

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

#### Background on modeling

Arbitrage models are generated by getting the combination of 2 exchanges that support the same trading pair, merging their available data, engineering features, and creating a target that signals an arbitrage opportunity. A valid arbitrage signal is when the arbitrage lasts >30 mins because it takes time to move coins from one exchange to the other in order to successfully complete the arbitrage trades. These functions will take over a day to run if not split up on separate notebooks. There are 95 total options for models, 75 of those options have enough data to train models, and with different options for parameters around ~7K models will be trained. After selecting for the best models there were 21 good ones included in this project

├── arbitrage/                        <-- The top-level directory for all arbitrage work
│   ├── arbitrage_models.ipynb        <-- notebook for arbitrage models
│   ├── all_data/                     <-- Directory with subdirectories containing 5 min candle data
│   │      ├──bitfinex_300/
│   │      │      └── data.csv
│   │      ├──coinbase_pro_300/
│   │      │      └── data.csv
│   │      ├──gemini_300/
│   │      │      └── data.csv
│   │      ├──hitbtc_300/
│   │      │      └── data.csv
│   │      └──kraken_300/
│   │             └── data.csv
│   ├── data/                         <-- Directory for csv files of 5 min candle data
│   │     └── data.csv                
│   ├── ta_data/                      <-- Directory for csv files of data after ta features engineered
│   │     └── data.csv                
│   ├── arb_data/                     <-- Directory for csv files of final arbitrage training data
│   │     └── data.csv               
│   ├── pickles/                      <-- Directory for all pickle models
│   │     └── models.pkl              
│   ├── arbitrage_pickles             <-- Directory for final models after model selection
│   │     └── models.pkl              
│   │
│   ├── cm/                           <-- Directory for confusion matrices after training models
│   │
│   ├── model_perf/                   <-- Directory for performance csvs after training models


Yes, it doesn't make sense that there are two folders with the same exact data, the only difference being the subdirectories. A function was written to get the combinations for arbitrage with the subdirectories so it's necessary for now until the function is rewritten ¯\_(ツ)_/¯

#### What are the models predicting?

The models predict whether there will be an arbitrage opportunity that starts 10 mins from the prediction time and lasts for at least 30 mins, giving a user enough times to execute trades. The model will return 0 (no arbitrage), 1 (arbitrage from exchange 1 to exchange 2) and -1 (arbitrage from exchange 2 to exchange 1)

# Data

Get all of the 5 min candle data filepaths into a variable 

In [2]:
csv_filepaths = glob.glob('data/*.csv')
len(csv_filepaths) #80

80

# Functions

#### Function to get all combinations of exchanges with the same trading pair

5 min candle data should be placed in this exact folder structure for the function to run:

In [3]:
# five supported exchanges
exchanges = ['bitfinex', 'coinbase_pro', 'gemini', 'hitbtc', 'kraken']

# function to create pairs for arbitrage datasets
def get_file_pairs(exchanges):
    """This function takes in a list of exchanges and looks through data
        directories to find all possible combinations for 2 exchanges
        with the same trading pair"""
    
    # list for filenames of ohlcv csvs
    filenames = []
    
    for directory in os.listdir('all_data'):
        # .DS_Store files can mess things up, since they aren't directories     
        if directory != '.DS_Store':
            
            # for each of the files in the subdirectory
            for filename in os.listdir('all_data/' + directory):
                
                # add to list of filenames if the file is a csv
                if filename.endswith('300.csv'):
                    filenames.append(filename)
                    
    # list for pairs of csvs
    file_pairs = []
    
    # compare filenames to eachother and append them in a list
    for filename_1 in filenames:
        # filenames we haven't looped through yet
        remaining_filenames = filenames[filenames.index(filename_1)+1:]
        
        # iterate through remaining filenames
        for filename_2 in remaining_filenames:
            
            # iterate through exchanges
            for exchange in exchanges:
                # drop the exchange from the first filename and see if the
                # remaining string is contained in the second filename
                if filename_1.replace(exchange, '') in filename_2:
                    # add the pair of filenames to the list of pairs
                    file_pairs.append([filename_1, filename_2])
                    
    return file_pairs

In [4]:
pairs = get_file_pairs(exchanges)
print(len(pairs)) #95
pairs

95


[['bitfinex_eos_usdt_300.csv', 'hitbtc_eos_usdt_300.csv'],
 ['bitfinex_bch_btc_300.csv', 'coinbase_pro_bch_btc_300.csv'],
 ['bitfinex_bch_btc_300.csv', 'kraken_bch_btc_300.csv'],
 ['bitfinex_bch_btc_300.csv', 'gemini_bch_btc_300.csv'],
 ['bitfinex_bch_btc_300.csv', 'hitbtc_bch_btc_300.csv'],
 ['bitfinex_etc_usd_300.csv', 'coinbase_pro_etc_usd_300.csv'],
 ['bitfinex_etc_usd_300.csv', 'kraken_etc_usd_300.csv'],
 ['bitfinex_btc_usd_300.csv', 'coinbase_pro_btc_usd_300.csv'],
 ['bitfinex_btc_usd_300.csv', 'kraken_btc_usd_300.csv'],
 ['bitfinex_btc_usd_300.csv', 'gemini_btc_usd_300.csv'],
 ['bitfinex_ltc_btc_300.csv', 'coinbase_pro_ltc_btc_300.csv'],
 ['bitfinex_ltc_btc_300.csv', 'kraken_ltc_btc_300.csv'],
 ['bitfinex_ltc_btc_300.csv', 'gemini_ltc_btc_300.csv'],
 ['bitfinex_ltc_btc_300.csv', 'hitbtc_ltc_btc_300.csv'],
 ['bitfinex_dash_usd_300.csv', 'coinbase_pro_dash_usd_300.csv'],
 ['bitfinex_dash_usd_300.csv', 'kraken_dash_usd_300.csv'],
 ['bitfinex_dash_btc_300.csv', 'coinbase_pro_dash_bt

#### OHLCV Data Resampling

In [5]:
def resample_ohlcv(df, period):
    """ Changes the time period on cryptocurrency ohlcv data.
        Period is a string denoted by '{time_in_minutes}T'(ex: '1T', '5T', '60T')."""

    # Set date as the index. This is needed for the function to run
    df = df.set_index(['date'])

    # Aggregation function
    ohlc_dict = {'open':'first',                                                                                                    
                 'high':'max',                                                                                                       
                 'low':'min',                                                                                                        
                 'close': 'last',                                                                                                    
                 'base_volume': 'sum'}

    # Apply resampling
    df = df.resample(period, how=ohlc_dict, closed='left', label='left')
    
    return df

#### Filling NaNs

In [6]:
# resample_ohlcv function will create NaNs in df where there were gaps in the data.
# The gaps could be caused by exchanges being down, errors from cryptowatch or the 
# exchanges themselves

def fill_nan(df):
    """Iterates through a dataframe and fills NaNs with appropriate 
        open, high, low, close values."""

    # Forward fill close column.
    df['close'] = df['close'].ffill()

    # Backward fill the open, high, low rows with the close value.
    df = df.bfill(axis=1)

    return df

#### Feature engineering - before merge

In [7]:
def engineer_features(df, period='5T'):
    """Takes a df, engineers ta features, and returns a df
       default period=['5T']"""
    
    # convert unix closing_time to datetime
    df['date'] = pd.to_datetime(df['closing_time'], unit='s')
    
    # time resampling to fill gaps in data
    df = resample_ohlcv(df, period)
    
    # move date off the index
    df = df.reset_index()
    
    # create closing_time
    closing_time = df.date.values
    df.drop(columns='date', inplace=True)
    
    # create feature to indicate where rows were gaps in data
    df['nan_ohlcv'] = df['close'].apply(lambda x: 1 if pd.isnull(x) else 0)
    
    # fill gaps in data
    df = fill_nan(df)

    # adding all the technical analysis features...
    df = add_all_ta_features(df, 'open', 'high', 'low', 'close','base_volume', fillna=True)
    
    # add closing time column
    df['closing_time'] = closing_time
    
    return df

#### Feature Engineering - after merge

In [8]:
def get_higher_closing_price(df):
    """returns the exchange with the higher closing price"""
    
    # exchange 1 has higher closing price
    if (df['close_exchange_1'] - df['close_exchange_2']) > 0:
        return 1
    
    # exchange 2 has higher closing price
    elif (df['close_exchange_1'] - df['close_exchange_2']) < 0:
        return 2
    
    # closing prices are equivalent
    else:
        return 0

def get_pct_higher(df):
    """returns the percentage of the difference between ex1/ex2 
        closing prices"""
    
    # if exchange 1 has a higher closing price than exchange 2
    if df['higher_closing_price'] == 1:
        
        # % difference
        return ((df['close_exchange_1'] / 
                 df['close_exchange_2'])-1)*100
    
    # if exchange 2 has a higher closing price than exchange 1
    elif df['higher_closing_price'] == 2:
        
        # % difference
        return ((df['close_exchange_2'] / 
                 df['close_exchange_1'])-1)*100
    
    # if closing prices are equivalent
    else:
        return 0

def get_arbitrage_opportunity(df):
    """function to create column showing available arbitrage opportunities"""
    
    # assuming the total fees are 0.55%, if the higher closing price is less
    # than 0.55% higher than the lower closing price...
    if df['pct_higher'] < .55:
        return 0 # no arbitrage
    
    # if exchange 1 closing price is more than 0.55% higher
    # than the exchange 2 closing price
    elif df['higher_closing_price'] == 1:
        return -1 # arbitrage from exchange 2 to exchange 1
    
    # if exchange 2 closing price is more than 0.55% higher
    # than the exchange 1 closing price
    elif df['higher_closing_price'] == 2:
        return 1 # arbitrage from exchange 1 to exchange 2

def get_window_length(df):
    """function to create column showing how long arbitrage opportunity has lasted"""
    
    # convert arbitrage_opportunity column to a list
    target_list = df['arbitrage_opportunity'].to_list()
    
    # set initial window length 
    window_length = 5 # time in minutes
    
    # list for window_lengths
    window_lengths = []
    
    # iterate through arbitrage_opportunity column
    for i in range(len(target_list)):
        
        # check if a value in the arbitrage_opportunity column is equal to the
        # previous value in the arbitrage_opportunity column and increase
        # window length
        if target_list[i] == target_list[i-1]:
            window_length += 5
            window_lengths.append(window_length)
            
        # if a value in the arbitrage_opportunity column is
        # not equal to the previous value in the arbitrage_opportunity column
        # reset the window length to five minutes
        else:
            window_length = 5
            window_lengths.append(window_length)
            
    # create window length column showing how long an arbitrage opportunity has lasted
    df['window_length'] = window_lengths

    return df
        

def merge_dfs(df1, df2):
    """function to merge dataframes and create final features for arbitrage data"""
    
    # merging two modified ohlcv dfs on closing time to create arbitrage df
    df = pd.merge(df1, df2, on='closing_time',
                  suffixes=('_exchange_1', '_exchange_2'))

    # convert closing_time to datetime
    df['closing_time'] = pd.to_datetime(df['closing_time']) 

    # Create additional date features.
    df['year'] = df['closing_time'].dt.year
    df['month'] = df['closing_time'].dt.month
    df['day'] = df['closing_time'].dt.day
    
    # get higher_closing_price feature to create pct_higher feature
    df['higher_closing_price'] = df.apply(get_higher_closing_price, axis=1)
    
    # get pct_higher feature to create arbitrage_opportunity feature
    df['pct_higher'] = df.apply(get_pct_higher, axis=1)
    
    # create arbitrage_opportunity feature
    df['arbitrage_opportunity'] = df.apply(get_arbitrage_opportunity, axis=1)
    
    # create window_length feature
    df = get_window_length(df)
    
    return df

#### Creating the target

In [9]:
# specifying arbitrage window length to target, in minutes
interval = 30

def get_target_value(df, interval=30):
    """function to get target values; takes df and window length to target"""
    
    # if the coming arbitrage window is as long as the targeted interval
    if df['window_length_shift'] >= interval:
        # if that window is for exchange 1 to 2
        if df['arbitrage_opportunity_shift'] == 1:
            return 1 # arbitrage from exchange 1 to 2
        
        # if that window is for exchange 2 to 1
        elif df['arbitrage_opportunity_shift'] == -1:
            return -1 # arbitrage from exchange 2 to 1
        
        # if no arbitrage opportunity
        elif df['arbitrage_opportunity_shift'] == 0:
            return 0 # no arbitrage opportunity
        
    # if the coming window is less than our targeted interval
    else:
        return 0 # no arbitrage opportunity
    

def get_target(df, interval=interval):
    """function to create target column"""
    
    # used to shift rows
    # assumes candle length is five minutes, interval is 30 mins
    rows_to_shift = int(-1*(interval/5)) # -7
    
    # arbitrage_opportunity feature, shifted by length of targeted interval
    # minus one to predict ten minutes in advance rather than five
    df['arbitrage_opportunity_shift'] = df['arbitrage_opportunity'].shift(
        rows_to_shift - 1)
    
    # window_length feature, shifted by length of targeted interval minus one
    # to predict ten minutes
    df['window_length_shift'] = df['window_length'].shift(rows_to_shift - 1)
    
    # creating target column; this will indicate if an arbitrage opportunity
    # that lasts as long as the targeted interval is forthcoming
    df['target'] = df.apply(get_target_value, axis=1)
    
    # dropping rows where target could not be calculated due to shift
    df = df[:rows_to_shift - 1] # -7
    
    return df

def get_close_shift(df, interval=interval):
    
    rows_to_shift = int(-1*(interval/5))
    df['close_exchange_1_shift'] = df['close_exchange_1'].shift(
        rows_to_shift - 2)
    df['close_exchange_2_shift'] = df['close_exchange_2'].shift(
        rows_to_shift - 2)
    return df

# function to create profit feature
def get_profit(df):
    """function to create profit feature"""
    
    # if exchange 1 has the higher closing price
    if df['higher_closing_price'] == 1:
        # return how much money you would make if you bought on exchange 2, sold
        # on exchange 1, and took account of 0.55% fees
        return (((df['close_exchange_1_shift'] / 
                 df['close_exchange_2'])-1)*100)-.55
    
    # if exchange 2 has the higher closing price
    elif df['higher_closing_price'] == 2:
        # return how much money you would make if you bought on exchange 1, sold
        # on exchange 2, and took account of 0.55% fees
        return (((df['close_exchange_2_shift'] / 
                 df['close_exchange_1'])-1)*100)-.55
    
    # if the closing prices are the same
    else:
        return 0 # no trade

#### Split names when in the format exchange_trading_pair

In [10]:
# coinbase_pro has an extra underscore so we need a function to split it differently
def get_exchange_trading_pair(ex_tp):
    
    # coinbase_pro
    if len(ex_tp.split('_')) == 4:
        exchange = ex_tp.split('_')[0] + '_' + ex_tp.split('_')[1]
        trading_pair = ex_tp.split('_')[2] + '_' + ex_tp.split('_')[3]
    
    # all other exchanges
    else:
        exchange = ex_tp.split('_')[0]
        trading_pair = ex_tp.split('_')[1] + '_' + ex_tp.split('_')[2]
        
    return exchange, trading_pair

### Generate all individual csv's with ta data (~1-2 hours)

create /ta_data directory before running this function

In [11]:
def create_all_arbitrage_csvs(csv_filepaths):
    """Takes a csv filename, creates a dataframe, engineers features,
        and saves it as a new csv in /ta_data."""
    n = 1
    for file in csv_filepaths:
        
        # create df
        df = pd.read_csv(file, index_col=0)[:1000]
        
        # define period
        period = '5T'
        
        # engineer features
        df = engineer_features(df, period)
        print('features engineered')
        
        filename = 'ta_data/' + file.split('/')[1][:-4] + '_ta.csv'
        print(filename)
        df.to_csv(filename)
        print(f'csv #{n} saved :)')
        n+=1

In [44]:
create_all_arbitrage_csvs(csv_filepaths)

Notes:
- create a /arb_data directory before running this function
- this function takes a really long time to run so it's recommended to run in sagemaker and divide the pairs in to 4 notebooks so you're running about 20 pairs in each notebook. Should take ~2-3 hours if split up on 4 notebooks.

In [12]:
def create_arb_csvs(pairs):
    
    counter = 0

    for pair in pairs:
        
        # define paths for the csv
        csv_1, csv_2 = 'ta_data/' + pair[0][:-4] + '_ta.csv', 'ta_data/' + pair[1][:-4] + '_ta.csv'
        
        # define exchanges and trading_pairs
        ex_tp_1, ex_tp_2 = pair[0][:-8], pair[1][:-8]
        exchange_1, trading_pair_1 = get_exchange_trading_pair(ex_tp_1)
        exchange_2, trading_pair_2 = get_exchange_trading_pair(ex_tp_2)
        print(exchange_1, trading_pair_1,  exchange_2, trading_pair_2)
        
        # define model_name
        model_name = exchange_1 + '_' + ex_tp_2
        print(model_name)
          
        # create dfs from csv's that already include ta features
        df1, df2 = pd.read_csv(csv_1, index_col=0), pd.read_csv(csv_2, index_col=0)       
        print('df 1 shape: ', df1.shape, 'df 2 shape: ', df2.shape)
        
        # merge dfs
        df = merge_dfs(df1, df2)
        print('dfs merged')
        print('merged df shape:' , df.shape)
        
        # create target 
        df = get_target(df)
        print(model_name, ' ', df.shape)
        
        # export csv
        path = 'arb_data/'
        csv_filename = path + model_name + '.csv'
        df.to_csv(csv_filename)
        
        counter += 1
        print(counter, '\n')

In [43]:
create_arb_csvs(pairs)

bitfinex eos_usdt hitbtc eos_usdt
bitfinex_hitbtc_eos_usdt
df 1 shape:  (59409, 69) df 2 shape:  (247395, 69)
dfs merged
merged df shape: (59409, 144)
bitfinex_hitbtc_eos_usdt   (59402, 147)
1 



#### Get arbitrage data csvs into a variable

In [13]:
arb_data_paths = glob.glob('arb_data/*.csv')
print(len(arb_data_paths))

95


#### Generate models

##### Notes:
- create /pickles and /arbitrage_pickles directories before running this function
- test that this function will run to completion before running fully
- this function takes a really long time to run so it's recommended to run in sagemaker and divide the pairs in to 4 notebooks so you're running about 20 pairs in each notebook. Should take ~4 hours if split up on 4 notebooks.

In [14]:
def create_models(arb_data_paths):
    
    counter = 0
    line = '---------------'
    performance_list = []
    confusion_dict = {}

    # this is in case the function stops running you can pick up where you left off
    # get all model paths into a variable
    model_paths = glob.glob('pickles/*.pkl')
    
    # iterate through the arbitrage csvs
    for file in arb_data_paths:
        
        # define model name
        name = file.split('/')[1][:-8]
        
        # read csv
        df = pd.read_csv(file, index_col=0)
        
        # convert str closing_time to datetime
        df['closing_time'] = pd.to_datetime(df['closing_time']) 
        
        print('\n' + line*5 + '\n' + line*2 + name.upper() + line*2 + '\n' + line*5)

        # 70/30 train/test split
        test_train_split_row = round(len(df)*.7)
        
        # get closing_time for t/t split
        test_train_split_time = df['closing_time'][test_train_split_row]

        # remove 1 week from each end of the t/t datasets to create a 
        # two week gap between the data - prevents data leakage
        train_cutoff_time = test_train_split_time - dt.timedelta(days=7)
        test_cutoff_time = test_train_split_time + dt.timedelta(days=7)
        print('cutoff time:', train_cutoff_time, test_cutoff_time)
        
        # train and test subsets
        train = df[df['closing_time'] < train_cutoff_time]
        test = df[df['closing_time'] > test_cutoff_time]
        
        # printing shapes to track progress
        print('train and test shape: ', train.shape, test.shape)
        
        # pick features
        # not using open, high, or low, which are highly correlated with close 
        # and do not improve model performance
        features = ['close_exchange_1','base_volume_exchange_1', 
                    'nan_ohlcv_exchange_1','volume_adi_exchange_1', 'volume_obv_exchange_1',
                    'volume_cmf_exchange_1', 'volume_fi_exchange_1','volume_em_exchange_1', 
                    'volume_vpt_exchange_1','volume_nvi_exchange_1', 'volatility_atr_exchange_1',
                    'volatility_bbhi_exchange_1','volatility_bbli_exchange_1', 
                    'volatility_kchi_exchange_1', 'volatility_kcli_exchange_1',
                    'volatility_dchi_exchange_1','volatility_dcli_exchange_1',
                    'trend_macd_signal_exchange_1', 'trend_macd_diff_exchange_1', 'trend_adx_exchange_1',
                    'trend_adx_pos_exchange_1', 'trend_adx_neg_exchange_1',
                    'trend_vortex_ind_pos_exchange_1', 'trend_vortex_ind_neg_exchange_1', 
                    'trend_vortex_diff_exchange_1', 'trend_trix_exchange_1',
                    'trend_mass_index_exchange_1', 'trend_cci_exchange_1',
                    'trend_dpo_exchange_1', 'trend_kst_sig_exchange_1',
                    'trend_kst_diff_exchange_1', 'trend_aroon_up_exchange_1',
                    'trend_aroon_down_exchange_1',
                    'trend_aroon_ind_exchange_1',
                    'momentum_rsi_exchange_1', 'momentum_mfi_exchange_1',
                    'momentum_tsi_exchange_1', 'momentum_uo_exchange_1',
                    'momentum_stoch_signal_exchange_1',
                    'momentum_wr_exchange_1', 'momentum_ao_exchange_1',
                    'others_dr_exchange_1', 'close_exchange_2',
                    'base_volume_exchange_2', 'nan_ohlcv_exchange_2',
                    'volume_adi_exchange_2', 'volume_obv_exchange_2',
                    'volume_cmf_exchange_2', 'volume_fi_exchange_2',
                    'volume_em_exchange_2', 'volume_vpt_exchange_2',
                    'volume_nvi_exchange_2', 'volatility_atr_exchange_2',
                    'volatility_bbhi_exchange_2', 
                    'volatility_bbli_exchange_2',
                    'volatility_kchi_exchange_2',
                    'volatility_kcli_exchange_2',
                    'volatility_dchi_exchange_2',
                    'volatility_dcli_exchange_2',
                    'trend_macd_signal_exchange_2',
                    'trend_macd_diff_exchange_2', 'trend_adx_exchange_2',
                    'trend_adx_pos_exchange_2', 'trend_adx_neg_exchange_2',
                    'trend_vortex_ind_pos_exchange_2',
                    'trend_vortex_ind_neg_exchange_2',
                    'trend_vortex_diff_exchange_2', 'trend_trix_exchange_2',
                    'trend_mass_index_exchange_2', 'trend_cci_exchange_2',
                    'trend_dpo_exchange_2', 'trend_kst_sig_exchange_2',
                    'trend_kst_diff_exchange_2', 'trend_aroon_up_exchange_2',
                    'trend_aroon_down_exchange_2',
                    'trend_aroon_ind_exchange_2',
                    'momentum_rsi_exchange_2', 'momentum_mfi_exchange_2',
                    'momentum_tsi_exchange_2', 'momentum_uo_exchange_2',
                    'momentum_stoch_signal_exchange_2',
                    'momentum_wr_exchange_2', 'momentum_ao_exchange_2',
                    'others_dr_exchange_2', 'year', 'month', 'day',
                    'higher_closing_price', 'pct_higher', 
                    'arbitrage_opportunity', 'window_length']
        
        # pick target
        target = 'target'
        
        # X, y matrix
        X_train = train[features]
        X_test = test[features]
        y_train = train[target]
        y_test = test[target]
        print('train test shapes:', X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        
        # filter out datasets that are too small
        if (X_train.shape[0] > 1000) and (X_test.shape[0] > 0):
            
#             max_depth_list = [14] # just for testing to see if function completes
            max_depth_list = [14, 15, 17, 18, 21, 25]
            for max_depth in max_depth_list:

#                 max_features_list = [50] # just for testing to see if function completes
                max_features_list = [50, 55, 60, 65, 70, 75]
                for max_features in max_features_list:

#                     n_estimator_list = [100] # just for testing to see if function completes
                    n_estimator_list = [100, 150]
                    for n_estimators in n_estimator_list:

                        # define model 
                        model_name = name + '_' + str(max_features) + '_' + str(max_depth) + '_' + str(n_estimators)
                        print(line + model_name + line)

                        # define model filename to check if it exists
                        model_path = f'pickles/{model_name}.pkl'

                        # check if the model is in the folder of models
                        if model_path not in model_paths:
                            
                            # instantiate model
                            model = RandomForestClassifier(max_features=max_features, 
                                                           max_depth=max_depth, 
                                                           n_estimators=n_estimators, 
                                                           n_jobs=-1, 
                                                           random_state=42)

                            try:
                                # fit model
                                model = model.fit(X_train, y_train)
                                print('model fitted!')

                                # train accuracy
                                train_score = model.score(X_train, y_train)
                                print('train accuracy:', train_score)

                                # make predictions
                                y_preds = model.predict(X_test)
                                print('predictions made!')

                                # test accuracy
                                score = accuracy_score(y_test, y_preds)
                                print('test accuracy:', score)

                                # save model
                                pickle.dump(model, open('pickles/{model_name}.pkl'.format(
                                            model_name=model_name), 'wb'))
                                print('pickle saved!'.format(model_name=model_name))

                            except:
                                print(line*3 + '\n' + line + 'ERROR' + line + '\n' + line*3)
                                break # break out of for loop if there is an error with modeling

                        else: # if the model exists

                            # load model
                            model = pickle.load(open(model_path, 'rb'))
                            print('model loaded')

                            # train accuracy
                            train_score = model.score(X_train, y_train)
                            print('train accuracy:', train_score)

                            # make predictions
                            y_preds = model.predict(X_test)
                            print('predictions made!')

                            # test accuracy
                            score = accuracy_score(y_test, y_preds)
                            print('test accuracy:', score)
                        
                        ######## Performance metrics ########
                        # labels for confusion matrix
                        unique_y_test = y_test.unique().tolist()
                        unique_y_preds = list(set(y_preds))
                        labels = list(set(unique_y_test + unique_y_preds))
                        labels.sort()
                        columns = [f'Predicted {label}' for label in labels]
                        index = [f'Actual {label}' for label in labels]

                        # create confusion matrix
                        confusion = pd.DataFrame(confusion_matrix(y_test, y_preds),
                                                 columns=columns, index=index)
                        print(model_name + ' confusion matrix:')
                        print(confusion, '\n')

                        # append to confusion list
                        confusion_dict[model_name] = confusion

                        # creating dataframe from test set to calculate profitability
                        test_with_preds = X_test.copy()

                        # add column with higher closing price
                        test_with_preds['higher_closing_price'] = test_with_preds.apply(
                                get_higher_closing_price, axis=1)

                        # add column with shifted closing price
                        test_with_preds = get_close_shift(test_with_preds)

                        # adding column with predictions
                        test_with_preds['pred'] = y_preds

                        # adding column with profitability of predictions
                        test_with_preds['pct_profit'] = test_with_preds.apply(
                                get_profit, axis=1).shift(-2)

                        # filtering out rows where no arbitrage is predicted
                        test_with_preds = test_with_preds[test_with_preds['pred'] != 0]

                        # calculating mean profit where arbitrage predicted...
                        pct_profit_mean = test_with_preds['pct_profit'].mean()

                        # calculating median profit where arbitrage predicted...
                        pct_profit_median = test_with_preds['pct_profit'].median()
                        print('percent profit mean:', pct_profit_mean)
                        print('percent profit median:', pct_profit_median, '\n\n')

                        # save net performance to list
                        performance_list.append([name, max_features, max_depth, n_estimators,
                                                 pct_profit_mean, pct_profit_median])
                        
                        
                        
        # i.e., if there are less than 1000 rows on which to train...
        else:
            print('{model_name}: not enough data!'.format(model_name=name))
        
        # update count
        counter += 1
        print(counter, '\n')
        
    # create a dataframe for performace of all models
    df = pd.DataFrame(performance_list, columns = ['ex_tp', 'max_features', 'max_depth', 
                                                   'n_estimators', 'pct_profit_mean','pct_profit_median'])
    

    return df, confusion_dict

In [17]:
df, confusion_dict = create_models(arb_data_paths)


---------------------------------------------------------------------------
------------------------------BITFINEX_GEMINI_BCH------------------------------
---------------------------------------------------------------------------
cutoff time: 2019-10-16 02:50:00 2019-10-30 02:50:00
train and test shape:  (6414, 147) (1596, 147)
train test shapes: (6414, 91) (1596, 91) (6414,) (1596,)
---------------bitfinex_gemini_bch_50_14_100---------------
model fitted!
train accuracy: 0.9873713751169317
predictions made!
test accuracy: 0.7142857142857143
pickle saved!
bitfinex_gemini_bch_50_14_100 confusion matrix:
           Predicted -1  Predicted 0  Predicted 1
Actual -1            20           53            1
Actual 0             27          945          170
Actual 1              0          205          175 

percent profit mean: 0.4825669023900904
percent profit median: 0.24872418149823372 


1 



#### Export model performance data into csvs and JSON 

You will need /model_perf and /cm directories to store the performance csv's and JSON if you split up running the models on several notebooks

In [22]:
# exporting model performance to csv
df.to_csv('model_perf/perf1.csv', index=False)

# exporting confusion matrices to json
class JSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if hasattr(obj, 'to_json'):
            return obj.to_json(orient='records')
        return json.JSONEncoder.default(self, obj)

with open('cm/confusion1.json', 'w') as fp:
    json.dump(confusion_dict, fp, cls=JSONEncoder)

#### Concatenate performance and confusion matrices

In [24]:
def concat_perf_dfs(filepaths):
    
    df_list = []
    for path in filepaths:
        df = pd.read_csv(path)
        df_list.append(df)
    
    df = pd.concat(df_list)
    df = df.sort_values(by='pct_profit_mean', ascending=False)
    
    return df

def concat_dicts(filepaths):
    
    confusion_dict = {}
    
    for path in filepaths:
        confusion = json.load(open(path))
        confusion_dict.update(confusion)
    
    return confusion_dict

In [29]:
# check the number of files you have in each folder
# if you ran on 4 notebooks you should have 4

perf_csv_paths = glob.glob('model_perf/*.csv')
confusion_paths = glob.glob('cm/*.json')

print(len(perf_csv_paths))
print(len(confusion_paths))

1
1


In [30]:
# concatenate all confusion matrices
confusion_dict = concat_dicts(confusion_paths)

# concatenate all performance dataframes
perf_df = concat_perf_dfs(perf_csv_paths)

print(len(confusion_dict.keys()))
print(len(perf_df))

1
1


#### look at performance dataframe

In [35]:
perf_df

Unnamed: 0,ex_tp,max_features,max_depth,n_estimators,pct_profit_mean,pct_profit_median,correct_arb,pct_wrong_0,pct_wrong_1,pct_wrong_neg1,correct_arb_neg1,correct_arb_1,correct_arb_0
0,coinbase_pro_hitbtc_bch_btc,60,17,150,9.692778,11.240564,60.0,0.000862,,0.062500,60.0,0.0,28974.0
1,coinbase_pro_hitbtc_bch_btc,60,17,100,9.583432,11.099366,54.0,0.001069,,0.052632,54.0,0.0,28975.0
2,coinbase_pro_hitbtc_bch_btc,50,14,150,9.572652,11.296690,58.0,0.000931,,0.079365,58.0,0.0,28973.0
3,coinbase_pro_hitbtc_bch_btc,60,14,150,9.275228,10.927262,64.0,0.000724,,0.085714,64.0,0.0,28972.0
4,coinbase_pro_hitbtc_bch_btc,60,14,100,9.252052,11.141901,63.0,0.000759,,0.100000,63.0,0.0,28971.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6379,coinbase_pro_kraken_ltc_usd,65,25,150,,,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
6380,coinbase_pro_kraken_ltc_usd,70,25,100,,,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
6381,coinbase_pro_kraken_ltc_usd,70,25,150,,,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0
6382,coinbase_pro_kraken_ltc_usd,75,25,100,,,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0


#### look at confusion dict

In [33]:
confusion_dict

{'bitfinex_gemini_bch_50_14_100': '[{"Predicted -1":20,"Predicted 0":53,"Predicted 1":1},{"Predicted -1":27,"Predicted 0":945,"Predicted 1":170},{"Predicted -1":0,"Predicted 0":205,"Predicted 1":175}]'}

### function for creating df with pnl and confusion matrix features

In [37]:
def model_confusion(df, confusion_dict):
    
    # create a copy of df to not overwrite original
    df = df.copy()
    
    line = '-------'
    feature_dict = {}
    model_name_list = []
    
    # iterate through all models
    for i in range(len(df)):
        
        # define model name
        model_name = (df.ex_tp.iloc[i] + '_' + str(df.max_features.iloc[i]) 
                      + '_' + str(df.max_depth.iloc[i]) + '_' + str(df.n_estimators.iloc[i]))
        model_name_list.append(model_name)
        
        # create confusion matrix for specific model
        conf_mat = pd.read_json(confusion_dict[model_name])

        #########################################################
        ############## create confusion features ################
        #########################################################
        
        # confusion matrix has -1, 0, 1 predictions
        if 'Predicted 1' in conf_mat.columns and 'Predicted -1' in conf_mat.columns:

            # % incorrect predictions for 0, 1, -1
            pct_wrong_0 = (conf_mat['Predicted 0'].loc[0] + 
                           conf_mat['Predicted 0'].loc[2])/conf_mat['Predicted 0'].sum()
            pct_wrong_1 = (conf_mat['Predicted 1'].loc[0] + 
                           conf_mat['Predicted 1'].loc[1])/conf_mat['Predicted 1'].sum()
            pct_wrong_neg1 = (conf_mat['Predicted -1'].loc[1] + 
                               conf_mat['Predicted -1'].loc[2])/conf_mat['Predicted -1'].sum()

            # total number correct arbitrage preds (-1)
            correct_arb_neg1 = conf_mat['Predicted -1'].loc[0]

            # total number correct arbitrage preds (1)
            correct_arb_1 = conf_mat['Predicted 1'].loc[2]
            
            # total number correct arbitrage preds (-1) + (1)
            correct_arb = correct_arb_neg1 + correct_arb_1

            # total number correct no arbitrage preds (0)
            correct_arb_0 = conf_mat['Predicted 0'].loc[1]

        # confusion matrix has 0, 1 predictions
        elif 'Predicted 1' in conf_mat.columns:

            pct_wrong_0 = conf_mat['Predicted 0'].loc[1] / conf_mat['Predicted 0'].sum()
            pct_wrong_1 = conf_mat['Predicted 1'].loc[0] / conf_mat['Predicted 1'].sum()
            pct_wrong_neg1 = np.nan

            # total number correct arbitrage preds (-1)
            correct_arb_neg1 = 0

            # total number correct arbitrage preds (1)
            correct_arb_1 = conf_mat['Predicted 1'].loc[1]
            
            # total number correct arbitrage preds (-1) + (1)
            correct_arb = correct_arb_neg1 + correct_arb_1

            # total number correct no arbitrage preds (0)
            correct_arb_0 = conf_mat['Predicted 0'].loc[0]
        
        # confusion matrix has -1, 0 predictions
        elif 'Predicted -1' in conf_mat.columns:

            pct_wrong_0 = conf_mat['Predicted 0'].loc[0] / conf_mat['Predicted 0'].sum()
            pct_wrong_1 = np.nan
            pct_wrong_neg1 = conf_mat['Predicted -1'].loc[1] / conf_mat['Predicted -1'].sum()

            # total number correct arbitrage preds (-1)
            correct_arb_neg1 = conf_mat['Predicted -1'].loc[0]

            # total number correct arbitrage preds (1)
            correct_arb_1 = 0
            
            # total number correct arbitrage preds (-1) + (1)
            correct_arb = correct_arb_neg1 + correct_arb_1

            # total number correct no arbitrage preds (0)
            correct_arb_0 = conf_mat['Predicted 0'].loc[1]
        
        # confusion matrix has only 0
        else:
            pct_wrong_0 = 0
            pct_wrong_1 = 0
            pct_wrong_neg1 = 0
            correct_arb = 0
            correct_arb_neg1 = 0
            correct_arb_1 = 0
            correct_arb_0 = 0

        
        # add confusion features to dict
        feature_list = [correct_arb, pct_wrong_0, pct_wrong_1, pct_wrong_neg1, 
                        correct_arb_neg1, correct_arb_1, correct_arb_0]
        feature_dict[model_name] = feature_list

    # create a df from the new features
    columns = ['correct_arb', 'pct_wrong_0', 'pct_wrong_1', 'pct_wrong_neg1', 
                'correct_arb_neg1', 'correct_arb_1', 'correct_arb_0']
    df2 = pd.DataFrame(feature_dict).transpose().reset_index()
    df2 = df2.rename(columns = {'index': 'model_name', 0: 'correct_arb', 1:'pct_wrong_0', 
                                2: 'pct_wrong_1', 3: 'pct_wrong_neg1', 
                                4: 'correct_arb_neg1', 5: 'correct_arb_1', 
                                6: 'correct_arb_0'})
    
    # merge new features with performance df
    df['model_name'] = model_name_list
    print(df.shape, df2.shape)
    df = df.merge(df2, on='model_name').drop(columns = 'model_name')
    print('shape after merge:', df.shape)

    # filter for models that are predicting arb when its not happening < 15% of the time
    df2 = df[df['pct_wrong_0'] < 0.15]
    print('shape after filetering pct_wrong_0:', df.shape)

    # filter for models that predict > 25 correct arb 
    df2 = df2[df2['correct_arb'] > 25]
    print('shape after filtering correct_arb:', df.shape)
    
    # filter for models that make > 0.20% profit
    df2 = df2[df2['pct_profit_mean'] > 0.2]
    print('shape after filtering pct_profit_mean:', df.shape)
    
    df2 = df2.sort_values(by=['correct_arb'], ascending=False)
    df2 = df2.drop_duplicates(subset='ex_tp')
    
    print('shape after droping duplicates:', df.shape)
    
    # return 2 dataframes
    # df1 has performance for all models
    # df2 has performance for filtered models
    return df, df2

In [None]:
df, df2 = model_confusion(perf_df, confusion_dict)
df

In [None]:
# filter for models that are predicting arb when its not happening < 15% of the time
df3 = df[df['pct_wrong_0'] < 0.30]
print('shape after filetering pct_wrong_0:', df3.shape)

# filter for models that predict > 25 correct arb 
df3 = df3[df3['correct_arb'] > 25]
print('shape after filtering correct_arb:', df3.shape)

# filter for models that make > 0.20% profit
df3 = df3[df3['pct_profit_mean'] > 0.2]
print('shape after filtering pct_profit_mean:', df3.shape)

df3 = df3.sort_values(by=['correct_arb'], ascending=False)
df3 = df3.drop_duplicates(subset='ex_tp')

print('shape after droping duplicates:', df3.shape)

#### Move best models into a new folder

In [38]:
model_paths = glob.glob('arb_pickles/*.pkl')

In [None]:
# info for model names
models = df3['ex_tp'].values[1:]
max_features = df3['max_features'].values[1:]
max_depth = df3['max_depth'].values[1:]
n_estimators = df3['n_estimators'].values[1:]

print(len(models), len(max_features), len(max_depth), len(n_estimators))

for i in range(len(models)):
    
    # define model name
    model_name = models[i] + '_' + str(max_features[i]) + '_' + str(max_depth[i]) + '_' + str(n_estimators[i])
    
    # rename the filepath to move
    os.rename(f'pickles/{model_name}.pkl', f'arb_pickles/{models[i]}.pkl')