In [1]:
import datetime as dt
import glob
import json
import os
import pickle
import warnings
from io import StringIO

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

In [2]:
# glob returns matches in arbitrary, OS-dependent order; sort before slicing so
# that the same five performance files are selected on every run
perf_csv_paths = sorted(glob.glob('model_performance/*.csv'))[:5]
len(perf_csv_paths)

5

In [3]:
def concat_dfs(filepaths):
    """Read every CSV in `filepaths` and stack the rows into one DataFrame.

    Parameters
    ----------
    filepaths : iterable of str
        Paths of the CSV files to read with `pd.read_csv`.

    Returns
    -------
    pandas.DataFrame
        All rows of all files (each file keeps its own index labels), ordered
        by the 'pct_profit_mean' column from highest to lowest.
    """
    frames = [pd.read_csv(csv_path) for csv_path in filepaths]
    combined = pd.concat(frames)
    return combined.sort_values(by='pct_profit_mean', ascending=False)

In [4]:
# load and combine the selected performance CSVs (sorted by pct_profit_mean, descending)
perf_df = concat_dfs(perf_csv_paths)
print(perf_df.shape)
# NOTE: this head() is not the last expression of the cell, so nothing is displayed for it
perf_df.head(50)
# drop exact duplicate rows only to compare shapes; perf_df itself is left unchanged
df3 = perf_df.drop_duplicates()
df3.shape

(76, 5)


(76, 5)

In [9]:
# models whose mean percent profit is above 0.3
good_models = perf_df[perf_df['pct_profit_mean'] > 0.3]

# column values as arrays, one entry per selected model
ex_tp = good_models.ex_tp.values
max_features = good_models.max_features.values
max_depth = good_models.max_depth.values

# model names have the form '<ex_tp>_<max_features>_<max_depth>'
good_model_names = [
    tp + '_' + str(n_features) + '_' + str(depth)
    for tp, n_features, depth in zip(ex_tp, max_features, max_depth)
]

print(len(good_model_names))

29


In [10]:
# glob returns matches in arbitrary order; sort so the files are always merged in
# the same order (concat_dicts lets later files overwrite earlier ones on equal keys)
confusion_paths = sorted(glob.glob('confusion_json/*.json'))
print(len(confusion_paths))

5


In [11]:
def concat_dicts(filepaths):
    """Merge the JSON objects stored in several files into one dictionary.

    Parameters
    ----------
    filepaths : iterable of str
        Paths of JSON files; each file must contain a JSON object (or any
        value accepted by `dict.update`).

    Returns
    -------
    dict
        Union of all loaded objects. When the same key occurs in more than one
        file, the value from the file that comes later in `filepaths` wins.
    """
    confusion_dict = {}

    for path in filepaths:
        # use a context manager so every file handle is closed right after it
        # has been read instead of being left for the garbage collector
        with open(path) as json_file:
            confusion_dict.update(json.load(json_file))

    return confusion_dict

In [12]:
# merge every confusion-matrix JSON file into one dict and report how many entries it holds
confusion_dict = concat_dicts(confusion_paths)
print(len(confusion_dict))

1520


In [13]:
# Rebuild the confusion matrix of the first "good" model and show its 'Predicted 0' column.
# The JSON text is wrapped in StringIO because passing a literal JSON string to
# pd.read_json is deprecated in recent pandas versions.
# NOTE(review): assumes the dict values are JSON text, not file paths - confirm.
confusion_matrix = pd.read_json(StringIO(confusion_dict[good_model_names[0]]))
confusion_matrix['Predicted 0']

0       11
1    19651
2        4
Name: Predicted 0, dtype: int64

In [14]:
# show all model names for which a confusion matrix was loaded
confusion_dict.keys()

dict_keys(['bitfinex_gemini_ltc_btc_40_17', 'bitfinex_gemini_ltc_btc_45_17', 'bitfinex_gemini_ltc_btc_50_17', 'bitfinex_gemini_ltc_btc_55_17', 'bitfinex_gemini_ltc_btc_60_17', 'bitfinex_gemini_ltc_btc_40_20', 'bitfinex_gemini_ltc_btc_45_20', 'bitfinex_gemini_ltc_btc_50_20', 'bitfinex_gemini_ltc_btc_55_20', 'bitfinex_gemini_ltc_btc_60_20', 'bitfinex_gemini_ltc_btc_40_25', 'bitfinex_gemini_ltc_btc_45_25', 'bitfinex_gemini_ltc_btc_50_25', 'bitfinex_gemini_ltc_btc_55_25', 'bitfinex_gemini_ltc_btc_60_25', 'bitfinex_gemini_ltc_btc_40_27', 'bitfinex_gemini_ltc_btc_45_27', 'bitfinex_gemini_ltc_btc_50_27', 'bitfinex_gemini_ltc_btc_55_27', 'bitfinex_gemini_ltc_btc_60_27', 'coinbase_pro_hitbtc_xrp_btc_40_17', 'coinbase_pro_hitbtc_xrp_btc_45_17', 'coinbase_pro_hitbtc_xrp_btc_50_17', 'coinbase_pro_hitbtc_xrp_btc_55_17', 'coinbase_pro_hitbtc_xrp_btc_60_17', 'coinbase_pro_hitbtc_xrp_btc_40_20', 'coinbase_pro_hitbtc_xrp_btc_45_20', 'coinbase_pro_hitbtc_xrp_btc_50_20', 'coinbase_pro_hitbtc_xrp_btc_55_2

In [15]:
# NOTE: the result of this len(...) is discarded because it is not the last
# expression of the cell; presumably it was a check for rows with a usable
# (non-missing) pct_profit_mean - confirm or remove
len(perf_df[perf_df['pct_profit_mean'] > -100 ])
# type(perf_df['pct_profit_mean'].iloc[50])
# perf_df['pct_profit_mean'].iloc[50]
# display the 50 most profitable rows (perf_df is sorted by pct_profit_mean, descending)
perf_df.head(50)

Unnamed: 0,ex_tp,max_features,max_depth,pct_profit_mean,pct_profit_median
0,bitfinex_coinbase_pro_xrp_usd,50,20,11.721898,11.721898
0,coinbase_pro_hitbtc_bch_btc,60,17,9.999205,11.869496
1,bitfinex_coinbase_pro_bch_btc,40,27,8.762994,10.788925
0,bitfinex_coinbase_pro_bch_usd,40,17,2.722728,0.153628
0,bitfinex_coinbase_pro_btc_usd,40,20,2.490334,2.051365
0,bitfinex_coinbase_pro_eth_usd,60,17,1.875132,0.795566
2,bitfinex_coinbase_pro_etc_usd,45,17,1.815758,1.108093
1,bitfinex_coinbase_pro_ltc_usd,40,25,1.80214,1.779046
1,coinbase_pro_hitbtc_eth_btc,40,27,1.602466,1.602466
1,bitfinex_hitbtc_eth_btc,60,27,1.590873,1.890326


In [16]:
def create_confusion_features(df, confusion_dict, verbose=True):
    """Return a copy of `df` with seven confusion-matrix features added per model.

    Each row of `df` is matched to an entry of `confusion_dict` through the key
    '<ex_tp>_<max_features>_<max_depth>'. For every matched confusion matrix
    the following features are computed and merged onto the row:

    * pct_wrong_0 / pct_wrong_1 / pct_wrong_neg1 - share of the predictions of
      that class that were wrong (NaN when the class has no column in the
      matrix, or when its column sums to zero);
    * correct_arb_neg1 / correct_arb_1 / correct_arb_0 - number of correct
      predictions of that class;
    * correct_arb - correct_arb_neg1 + correct_arb_1.

    When a matrix has neither a 'Predicted 1' nor a 'Predicted -1' column all
    seven features are reported as 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Model performance table with the columns 'ex_tp', 'max_features' and
        'max_depth'. It is not modified.
    confusion_dict : dict
        Maps model name to a confusion matrix serialised as JSON, with columns
        named 'Predicted -1', 'Predicted 0' and/or 'Predicted 1'.
        NOTE(review): the values are assumed to be JSON text (not file paths),
        and each matrix is assumed to be square with integer row labels
        0..k-1 following the sorted class labels present - confirm against the
        code that writes the confusion_json files.
    verbose : bool, default True
        Print the progress/debug output (sizes, every model name, every
        confusion matrix and the branch taken). Pass False to silence it.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the seven feature columns appended.

    Raises
    ------
    KeyError
        If a model name built from `df` is not a key of `confusion_dict`.
    """
    feature_names = ['pct_wrong_0', 'pct_wrong_1', 'pct_wrong_neg1', 'correct_arb',
                     'correct_arb_neg1', 'correct_arb_1', 'correct_arb_0']

    def _log(*args):
        # debug output, only emitted when requested
        if verbose:
            print(*args)

    def _confusion_features(conf_mat):
        # compute the seven features of a single confusion matrix
        has_neg = 'Predicted -1' in conf_mat.columns
        has_pos = 'Predicted 1' in conf_mat.columns

        if not (has_neg or has_pos):
            # only class 0 was ever involved: nothing to measure
            _log('ERROR')
            return dict.fromkeys(feature_names, 0)

        if has_neg and has_pos:
            _log('starting both')
        else:
            _log('pred 1' if has_pos else 'pred neg 1')
            _log('----------------NAN---------------')

        # row i of the matrix holds the i-th class label that is present
        labels = [label for label, present
                  in ((-1, has_neg), (0, True), (1, has_pos)) if present]
        pct_wrong = {-1: np.nan, 0: np.nan, 1: np.nan}
        correct = {-1: 0, 0: 0, 1: 0}
        for row, label in enumerate(labels):
            predicted = conf_mat['Predicted {}'.format(label)]
            correct[label] = predicted.loc[row]
            # everything in the column that is off the diagonal is a wrong prediction
            pct_wrong[label] = (predicted.sum() - correct[label]) / predicted.sum()

        return {'pct_wrong_0': pct_wrong[0],
                'pct_wrong_1': pct_wrong[1],
                'pct_wrong_neg1': pct_wrong[-1],
                'correct_arb': correct[-1] + correct[1],
                'correct_arb_neg1': correct[-1],
                'correct_arb_1': correct[1],
                'correct_arb_0': correct[0]}

    # work on a copy so the caller's frame is left untouched
    df = df.copy()

    rule = '-' * 21
    _log(len(df))
    _log(len(confusion_dict.keys()))

    feature_dict = {}
    model_name_list = []
    for ex_tp, max_features, max_depth in zip(df['ex_tp'], df['max_features'],
                                               df['max_depth']):
        model_name = ex_tp + '_' + str(max_features) + '_' + str(max_depth)
        _log(rule + model_name + rule)
        model_name_list.append(model_name)

        # read_json needs a file-like object: literal JSON strings are deprecated
        conf_mat = pd.read_json(StringIO(confusion_dict[model_name]))
        _log(conf_mat)

        features = _confusion_features(conf_mat)
        feature_dict[model_name] = [features[name] for name in feature_names]

    # one row per model, one column per feature
    rename_map = {'index': 'model_name'}
    rename_map.update(enumerate(feature_names))
    df2 = pd.DataFrame(feature_dict).transpose().reset_index()
    df2 = df2.rename(columns=rename_map)

    _log(df2.shape)
    _log(len(model_name_list))

    # merge the new features onto the performance rows via a temporary key
    df['model_name'] = model_name_list
    _log(df.shape)
    df = df.merge(df2, on='model_name')
    df = df.drop(columns='model_name')
    _log(df.shape)

    return df

In [17]:
# add the confusion-matrix features to every row of the performance table and display the result
df = create_confusion_features(perf_df, confusion_dict)
df

76
1520
---------------------bitfinex_coinbase_pro_xrp_usd_50_20---------------------
   Predicted -1  Predicted 0  Predicted 1
0             0           11            0
1             0        19651            0
2             0            4            1
starting both
---------------------coinbase_pro_hitbtc_bch_btc_60_17---------------------
   Predicted -1  Predicted 0  Predicted 1
0            54           16            0
1             3        28975            0
2             0           15            0
starting both
---------------------bitfinex_coinbase_pro_bch_btc_40_27---------------------
   Predicted -1  Predicted 0  Predicted 1
0             0            7            0
1             0        28644           17
2             0           31           57
starting both
---------------------bitfinex_coinbase_pro_bch_usd_40_17---------------------
   Predicted -1  Predicted 0  Predicted 1
0            10           60            0
1           168        28436           15
2         

Unnamed: 0,ex_tp,max_features,max_depth,pct_profit_mean,pct_profit_median,pct_wrong_0,pct_wrong_1,pct_wrong_neg1,correct_arb,correct_arb_neg1,correct_arb_1,correct_arb_0
0,bitfinex_coinbase_pro_xrp_usd,50,20,11.721898,11.721898,0.000763,0.000000,,1.0,0.0,1.0,19651.0
1,coinbase_pro_hitbtc_bch_btc,60,17,9.999205,11.869496,0.001069,,0.052632,54.0,54.0,0.0,28975.0
2,bitfinex_coinbase_pro_bch_btc,40,27,8.762994,10.788925,0.001325,0.229730,,57.0,0.0,57.0,28644.0
3,bitfinex_coinbase_pro_bch_usd,40,17,2.722728,0.153628,0.002246,0.180723,0.944134,78.0,10.0,68.0,28436.0
4,bitfinex_coinbase_pro_btc_usd,40,20,2.490334,2.051365,0.375730,0.157328,0.015291,17006.0,16615.0,391.0,72479.0
...,...,...,...,...,...,...,...,...,...,...,...,...
71,bitfinex_gemini_zec_usd,40,17,,,0.018489,,,0.0,0.0,0.0,1221.0
72,kraken_gemini_eth_btc,40,17,,,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
73,coinbase_pro_kraken_ltc_usd,40,17,,,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
74,coinbase_pro_kraken_xrp_btc,40,17,,,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [18]:
# allow up to 500 rows to be shown when a DataFrame is displayed
pd.set_option('display.max_rows', 500)
print(df.shape)

# keep models whose pct_profit_mean is above 0.2
# (presumably the column is expressed in percent, i.e. > 0.20% - confirm)
df2 = df[df['pct_profit_mean'] > 0.2]
print(df2.shape)

# keep models for which fewer than 30% of the 'Predicted 0' (no-arbitrage)
# predictions were wrong, i.e. the actual class was -1 or 1
df2 = df2[df2['pct_wrong_0'] < 0.30]
print(df2.shape)
df2

# keep models with more than 50 correct arbitrage predictions (classes -1 and 1 combined)
df2 = df2[df2['correct_arb'] > 50]
print(df2.shape)
df2

(76, 12)
(32, 12)
(30, 12)
(17, 12)


Unnamed: 0,ex_tp,max_features,max_depth,pct_profit_mean,pct_profit_median,pct_wrong_0,pct_wrong_1,pct_wrong_neg1,correct_arb,correct_arb_neg1,correct_arb_1,correct_arb_0
1,coinbase_pro_hitbtc_bch_btc,60,17,9.999205,11.869496,0.001069,,0.052632,54.0,54.0,0.0,28975.0
2,bitfinex_coinbase_pro_bch_btc,40,27,8.762994,10.788925,0.001325,0.22973,,57.0,0.0,57.0,28644.0
3,bitfinex_coinbase_pro_bch_usd,40,17,2.722728,0.153628,0.002246,0.180723,0.944134,78.0,10.0,68.0,28436.0
6,bitfinex_coinbase_pro_etc_usd,45,17,1.815758,1.108093,0.014133,0.201183,0.927083,412.0,7.0,405.0,35994.0
7,bitfinex_coinbase_pro_ltc_usd,40,25,1.80214,1.779046,0.09569,0.275641,0.195514,40429.0,40316.0,113.0,44483.0
12,gemini_hitbtc_bch_btc,50,17,1.056922,0.623694,0.166144,0.515152,0.298039,211.0,179.0,32.0,1064.0
14,coinbase_pro_gemini_bch_btc,40,20,0.981067,0.688488,0.152174,0.302682,0.6875,197.0,15.0,182.0,1092.0
15,coinbase_pro_hitbtc_eth_usdc,45,17,0.909267,0.670711,0.267515,0.228312,0.228706,5896.0,3939.0,1957.0,12724.0
17,bitfinex_gemini_bch_btc,40,27,0.834138,0.460741,0.197338,0.34296,0.52381,202.0,20.0,182.0,1025.0
19,bitfinex_hitbtc_bch_usdt,40,17,0.699128,0.570062,0.250252,0.259259,0.368905,1993.0,893.0,1100.0,9674.0


## function to create a df of features from all of the confusion matrices 

In [136]:
def model_confusion2(confusion_dict, max_pct_wrong_0=0.10, min_correct_arb=25):
    """Build a filtered table of confusion-matrix features for every model.

    For each entry of `confusion_dict` the following features are computed:

    * pct_wrong_0 / pct_wrong_1 / pct_wrong_neg1 - share of the predictions of
      that class that were wrong (NaN when the class has no column in the
      matrix, or when its column sums to zero);
    * correct_arb_neg1 / correct_arb_1 / correct_arb_0 - number of correct
      predictions of that class;
    * correct_arb - correct_arb_neg1 + correct_arb_1.

    When a matrix has neither a 'Predicted 1' nor a 'Predicted -1' column all
    seven features are reported as 0.

    The dictionary key '<name>_<max_features>_<max_depth>' is split into the
    columns 'model_name', 'max_features' and 'max_depth' (the latter two stay
    strings). The table is then filtered, sorted by 'correct_arb' (descending)
    and reduced to the best row per 'model_name'. The shape after every step
    is printed.

    Parameters
    ----------
    confusion_dict : dict
        Maps model name to a confusion matrix serialised as JSON, with columns
        named 'Predicted -1', 'Predicted 0' and/or 'Predicted 1'.
        NOTE(review): the values are assumed to be JSON text (not file paths),
        and each matrix is assumed to be square with integer row labels
        0..k-1 following the sorted class labels present - confirm against the
        code that writes the confusion_json files.
    max_pct_wrong_0 : float, default 0.10
        Keep only models with pct_wrong_0 strictly below this value.
    min_correct_arb : float, default 25
        Keep only models with correct_arb strictly above this value.

    Returns
    -------
    pandas.DataFrame
        One row per remaining 'model_name', highest 'correct_arb' first.
    """
    feature_names = ['pct_wrong_0', 'pct_wrong_1', 'pct_wrong_neg1', 'correct_arb',
                     'correct_arb_neg1', 'correct_arb_1', 'correct_arb_0']

    def _confusion_features(conf_mat):
        # compute the seven features of a single confusion matrix
        has_neg = 'Predicted -1' in conf_mat.columns
        has_pos = 'Predicted 1' in conf_mat.columns

        if not (has_neg or has_pos):
            # only class 0 was ever involved: nothing to measure
            return dict.fromkeys(feature_names, 0)

        # row i of the matrix holds the i-th class label that is present
        labels = [label for label, present
                  in ((-1, has_neg), (0, True), (1, has_pos)) if present]
        pct_wrong = {-1: np.nan, 0: np.nan, 1: np.nan}
        correct = {-1: 0, 0: 0, 1: 0}
        for row, label in enumerate(labels):
            predicted = conf_mat['Predicted {}'.format(label)]
            correct[label] = predicted.loc[row]
            # everything in the column that is off the diagonal is a wrong prediction
            pct_wrong[label] = (predicted.sum() - correct[label]) / predicted.sum()

        return {'pct_wrong_0': pct_wrong[0],
                'pct_wrong_1': pct_wrong[1],
                'pct_wrong_neg1': pct_wrong[-1],
                'correct_arb': correct[-1] + correct[1],
                'correct_arb_neg1': correct[-1],
                'correct_arb_1': correct[1],
                'correct_arb_0': correct[0]}

    feature_dict = {}
    for model, conf_json in confusion_dict.items():
        # read_json needs a file-like object: literal JSON strings are deprecated
        conf_mat = pd.read_json(StringIO(conf_json))
        features = _confusion_features(conf_mat)
        feature_dict[model] = [features[name] for name in feature_names]

    # one row per model, one column per feature
    rename_map = {'index': 'model_name'}
    rename_map.update(enumerate(feature_names))
    df = pd.DataFrame(feature_dict).transpose().reset_index()
    df = df.rename(columns=rename_map)

    print('shape before filtering:', df.shape)

    # the last two '_'-separated tokens of the key are max_features and max_depth
    name_parts = [model_name.split('_') for model_name in df.model_name]
    df['max_features'] = [parts[-2] for parts in name_parts]
    df['max_depth'] = [parts[-1] for parts in name_parts]
    df['model_name'] = ['_'.join(parts[:-2]) for parts in name_parts]

    # reorder columns
    df = df[['model_name', 'max_features', 'max_depth'] + feature_names]

    # keep models whose share of wrong 'Predicted 0' predictions is below the limit
    df = df[df['pct_wrong_0'] < max_pct_wrong_0]
    print('shape after filetering pct_wrong_0:', df.shape)

    # keep models with enough correct arbitrage predictions
    df = df[df['correct_arb'] > min_correct_arb]
    print('shape after filtering correct_arb:', df.shape)

    # best (highest correct_arb) hyper-parameter combination per model
    df = df.sort_values(by=['correct_arb'], ascending=False)
    df = df.drop_duplicates(subset='model_name')
    print('shape after droping duplicates:', df.shape)

    return df

In [137]:
# best hyper-parameter combination per model, judged on the confusion matrices alone
confusion_df = model_confusion2(confusion_dict)

shape before filtering: (1520, 8)
shape after filetering pct_wrong_0: (1256, 10)
shape after filtering correct_arb: (212, 10)
shape after droping duplicates: (16, 10)


In [138]:
# display the filtered confusion-feature table
confusion_df

Unnamed: 0,model_name,max_features,max_depth,pct_wrong_0,pct_wrong_1,pct_wrong_neg1,correct_arb,correct_arb_neg1,correct_arb_1,correct_arb_0
90,bitfinex_coinbase_pro_ltc_usd,40,25,0.09569,0.275641,0.195514,40429.0,40316.0,113.0,44483.0
122,bitfinex_hitbtc_ltc_usdt,50,17,0.094937,0.452865,0.564148,544.0,248.0,296.0,13299.0
584,bitfinex_coinbase_pro_etc_usd,60,17,0.013515,0.258007,0.928814,438.0,21.0,417.0,35766.0
1477,bitfinex_coinbase_pro_zrx_usd,50,27,0.020483,1.0,0.985453,196.0,196.0,0.0,17646.0
411,bitfinex_hitbtc_eos_usdt,45,25,0.043503,0.473684,0.615385,120.0,20.0,100.0,14885.0
552,coinbase_pro_kraken_etc_usd,50,25,0.038418,0.895246,1.0,119.0,0.0,119.0,2528.0
1447,bitfinex_kraken_etc_usd,50,20,0.062121,0.970389,,92.0,0.0,92.0,619.0
1188,bitfinex_coinbase_pro_bch_usd,55,20,0.0019,0.227273,0.97201,90.0,22.0,68.0,27835.0
938,bitfinex_hitbtc_dash_btc,55,27,0.002006,0.994142,,85.0,0.0,85.0,67651.0
612,bitfinex_coinbase_pro_bch_btc,50,25,0.000947,0.706897,,68.0,0.0,68.0,28497.0


In [33]:
# NOTE(review): `pct_wrong_0_dict` is not defined in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel (see the recorded
# output below). It also rebinds the notebook-level names `columns` and `df`.
# Presumably a leftover experiment - confirm, then remove it or point it at the
# intended feature dictionary.
columns = ['pct_wrong_0', 'pct_wrong_1', 'pct_wrong_neg1', 'correct_arb', 
                  'correct_arb_neg1', 'correct_arb_1', 'correct_arb_0']
df = pd.DataFrame(pct_wrong_0_dict).transpose().reset_index()

df = df.rename(columns = {'index': 'model_name', 0:'pct_wrong_0', 1: 'pct_wrong_1', 
                     2: 'pct_wrong_neg1', 3: 'correct_arb', 4: 'correct_arb_neg1', 
                     5: 'correct_arb_1', 6: 'correct_arb_0'})
df

NameError: name 'pct_wrong_0_dict' is not defined

## function for creating df with pnl and confusion matrix features 

In [155]:
def model_confusion(df, confusion_dict, max_pct_wrong_0=0.15, min_correct_arb=25,
                    min_pct_profit_mean=0.2):
    """Combine profit figures with confusion-matrix features and filter models.

    Each row of `df` is matched to an entry of `confusion_dict` through the key
    '<ex_tp>_<max_features>_<max_depth>', extended by '_<n_estimators>' when
    `df` has an 'n_estimators' column. For every matched confusion matrix the
    following features are computed and merged onto the row:

    * correct_arb - correct_arb_neg1 + correct_arb_1;
    * pct_wrong_0 / pct_wrong_1 / pct_wrong_neg1 - share of the predictions of
      that class that were wrong (NaN when the class has no column in the
      matrix, or when its column sums to zero);
    * correct_arb_neg1 / correct_arb_1 / correct_arb_0 - number of correct
      predictions of that class.

    When a matrix has neither a 'Predicted 1' nor a 'Predicted -1' column all
    seven features are reported as 0.

    The merged table is filtered on the three thresholds and sorted by
    'correct_arb' (descending). The shape after every step is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Model performance table with the columns 'ex_tp', 'max_features',
        'max_depth' and 'pct_profit_mean', and optionally 'n_estimators'.
        It is not modified.
    confusion_dict : dict
        Maps model name to a confusion matrix serialised as JSON, with columns
        named 'Predicted -1', 'Predicted 0' and/or 'Predicted 1'.
        NOTE(review): the values are assumed to be JSON text (not file paths),
        and each matrix is assumed to be square with integer row labels
        0..k-1 following the sorted class labels present - confirm against the
        code that writes the confusion_json files.
    max_pct_wrong_0 : float, default 0.15
        Keep only models with pct_wrong_0 strictly below this value.
    min_correct_arb : float, default 25
        Keep only models with correct_arb strictly above this value.
    min_pct_profit_mean : float, default 0.2
        Keep only models with pct_profit_mean strictly above this value.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The filtered and sorted table, and the same table reduced to the first
        (highest 'correct_arb') row per 'ex_tp'.

    Raises
    ------
    KeyError
        If a model name built from `df` is not a key of `confusion_dict`.
    """
    feature_names = ['correct_arb', 'pct_wrong_0', 'pct_wrong_1', 'pct_wrong_neg1',
                     'correct_arb_neg1', 'correct_arb_1', 'correct_arb_0']

    def _confusion_features(conf_mat):
        # compute the seven features of a single confusion matrix
        has_neg = 'Predicted -1' in conf_mat.columns
        has_pos = 'Predicted 1' in conf_mat.columns

        if not (has_neg or has_pos):
            # only class 0 was ever involved: nothing to measure
            return dict.fromkeys(feature_names, 0)

        # row i of the matrix holds the i-th class label that is present
        labels = [label for label, present
                  in ((-1, has_neg), (0, True), (1, has_pos)) if present]
        pct_wrong = {-1: np.nan, 0: np.nan, 1: np.nan}
        correct = {-1: 0, 0: 0, 1: 0}
        for row, label in enumerate(labels):
            predicted = conf_mat['Predicted {}'.format(label)]
            correct[label] = predicted.loc[row]
            # everything in the column that is off the diagonal is a wrong prediction
            pct_wrong[label] = (predicted.sum() - correct[label]) / predicted.sum()

        return {'pct_wrong_0': pct_wrong[0],
                'pct_wrong_1': pct_wrong[1],
                'pct_wrong_neg1': pct_wrong[-1],
                'correct_arb': correct[-1] + correct[1],
                'correct_arb_neg1': correct[-1],
                'correct_arb_1': correct[1],
                'correct_arb_0': correct[0]}

    # create a copy of df to not overwrite original
    df = df.copy()

    # columns whose values make up the model name; n_estimators is only part
    # of the name when the performance table actually carries that column
    name_columns = ['ex_tp', 'max_features', 'max_depth']
    if 'n_estimators' in df.columns:
        name_columns.append('n_estimators')

    feature_dict = {}
    model_name_list = []

    # iterate through all models
    for name_parts in zip(*[df[column] for column in name_columns]):
        model_name = '_'.join(str(part) for part in name_parts)
        model_name_list.append(model_name)

        # read_json needs a file-like object: literal JSON strings are deprecated
        conf_mat = pd.read_json(StringIO(confusion_dict[model_name]))
        features = _confusion_features(conf_mat)
        feature_dict[model_name] = [features[name] for name in feature_names]

    # one row per model, one column per feature
    rename_map = {'index': 'model_name'}
    rename_map.update(enumerate(feature_names))
    df2 = pd.DataFrame(feature_dict).transpose().reset_index()
    df2 = df2.rename(columns=rename_map)

    # merge new features with performance df via a temporary key
    df['model_name'] = model_name_list
    print(df.shape, df2.shape)
    df = df.merge(df2, on='model_name').drop(columns='model_name')
    print('shape after merge:', df.shape)

    # keep models whose share of wrong 'Predicted 0' predictions is below the limit
    df = df[df['pct_wrong_0'] < max_pct_wrong_0]
    print('shape after filetering pct_wrong_0:', df.shape)

    # keep models with enough correct arbitrage predictions
    df = df[df['correct_arb'] > min_correct_arb]
    print('shape after filtering correct_arb:', df.shape)

    # keep models that are profitable enough on average
    df = df[df['pct_profit_mean'] > min_pct_profit_mean]
    print('shape after filtering pct_profit_mean:', df.shape)

    # best (highest correct_arb) hyper-parameter combination per exchange/trading pair
    df = df.sort_values(by=['correct_arb'], ascending=False)
    df2 = df.drop_duplicates(subset='ex_tp')
    print('shape after droping duplicates:', df2.shape)

    return df, df2

In [157]:
# df: every model that passes the filters; df2: the best row per exchange/trading pair
df, df2 = model_confusion(perf_df, confusion_dict)
df2

(76, 6) (76, 8)
shape after merge: (76, 12)
shape after filetering pct_wrong_0: (67, 12)
shape after filtering correct_arb: (15, 12)
shape after filtering pct_profit_mean: (10, 12)
shape after droping duplicates: (10, 12)


Unnamed: 0,ex_tp,max_features,max_depth,pct_profit_mean,pct_profit_median,correct_arb,pct_wrong_0,pct_wrong_1,pct_wrong_neg1,correct_arb_neg1,correct_arb_1,correct_arb_0
7,bitfinex_coinbase_pro_ltc_usd,40,25,1.80214,1.779046,40429.0,0.09569,0.275641,0.195514,40316.0,113.0,44483.0
24,bitfinex_hitbtc_ltc_usdt,60,17,0.377841,0.331202,486.0,0.098023,0.446429,0.566038,207.0,279.0,13370.0
6,bitfinex_coinbase_pro_etc_usd,45,17,1.815758,1.108093,412.0,0.014133,0.201183,0.927083,7.0,405.0,35994.0
20,coinbase_pro_gemini_ltc_btc,50,27,0.602781,0.700844,187.0,0.131705,0.410714,0.379032,154.0,33.0,9810.0
28,bitfinex_gemini_ltc_btc,50,20,0.316826,0.135157,151.0,0.140039,0.614035,0.540773,107.0,44.0,9678.0
23,gemini_hitbtc_ltc_btc,55,17,0.477836,0.479268,123.0,0.134258,0.404255,0.388889,11.0,112.0,9866.0
3,bitfinex_coinbase_pro_bch_usd,40,17,2.722728,0.153628,78.0,0.002246,0.180723,0.944134,10.0,68.0,28436.0
22,bitfinex_hitbtc_eos_usdt,40,17,0.592566,0.496813,63.0,0.046868,0.52,0.833333,3.0,60.0,14927.0
2,bitfinex_coinbase_pro_bch_btc,40,27,8.762994,10.788925,57.0,0.001325,0.22973,,0.0,57.0,28644.0
1,coinbase_pro_hitbtc_bch_btc,60,17,9.999205,11.869496,54.0,0.001069,,0.052632,54.0,0.0,28975.0


In [158]:
# display the full performance table for reference
perf_df

Unnamed: 0,ex_tp,max_features,max_depth,pct_profit_mean,pct_profit_median
0,bitfinex_coinbase_pro_xrp_usd,50,20,11.721898,11.721898
0,coinbase_pro_hitbtc_bch_btc,60,17,9.999205,11.869496
1,bitfinex_coinbase_pro_bch_btc,40,27,8.762994,10.788925
0,bitfinex_coinbase_pro_bch_usd,40,17,2.722728,0.153628
0,bitfinex_coinbase_pro_btc_usd,40,20,2.490334,2.051365
0,bitfinex_coinbase_pro_eth_usd,60,17,1.875132,0.795566
2,bitfinex_coinbase_pro_etc_usd,45,17,1.815758,1.108093
1,bitfinex_coinbase_pro_ltc_usd,40,25,1.80214,1.779046
1,coinbase_pro_hitbtc_eth_btc,40,27,1.602466,1.602466
1,bitfinex_hitbtc_eth_btc,60,27,1.590873,1.890326
