In [7]:
import pandas as pd
import numpy as np
from scipy import signal, stats
from plotly import express as plx
import json
import csv

from sklearn import linear_model

In [8]:
# !. ../secrets.sh && cd '../build' && ./scout_exe

In [9]:
# Relative paths to the Coinbase ticker and order-book files.
COINBASE_TICKERS = '../data/coinbase_tickers.txt'
COINBASE_BOOKS = '../data/coinbase_books.txt'

# Relative paths to the Kraken ticker and order-book files.
KRAKEN_TICKERS = '../data/kraken_tickers.txt'
KRAKEN_BOOKS = '../data/kraken_books.txt'


# Number of best price levels kept when aggregating one side of a book.
DEPTH = 100
# Divisor applied to stored prices before plotting.
# NOTE(review): presumably the fixed-point factor used when the data was recorded — confirm.
SCALE = int(1e8)

Explore the lag within the books, as they update much faster than the tickers do.

In [10]:
# --- Coinbase: read the raw book rows and split them by side ---
cb_book = pd.read_csv(COINBASE_BOOKS, header=None)
cb_book.columns = ['ticker', 'timestamp', 'type', 'price', 'quantity']
cb_bids = cb_book.loc[cb_book['type'] == 'bid'].drop(columns='type')
# Coinbase rows label the sell side 'offer'
cb_asks = cb_book.loc[cb_book['type'] == 'offer'].drop(columns='type')

# --- Kraken: same layout; rows stamped with timestamp 0 are discarded ---
kk_book = pd.read_csv(KRAKEN_BOOKS, header=None)
kk_book.columns = ['ticker', 'timestamp', 'type', 'price', 'quantity']
kk_book = kk_book.loc[kk_book['timestamp'] != 0]
kk_bids = kk_book.loc[kk_book['type'] == 'bid'].drop(columns='type')
# Kraken rows label the sell side 'ask'
kk_asks = kk_book.loc[kk_book['type'] == 'ask'].drop(columns='type')

# Position of every book side inside `books` (bids at even, asks at odd positions).
signal_ind = {'cb_bids': 0, 'cb_asks': 1, 'kk_bids': 2, 'kk_asks': 3}
books = [cb_bids, cb_asks, kk_bids, kk_asks]

# Report how many rows each side contains.
for side_name, position in signal_ind.items():
    print("size of " + side_name, len(books[position]))

size of cb_bids 76630
size of cb_asks 88000
size of kk_bids 99656
size of kk_asks 93367


In [5]:
# use to test code so it doesn't take forever to run:
# keep only the first 2000 rows of every book side
for position in signal_ind.values():
    books[position] = books[position].iloc[:2000]
books

[         ticker      timestamp     price       quantity
 0     MATIC-USD  1735196588705  49500000     2920000000
 1     MATIC-USD  1735196588705  49490000   101010000000
 2     MATIC-USD  1735196588705  49480000   202090000000
 3     MATIC-USD  1735196588705  49470000   188940000000
 4     MATIC-USD  1735196588705  49460000   505380000000
 ...         ...            ...       ...            ...
 6368  MATIC-USD  1735196749729  48170000   587690000000
 6369  MATIC-USD  1735196749729  48150000              0
 6371  MATIC-USD  1735196750429  48480000   100390000000
 6372  MATIC-USD  1735196750429  48470000      290000000
 6375  MATIC-USD  1735196750479  48180000  2071570000000
 
 [2000 rows x 4 columns],
          ticker      timestamp      price       quantity
 813   MATIC-USD  1735196588705   49510000   103450000000
 814   MATIC-USD  1735196588705   49520000   906500000000
 815   MATIC-USD  1735196588705   49530000   206519999999
 816   MATIC-USD  1735196588705   49550000  100917000000

In [11]:
def _book_signals(updates, is_bid, depth):
    """Replay one side's book updates and summarise the book after every timestamp.

    Each row of ``updates`` sets the quantity resting at ``price`` as of
    ``timestamp``; a quantity of 0 removes that price level. Levels an update
    does not mention keep their previous quantity. (The earlier merge-based
    version kept only the levels named in the latest update and let the stale
    quantity win for levels that already existed.)

    Parameters
    ----------
    updates : pandas.DataFrame
        Needs 'timestamp', 'price' and 'quantity' columns.
    is_bid : bool
        True ranks the highest price first, False the lowest price first.
    depth : int
        Number of best levels that feed the aggregate statistics.

    Returns
    -------
    tuple of three pandas.DataFrame
        ``best``  : timestamp, price, quantity of the best level.
        ``wavg``  : timestamp, quantity-weighted price and total_volume over
                    the best ``depth`` levels.
        ``price`` : timestamp, std (population), median (middle element of the
                    ranked prices) and spread (max - min) of those prices.
        Timestamps at which the book is empty produce no row.
    """
    levels = {}  # price -> resting quantity
    best_rows, wavg_rows, price_rows = [], [], []

    # groupby visits timestamps in ascending order and keeps file order within each one
    for ts, delta in updates.groupby('timestamp', sort=True):
        for price, quantity in zip(delta['price'].tolist(), delta['quantity'].tolist()):
            if quantity == 0:
                levels.pop(price, None)
            else:
                levels[price] = quantity

        if not levels:
            continue

        ranked = sorted(levels, reverse=is_bid)[:depth]
        # float64 on purpose: price * quantity of the scaled integers can exceed the int64 range
        prices = np.asarray(ranked, dtype='float64')
        quantities = np.asarray([levels[p] for p in ranked], dtype='float64')
        total_volume = quantities.sum()

        best_rows.append((ts, ranked[0], levels[ranked[0]]))
        wavg_rows.append((ts, (prices * quantities).sum() / total_volume, total_volume))
        price_rows.append((ts,
                           prices.std(),
                           prices[len(prices) // 2],
                           prices.max() - prices.min()))

    # Build each frame once instead of appending row by row.
    return (pd.DataFrame(best_rows, columns=['timestamp', 'price', 'quantity']),
            pd.DataFrame(wavg_rows, columns=['timestamp', 'price', 'total_volume']),
            pd.DataFrame(price_rows, columns=['timestamp', 'std', 'median', 'spread']))


# One frame per book side, stored at the position given by signal_ind.
best_signal = [pd.DataFrame(columns=['timestamp', 'price', 'quantity']) for x in signal_ind.keys()]
wavg_signal = [pd.DataFrame(columns=['timestamp', 'price', 'total_volume']) for x in signal_ind.keys()]
price_signal = [pd.DataFrame(columns=['timestamp', 'std', 'median', 'spread']) for x in signal_ind.keys()]

for sig, ii in signal_ind.items():
    # bids sit at even positions of `books`, asks at odd positions
    is_bid = ii % 2 == 0
    best_signal[ii], wavg_signal[ii], price_signal[ii] = _book_signals(books[ii], is_bid, DEPTH)

    # keep only rows with a strictly positive weighted-average price
    wavg_signal[ii] = wavg_signal[ii][wavg_signal[ii]['price'] > 0]
# best_signal
# wavg_signal
# price_signal

KeyboardInterrupt: 

In [51]:
def ffill_df(dfs, sigkeys):
    """Forward-fill each signal onto a 1 ms grid and join them on shared times.

    Parameters
    ----------
    dfs : sequence of pandas.DataFrame
        One frame per signal; each needs a 'timestamp' column holding epoch
        milliseconds. The frames are not modified.
    sigkeys : sequence of str
        One name per frame; every value column is suffixed with '_<name>'.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'date' (datetime, 1 ms steps), restricted to the instants
        covered by every signal, with one '<column>_<name>' column per input
        column plus 'timestamp': seconds elapsed since the first row.

    Raises
    ------
    ValueError
        If no frames are supplied.
    """
    combined = None
    for key, df in zip(sigkeys, dfs):
        # drop() returns a new frame, so re-indexing it leaves the caller's frame untouched
        frame = df.drop(columns=['timestamp'])
        frame.index = pd.to_datetime(df['timestamp'], unit='ms')
        frame.index.name = 'date'
        frame = frame.resample('1ms').ffill().add_suffix('_' + key)

        if combined is None:
            combined = frame
        else:
            combined = pd.merge(combined, frame, left_index=True, right_index=True, how='inner')

    if combined is None:
        raise ValueError('ffill_df needs at least one signal frame')

    # Elapsed seconds from the index itself, independent of its datetime resolution.
    elapsed = combined.index - combined.index.min()
    combined['timestamp'] = np.asarray(elapsed.total_seconds())
    return combined


# Forward-fill every signal family onto the shared grid, then cache the results as CSV.
signal_names = list(signal_ind)
best_signal_ffilled, wavg_signal_ffilled, price_signal_ffilled = (
    ffill_df(frames, signal_names)
    for frames in (best_signal, wavg_signal, price_signal)
)

for ffilled_frame, csv_name in (
    (best_signal_ffilled, 'best_signal_ffilled.csv'),
    (wavg_signal_ffilled, 'wavg_signal_ffilled.csv'),
    (price_signal_ffilled, 'price_signal_ffilled.csv'),
):
    ffilled_frame.to_csv(csv_name)

In [None]:
# Reload the cached forward-filled signals from the CSV files in the working directory.
best_signal_ffilled, wavg_signal_ffilled, price_signal_ffilled = (
    pd.read_csv(csv_name)
    for csv_name in ('best_signal_ffilled.csv',
                     'wavg_signal_ffilled.csv',
                     'price_signal_ffilled.csv')
)

In [None]:
## TODO: EVERYTHING ABOVE THIS NEEDS TO BE OPTIMIZED CUZ RN IT CAN'T RUN!!!

# TODO: indicators

 best-ask to best-bid price spread
 ask-bid imbalance (ask_vol - bid_vol) / (ask_vol + bid_vol)

##### for MA, do for both best (best_signal_ffilled) and weighted avg (wavg_signal_ffilled)
- MA (3 ms)
- MA (10 ms)
- MA (100 ms)
- MA (1000 ms)
- EMA (alpha = 0.01)
- EMA (alpha = 0.05)
- EMA (alpha = 0.10)
- EMA (alpha = 0.33)
- EMA (alpha = 0.67)
- EMA (alpha = 0.90)

- kalman filter

*also I have a few ideas for slope interpolation of weighted avg to patch in gaps in noisy best_price, but we will ignore for now*

# Model pseudo code

next, code up logistic regression to tell us if we should trade or not
`LOGISTIC(indicators) => y(t)`
or `LINEAR(indicators) => x(t), calculate z scores => y(t)`

    x(T) = statistically significant signals at timestep T (forward filled)
    x(T) is statistically significant at T if X(T) > MEAN + Z*STD  
    where MEAN & STD come from X(T-1), X(T-2), ... X(T-N)

Y = `COINBASE_BEST_ASK(T+(LAG))` < the `KRAKEN_BEST_BID within (T+(LAG), T+(LAG)+(HOLDTIME)]`

then optimize `HOLDTIME` and `Z` & `N` with the objective function (i'm thinking gridsearch to keep it simple)

    OBJ = 0
    WE_TRADE = Y(T) >= 0.5
    if(WE_TRADE): 
      check if ask price @ T+LAG > bid price T1 @ T+LAG to T+LAG+HOLDTIME:
          if so, (OBJ += (ASK_PRICE(T+LAG) - BID_PRICE(T+LAG+T1)))
          if not, (OBJ += ASK_PRICE(T+LAG) - BID_PRICE(T+LAG+T1))

    MAX OBJ W.R.T `HOLDTIME` and `Z` & `N`




In [None]:
# ID-ing significant signals, ideally use logistic regression for this part
window_size = 30  # law of large numbers
z_th = 1

# ffill_df names its columns '<column>_<signal>', so the Coinbase ask
# weighted-average price is 'price_cb_asks' (there is no plain 'price' column).
cb_ask_wavg = wavg_signal_ffilled['price_cb_asks']

X1 = cb_ask_wavg.rolling(window_size)
# Rolling.mean / Rolling.std have no `skipna` parameter, so it is not passed.
mu = X1.mean()
std = X1.std()

# z-score of the newest sample within each full window. raw=True hands the
# window over as a NumPy array, so [-1] is a plain positional lookup; rolling
# only calls the function once a window holds window_size valid samples.
zscores = X1.apply(lambda x: stats.zscore(x)[-1], raw=True)

sig = zscores > z_th # peaks
peaks = cb_ask_wavg[sig]


In [None]:

HOLDTIME = 100 # 100 ms
LAG = 100 # 100 ms

X = peaks.iloc[:-(HOLDTIME+LAG)]

# ffill_df names its columns '<column>_<signal>': Kraken's best bid is 'price_kk_bids'.
kk_bid = best_signal_ffilled['price_kk_bids']

# Highest Kraken bid inside every full window of HOLDTIME + LAG samples.
# max() has to run on the rolling object; dropna() then removes the warm-up rows.
peakprice = kk_bid.rolling(HOLDTIME + LAG, min_periods=(HOLDTIME+LAG)).max().dropna()
finalprice = kk_bid.iloc[HOLDTIME+LAG:]

# The two series carry different labels, so restrict both to the labels they share:
# each window's peak is compared with the bid at that window's last sample.
# NOTE(review): confirm this pairing is the intended one.
peakprice, finalprice = peakprice.align(finalprice, join='inner')

# share of windows whose peak bid exceeds the bid at the end of the window
trade_ratio = (peakprice > finalprice).mean()
trade_ratio


Make the following two cells into a method with parameters `HOLDTIME` and `LAG`.

In [None]:
from matplotlib import pyplot as plt

# Divide the stored prices by SCALE, as the plotly cells do, so that the
# curves actually fall inside the 0-1 y-range set below.
fig, ax = plt.subplots()
ax.plot(best_signal_ffilled['price_cb_asks'] / SCALE, label='Coinbase best ask')
ax.plot(best_signal_ffilled['price_cb_bids'] / SCALE, label='Coinbase best bid')
ax.set_ylim(0, 1)
ax.set_ylabel('price')
ax.set_title('Coinbase best ask vs. best bid')
ax.legend();

In [None]:
# plot ask lag: best ask of both exchanges on a shared time axis
import plotly.graph_objects as go

fig = go.Figure()
for exchange_label, prefix in (('Coinbase', 'cb'), ('Kraken', 'kk')):
    ask_prices = best_signal_ffilled['price_' + prefix + '_asks'] / SCALE
    fig.add_trace(
        go.Scatter(
            x=best_signal_ffilled['timestamp'],
            y=ask_prices,
            mode='lines',
            name=exchange_label,
        )
    )
fig.show()

In [None]:
# plot bids: best bid of both exchanges on a shared time axis
import plotly.graph_objects as go

fig = go.Figure()
for exchange_label, prefix in (('Coinbase', 'cb'), ('Kraken', 'kk')):
    bid_prices = best_signal_ffilled['price_' + prefix + '_bids'] / SCALE
    fig.add_trace(
        go.Scatter(
            x=best_signal_ffilled['timestamp'],
            y=bid_prices,
            mode='lines',
            name=exchange_label,
        )
    )
fig.show()

In [None]:

# Plot the mid-point price of both exchanges on a shared time axis.
# NOTE(review): midpt_signal is not built in any cell visible here — confirm
# it is defined before this cell runs.
import plotly.graph_objects as go

fig = go.Figure()
for exchange_label, prefix in (('Coinbase', 'cb'), ('Kraken', 'kk')):
    midpoint_prices = midpt_signal['midpt_' + prefix] / SCALE
    fig.add_trace(
        go.Scatter(
            x=midpt_signal['timestamp'],
            y=midpoint_prices,
            mode='lines',
            name=exchange_label,
        )
    )
fig.show()

In [109]:
def max_delayed_crosscorrelation(s1, s2):
    """Locate the peak of the full cross-correlation of two 1-D signals.

    Parameters
    ----------
    s1, s2 : array-like
        One-dimensional signals; they may differ in length.

    Returns
    -------
    (max_corr_value, delay)
        ``max_corr_value`` is the largest value of the raw (un-normalised)
        cross-correlation. ``delay`` is the lag in samples at which it occurs:
        a positive delay pairs ``s1[n + delay]`` with ``s2[n]``, i.e. ``s1``
        trails ``s2`` by that many samples.
    """
    correlation = signal.correlate(s1, s2, mode='full', method='auto')

    max_corr_index = np.argmax(correlation)
    max_corr_value = correlation[max_corr_index]

    # In 'full' mode the lags run from -(len(s2) - 1) to len(s1) - 1, so lag 0
    # sits at index len(s2) - 1. Using len(s1) here is only right when both
    # inputs have the same length.
    delay = max_corr_index - (len(s2) - 1)

    return max_corr_value, delay

In [None]:
# Peak cross-correlation value and lag between Coinbase and Kraken, one series pair at a time.
asks_xcorr = max_delayed_crosscorrelation(
    best_signal_ffilled['price_cb_asks'],
    best_signal_ffilled['price_kk_asks'],
)
print('asks', asks_xcorr)

bids_xcorr = max_delayed_crosscorrelation(
    best_signal_ffilled['price_cb_bids'],
    best_signal_ffilled['price_kk_bids'],
)
print('bids', bids_xcorr)

midpt_xcorr = max_delayed_crosscorrelation(
    midpt_signal['midpt_cb'],
    midpt_signal['midpt_kk'],
)
print('midpt', midpt_xcorr)