In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import cm
import json
from scipy.stats import kurtosis, skew, ks_2samp
from statsmodels.tsa.stattools import acf
turquoise_datetime_format = "%d-%b-%Y %H:%M:%S.%f"

## Functions

In [8]:
def get_synthetic_market_data(data_path, tick_size=.02):
    """Load one simulation run's level-1 book, orders and trades from CSV.

    Reads ``lob_l1.csv``, ``orders.csv`` and ``trades.csv`` from ``data_path``.
    Each file is indexed by its ``time`` column, parsed with the module-level
    ``turquoise_datetime_format``.

    Parameters
    ----------
    data_path : str
        Directory that contains the three CSV files.
    tick_size : float, default .02
        Price increment used to express the spread and order depth in ticks.

    Returns
    -------
    dict
        ``l1B`` / ``l1S`` : level-1 rows whose ``side`` is 'B' / 'S'.
        ``mp``            : mean of the two sides' ``prc`` (aligned on time).
        ``spread``        : ``l1S.prc - l1B.prc`` in whole ticks; gaps are
                            forward- then back-filled before rounding.
        ``orders``        : orders with an added ``depth`` column (in ticks).
        ``trades``        : trades frame.
        ``a0`` / ``b0``   : ``l1S.prc`` / ``l1B.prc`` shifted by one row.
    """
    def _load(name):
        # All three files share the same layout: a 'time' column to index on.
        frame = pd.read_csv('{0}/{1}.csv'.format(data_path, name)).set_index('time')
        frame.index = pd.to_datetime(frame.index, format=turquoise_datetime_format)
        return frame

    df_lob_l1 = _load('lob_l1')
    l1B = df_lob_l1[df_lob_l1['side'] == 'B']
    l1S = df_lob_l1[df_lob_l1['side'] == 'S']

    mp = .5 * l1B['prc'] + .5 * l1S['prc']
    spread = l1S['prc'] - l1B['prc']
    # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
    spread = np.round(spread.ffill().bfill() / tick_size).dropna().astype(int)

    orders = _load('orders')

    # Previous-row level-1 prices, looked up at each order's timestamp.
    b0 = l1B['prc'].shift(1)
    a0 = l1S['prc'].shift(1)
    depth_b = np.round((b0.reindex(orders.index) - orders['prc']) / tick_size)
    depth_a = np.round((orders['prc'] - a0.reindex(orders.index)) / tick_size)

    # Pick the depth per side instead of summing mask * depth: in the sum,
    # False * NaN is NaN, so a missing quote on the *other* side would wipe
    # out an otherwise valid depth. Rows that are neither 'B' nor 'S' get 0.
    orders['depth'] = np.select(
        [(orders['side'] == 'B').values, (orders['side'] == 'S').values],
        [depth_b.values, depth_a.values],
        default=0.0,
    )

    trades = _load('trades')
    md = {'l1B': l1B, 'l1S': l1S, 'mp': mp, 'spread': spread, 'orders': orders, 'trades': trades, 'a0': a0, 'b0': b0}
    return md

def get_fv_mv_value(data_path):
    """Load the fundamental-value and momentum-value series of one run.

    Reads ``fundamental_value.csv`` and ``momentum_value.csv`` from
    ``data_path``. Each is indexed by its ``timestamp`` column, parsed with
    the module-level ``turquoise_datetime_format`` and floored to the second.

    Parameters
    ----------
    data_path : str
        Directory that contains the two CSV files.

    Returns
    -------
    tuple
        ``(fv, mv)``, each the result of ``DataFrame.squeeze()`` on the
        corresponding file.
    """
    def _load_floored(name):
        frame = pd.read_csv('{0}/{1}.csv'.format(data_path, name)).set_index('timestamp')
        # Lower-case 's' is the non-deprecated alias for seconds (pandas >= 2.2
        # warns on 'S'); older pandas versions accept it as well.
        frame.index = pd.to_datetime(frame.index, format=turquoise_datetime_format).floor('s')
        return frame.squeeze()

    fv = _load_floored('fundamental_value')
    mv = _load_floored('momentum_value')
    return fv, mv

# Distance Calculation
# Volatility
def sec_volatility_diff(ret1, ret2):
    """Return the absolute gap between the ``.std()`` of two return samples.

    Both arguments only need to expose a ``.std()`` method (e.g. a NumPy
    array or a pandas Series); whatever that method returns is compared.
    """
    sigma_1 = ret1.std()
    sigma_2 = ret2.std()
    return np.abs(sigma_1 - sigma_2)
# Autocorrelation difference
# For the ACF of raw (first-order) returns, use a small number of lags
def acf_diff(data1, data2, nlags):
    """Average absolute gap between the autocorrelation functions of two series.

    Both ACFs are computed with ``acf(..., nlags=nlags, fft=True)``. The sum
    of absolute differences is divided by one less than the number of ACF
    entries returned.
    """
    gap = acf(data1, nlags=nlags, fft=True) - acf(data2, nlags=nlags, fft=True)
    total_abs_gap = np.abs(gap).sum()
    return total_abs_gap / (gap.size - 1)

def compute_distance(simulated, historical, ret1_lag=5, ret2_lag=20, bs_lag=10, mo_lag=10):
    """Compare a simulated and a historical data set on six statistics.

    Parameters
    ----------
    simulated, historical : mapping
        Each must provide the keys ``'ret'``, ``'bs_series'`` and
        ``'mo_series'``.
    ret1_lag, ret2_lag, bs_lag, mo_lag : int
        Number of lags used for the ACF comparison of, respectively, returns,
        squared returns, ``bs_series`` and ``mo_series``.

    Returns
    -------
    tuple
        ``(vol_diff, cdf_ks, ret1_acf_diff, ret2_acf_diff, bs_acf_diff,
        mo_acf_diff)`` in that order.
    """
    sim_ret = simulated['ret']
    hist_ret = historical['ret']

    return (
        # Gap between return standard deviations.
        sec_volatility_diff(sim_ret, hist_ret),
        # Two-sample Kolmogorov-Smirnov statistic on the return samples.
        ks_2samp(sim_ret, hist_ret).statistic,
        # ACF gaps: returns, squared returns, then the two order-sign series.
        acf_diff(sim_ret, hist_ret, nlags=ret1_lag),
        acf_diff(sim_ret**2, hist_ret**2, nlags=ret2_lag),
        acf_diff(simulated['bs_series'], historical['bs_series'], nlags=bs_lag),
        acf_diff(simulated['mo_series'], historical['mo_series'], nlags=mo_lag),
    )


## High-level config

In [9]:
# Instrument symbol and venue code passed to the historical data loader.
symbol, mic = 'VODl', 'XLON'
# Price increment used to express spreads and depths in ticks.
tick_size = .02
# Day under study (YYYYMMDD) and the intraday window that is analysed.
datestr = '20210128'
start, end = (
    pd.to_datetime('{0} {1}'.format(datestr, clock))
    for clock in ('08:02:00', '16:29:00')
)
print(datestr)

20210128


## Historical data

In [10]:
from analytics.trqx.turquoise_exchange import get_trades, get_orders, get_market_data
# NOTE(review): machine-specific absolute path; consider sourcing it from a
# config cell or an environment variable so the notebook runs elsewhere.
cloud_path = "D:/Simudyne Limited/Data - LSE Market Data"
lit_path = '{0}/{1}'.format(cloud_path,  'L2/data-x.londonstockexchange.com/data-x/TRQX')
lob_path = '{0}/{1}'.format(cloud_path,  'LOB')
md = get_market_data(lit_path, lob_path, symbol, mic, pd.to_datetime(datestr), tick_size, include_l2=True)

# Level-1 book: summed depth and rounded prices of the two sides.
l1 = md['l1']
h_depth = l1[l1['side'] == 'S'].depth + l1[l1['side'] == 'B'].depth
b0 = l1[l1['side'] == 'B'].prc.round(2)
a0 = l1[l1['side'] == 'S'].prc.round(2)

# Mid price inside the analysis window, then the last value per bin with empty
# bins forward-filled. raw_mid holds no NaN after dropna(), so .last() equals
# the former positional `i[-1]` lookup without the deprecated integer-key
# access; .ffill() and the '1s' alias replace deprecated spellings.
raw_mid = ((a0 + b0) / 2.).dropna().loc[start:end]
min_mid = raw_mid.resample('1min').last().ffill().dropna()
mid = raw_mid.resample('1s').last().ffill().dropna()
# One-second simple returns, in basis points.
ret_h = (mid.values[1:] / mid.values[:-1] - 1) * 10000

# Market-order sign: +1 for 'Buy', -1 otherwise.
market_orders = md['market_orders']
mo_sign = market_orders['SIDE'].apply(lambda x: 1 if x=='Buy' else -1)

# Order-flow pressure from inserts and cancels: +1 for a buy insert or a
# non-buy cancel, -1 for the other two combinations.
h_orders = md['orders']
real_orders = h_orders.dropna().copy()
# Whether to also include 'M' here is an open question.
# .copy() so the column assignments below write to an independent frame.
real_orders = real_orders[(real_orders['EXECTYPE']=='Insert') | (real_orders['EXECTYPE']=='Cancel')].copy()
real_orders['tmp1'] = real_orders['SIDE'].apply(lambda x: 1 if x=='Buy' else -1)
real_orders['tmp2'] = real_orders['EXECTYPE'].apply(lambda x: -1 if x=='Cancel' else 1)
real_orders['pressure'] = real_orders['tmp1'] * real_orders['tmp2']
real_p_series = real_orders['pressure']
historical = {'ret': ret_h, 'bs_series': real_p_series, 'mo_series': mo_sign}

## Simulated data

In [14]:
# Results directory of the first simulation run; the batch loop further down
# rebinds `data_path` once per run.
data_path = 'MultipleRun/Results/run_00000'
def get_simulated(data_path, tick_size, start, end):
    """Build the comparison inputs for one simulation run.

    Parameters
    ----------
    data_path : str
        Run directory handed to ``get_synthetic_market_data``.
    tick_size : float
        Tick size forwarded to ``get_synthetic_market_data``.
    start, end : datetime-like
        Inclusive bounds of the mid-price window used for the returns.

    Returns
    -------
    dict
        ``'ret'``       : one-second simple mid-price returns in basis points
                          (NumPy array).
        ``'bs_series'`` : order-flow pressure of every order whose ``OrdType``
                          is not 'M'.
        ``'mo_series'`` : order-flow pressure of the 'M' orders.

    Notes
    -----
    Pressure is ``side_sign * action_sign``: ``side_sign`` is +1 for side 'B'
    and -1 otherwise; ``action_sign`` is -1 for ``OrdType`` 'C' and +1
    otherwise. It is therefore always +1 or -1. The fundamental-value
    distortion (``mp - fv`` via ``get_fv_mv_value``) is not computed here.
    """
    md = get_synthetic_market_data(data_path, tick_size=tick_size)

    # Last mid price of each second inside the window; empty seconds are
    # forward-filled. The series has no NaN after dropna(), so .last() equals
    # the positional last element without the deprecated integer-key lookup.
    raw_mp = md['mp'].dropna().loc[start:end]
    mp = raw_mp.resample('1s').last().ffill().dropna()
    ret_s = (mp.values[1:] / mp.values[:-1] - 1) * 10000

    # Vectorized signs instead of row-wise apply.
    orders = md['orders']
    side_sign = np.where(orders['side'] == 'B', 1, -1)
    action_sign = np.where(orders['OrdType'] == 'C', -1, 1)
    pressure = pd.Series(side_sign * action_sign, index=orders.index, name='pressure')

    # Split the pressure series into non-market orders and market orders.
    is_market = (orders['OrdType'] == 'M').values
    p_series = pressure[~is_market]
    mo_series_s = pressure[is_market]

    simulated = {'ret': ret_s, 'bs_series': p_series, 'mo_series': mo_series_s}
    return simulated

In [15]:
# Score each simulation run against the historical day and print the six
# component distances followed by their sum.
for i in range(10):
    # Zero-pad the run index to five digits so the name stays well-formed
    # beyond single-digit run numbers (run_00000 ... run_00009, run_00010, ...).
    data_path = 'MultipleRun/Results/run_{0:05d}'.format(i)
    print(data_path)
    simulated = get_simulated(data_path, tick_size, start, end)
    dist = compute_distance(simulated, historical, ret1_lag=3, ret2_lag=10, bs_lag=5, mo_lag=10)
    vol_diff, cdf_ks_dist, ret1_acf_diff, ret2_acf_diff, bs_acf_diff, mo_acf_diff = dist
    print(
        "vol_diff: %.3f, cdf_ks_dist: %.3f, ret1_acf_diff: %.3f, ret2_acf_diff: %.3f, bs_acf_diff: %.3f, mo_acf_diff: %.3f"
        % (vol_diff, cdf_ks_dist, ret1_acf_diff, ret2_acf_diff, bs_acf_diff, mo_acf_diff)
    )
    print("Total distance: %.10f"%np.sum(dist))

MultipleRun/Results/run_00000
vol_diff: 0.058, cdf_ks_dist: 0.092, ret1_acf_diff: 0.059, ret2_acf_diff: 0.086, bs_acf_diff: 0.056, mo_acf_diff: 0.067
Total distance: 0.4187617096
MultipleRun/Results/run_00001
vol_diff: 0.046, cdf_ks_dist: 0.090, ret1_acf_diff: 0.054, ret2_acf_diff: 0.086, bs_acf_diff: 0.054, mo_acf_diff: 0.087
Total distance: 0.4183763499
MultipleRun/Results/run_00002
vol_diff: 0.090, cdf_ks_dist: 0.088, ret1_acf_diff: 0.048, ret2_acf_diff: 0.086, bs_acf_diff: 0.055, mo_acf_diff: 0.081
Total distance: 0.4494858584
MultipleRun/Results/run_00003
vol_diff: 0.188, cdf_ks_dist: 0.095, ret1_acf_diff: 0.059, ret2_acf_diff: 0.088, bs_acf_diff: 0.056, mo_acf_diff: 0.079
Total distance: 0.5645711147
MultipleRun/Results/run_00004
vol_diff: 0.061, cdf_ks_dist: 0.092, ret1_acf_diff: 0.052, ret2_acf_diff: 0.080, bs_acf_diff: 0.060, mo_acf_diff: 0.067
Total distance: 0.4132819778
MultipleRun/Results/run_00005
vol_diff: 0.025, cdf_ks_dist: 0.088, ret1_acf_diff: 0.053, ret2_acf_diff: 0