In [1]:
import bmll2 as b2
from bmll2 import reference, Security, NormalisedSecurity, SparkHelper, get_market_data, get_market_data_range, VenueMarketError, get_market_tables, save_spark_dataframe, load_spark_dataframe
# Fetch the project helper module so the `import auxiliary_functions as af` in the next cell can find it
# (presumably b2.get_file copies it into the working directory — confirm).
b2.get_file('modules/auxiliary_functions.py')

'auxiliary_functions.py'

In [4]:
import auxiliary_functions as af

import random
import math
import pandas as pd
import numpy as np
from pandas import StringDtype
import gc

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import LogFormatterSciNotation
from scipy.stats import linregress
from scipy.optimize import curve_fit
from scipy.stats import t

## Data processing for plotting the convex post-execution decay

In [6]:
# Instrument analysed in this notebook; used below to build file names and to filter the data.
# NOTE(review): this rebinds `ticker`, shadowing the `import matplotlib.ticker as ticker` from the
# imports cell — any later `ticker.<Formatter/Locator>` call would hit this str instead of the module.
# Consider renaming one of the two.
ticker = 'GRT'

In [7]:
# Fetch this ticker's CSV and load it in one pass: parse the timestamp columns,
# expose the instrument column as 'RIC', and order rows by time (the exchange
# sequence number breaks ties between identical timestamps).
b2.get_file(f'top_100(Volume)/{ticker}.csv')
stock_data = (
    pd.read_csv(f'{ticker}.csv', parse_dates = ['DateTime', 'Date'])
    .rename(columns = {'Ticker' : 'RIC'})
    .sort_values(['DateTime', 'ExchangeSequenceNo'])
)


In [8]:
# Load the metaorder table only to extract this ticker's per-date '20 AD' statistics.
# Alternative input (currently disabled): 'test_data/homo_4_immediate(100).csv'.
b2.get_file('test_data/metaorder_data_homogenous_4.csv')
impact_data = pd.read_csv('metaorder_data_homogenous_4.csv', parse_dates = ['Date', 'Start time', 'End time'])

# One row per date: keep the first occurrence of each date for this ticker.
stock_AD_data = (
    impact_data.loc[impact_data['RIC'] == ticker, ['RIC', 'Date', '20 AD volatility', '20 AD volume']]
    .drop_duplicates(subset = ['Date'])
    .reset_index(drop = True)
)

# The full table is no longer needed; release it before the large market-data load.
del impact_data
gc.collect()


31094

In [9]:
def impact_df_decay(metaorders_list, timing_method = 'immediate'): 
    """Summarise each metaorder in `metaorders_list` as one feature row.

    Parameters
    ----------
    metaorders_list : sequence of pandas.DataFrame
        One frame per metaorder, one row per child order. The columns read are
        'RIC', 'Date', 'DateTime', 'Daily Volume', 'Daily Volatility',
        'Trade Sign', 'Volume', 'Mid-price before' and
        'Mid-price after(immediate)'. Empty frames are skipped.
    timing_method : str, default 'immediate'
        Forwarded unchanged to ``af.impact``.

    Returns
    -------
    pandas.DataFrame
        One row per non-empty metaorder. The index label of a row is the
        position of its metaorder in `metaorders_list` (so skipped empty
        metaorders leave gaps). Columns, in order: 'RIC', 'Date',
        'Start time', 'End time', 'daily volume', 'intraday volatility',
        'number child orders', 'volume traded', 'trade sign',
        'impact(simple)', 'Mid-price before', 'Mid-price after(immediate)'.
        'impact(simple)' is whatever ``af.impact(..., impact_method='simple')``
        returns for the metaorder.
    """
    columns = ['RIC', 'Date', 'Start time', 'End time', 'daily volume', 'intraday volatility',
               'number child orders', 'volume traded', 'trade sign', 'impact(simple)',
               'Mid-price before', 'Mid-price after(immediate)']

    # Collect plain records and build the frame once at the end; enlarging a
    # DataFrame cell-by-cell with `.at` re-allocates on every new row and
    # leaves every column as object dtype.
    records = []
    labels  = []
    for i, metaorder in enumerate(metaorders_list):

        if metaorder.empty:
            continue

        labels.append(i)
        records.append({
            'RIC':                        metaorder['RIC'].iloc[0],
            'Date':                       metaorder['Date'].iloc[0],
            # First / last child order delimit the metaorder in time.
            'Start time':                 metaorder['DateTime'].iloc[0],
            'End time':                   metaorder['DateTime'].iloc[-1],
            'daily volume':               metaorder['Daily Volume'].iloc[0],
            'intraday volatility':        metaorder['Daily Volatility'].iloc[0],
            'number child orders':        metaorder.shape[0],
            # skipna = False keeps the built-in sum() semantics: a missing
            # child volume makes the total NaN instead of being ignored.
            'volume traded':              metaorder['Volume'].sum(skipna = False),
            'trade sign':                 metaorder['Trade Sign'].iloc[0],
            'impact(simple)':             af.impact(metaorder, timing_method = timing_method, impact_method = 'simple'),
            # Mid before the first child order and after the last one.
            'Mid-price before':           metaorder['Mid-price before'].iloc[0],
            'Mid-price after(immediate)': metaorder['Mid-price after(immediate)'].iloc[-1],
        })

    return pd.DataFrame(records, index = labels, columns = columns)


In [10]:
%%time
# Level-1 quotes for `ticker` on XJSE, used later to look up mid-prices after each metaorder.
# Author's load-time notes: three full years of l1 data can be pulled in one call;
# GRT takes about 60 s (46.6 s wall time in the recorded run), GFI about 2.5 minutes.
l1_columns = ['Ticker', 'TradeDate', 'LocalTimestamp', 'BidPrice1', 'BidQuantity1', 'AskPrice1', 'AskQuantity1',
              'ExchangeSequenceNo', 'MarketState']
stock_mid_prices = get_market_data_range('XJSE', start_date = '2023-01-03', end_date = '2025-12-31',
                                         table_name = 'l1', ticker = ticker, df_engine = 'polars',
                                         columns = l1_columns).to_pandas()

# Keep continuous-trading quotes only, in time order (sequence number breaks timestamp ties).
stock_mid_prices = (
    stock_mid_prices.loc[stock_mid_prices['MarketState'] == 'CONTINUOUS_TRADING']
    .sort_values(['LocalTimestamp', 'ExchangeSequenceNo'])
)

# Mid-price = average of best bid and best ask.
best_bid = stock_mid_prices['BidPrice1']
best_ask = stock_mid_prices['AskPrice1']
stock_mid_prices['Mid-price'] = (best_bid + best_ask) / 2
stock_mid_prices.head()


CPU times: user 29.1 s, sys: 11.7 s, total: 40.8 s
Wall time: 46.6 s


Unnamed: 0,Ticker,TradeDate,LocalTimestamp,BidPrice1,BidQuantity1,AskPrice1,AskQuantity1,ExchangeSequenceNo,MarketState,Mid-price
5803074,GRT,2023-01-03,2023-01-03 09:00:08.505846,1416.0,1677,1475.0,5000,34311,CONTINUOUS_TRADING,1445.5
5803075,GRT,2023-01-03,2023-01-03 09:00:08.514672,1416.0,1677,1474.0,4066,34316,CONTINUOUS_TRADING,1445.0
5803076,GRT,2023-01-03,2023-01-03 09:00:08.514797,1416.0,1677,1474.0,6766,34320,CONTINUOUS_TRADING,1445.0
5803077,GRT,2023-01-03,2023-01-03 09:00:08.652716,1416.0,1677,1474.0,4066,34422,CONTINUOUS_TRADING,1445.0
5803078,GRT,2023-01-03,2023-01-03 09:00:08.723150,1416.0,1677,1473.0,4074,34445,CONTINUOUS_TRADING,1444.5


In [11]:
# Settings for the synthetic trader assignment used in the next cell.
# N: passed as `N` to af.trader_participation and af.orders; the next cell loops over traders 0..N-1.
N                   = 20
# `method` argument of af.trader_participation.
trader_distribution = 'power'
# `alpha` argument of af.trader_participation (presumably the power-law exponent — confirm in auxiliary_functions).
alpha               = 2
# Suffix for the output file names.
# NOTE(review): alpha is not part of the identifier, so runs that differ only in alpha write to the same files.
identifier          = f'{trader_distribution}_{N}'

In [12]:
%%time
# Build one feature row per synthetic metaorder for every trading day of `ticker`,
# attach that day's '20 AD' statistics, and persist the combined table.
# Takes about 6 minutes to run.
impact_frames = []   # per-trader feature frames; concatenated into `impact_data` below
for date, day_D in stock_data.groupby('Date', sort = True):
    print(date)

    trades = day_D.loc[day_D['Price'] != 0]

    # Skip empty days before sizing the participation distribution:
    # with no trades f_max would be 0, i.e. below f_min = 1.
    if trades.empty:
        continue

    # The same seed is used for every day, as in the original run.
    f = af.trader_participation(N = N, method = trader_distribution, alpha = alpha, f_min = 1, f_max = trades.shape[0], seed = 1)
    c = af.cumulative_probs(f)
    output = af.orders(N = N, trades = trades, cumulative_probs = c)

    # The '20 AD' statistics depend only on the date, so select them once per
    # day instead of re-scanning stock_AD_data twice for every trader.
    day_AD = stock_AD_data.loc[stock_AD_data['Date'] == date, ['20 AD volatility', '20 AD volume']]

    for n in range(N):

        # output[n] holds the positions (within `trades`) assigned to trader n.
        trader_n_trades = trades.iloc[output[n], ]

        if trader_n_trades.empty:
            continue

        trader_n_metaorders = af.metaorders(trader_n_trades)

        # Ignore traders with fewer than 10 metaorders on this day.
        if len(trader_n_metaorders) < 10:
            continue

        # Fail with a message that names the date rather than a bare IndexError.
        if day_AD.empty:
            raise KeyError(f'no 20 AD statistics for {ticker} on {date}')

        trader_n_features = impact_df_decay(trader_n_metaorders, timing_method = 'immediate')
        trader_n_features['20 AD volatility'] = day_AD['20 AD volatility'].iloc[0]
        trader_n_features['20 AD volume'] = day_AD['20 AD volume'].iloc[0]

        if not trader_n_features.empty and not trader_n_features.isna().all().all():
            impact_frames.append(trader_n_features)


impact_data  = pd.concat(impact_frames, ignore_index = True)

# Everything except the identifier / timestamp columns is numeric.
exclude_cols = ['RIC', 'Date', 'Start time', 'End time']
numeric_cols = [col for col in impact_data.columns if col not in exclude_cols]
impact_data[numeric_cols] = impact_data[numeric_cols].apply(pd.to_numeric, errors = 'coerce')

impact_data['Date']          = pd.to_datetime(impact_data['Date']).dt.normalize()
impact_data['Start time']    = pd.to_datetime(impact_data['Start time'], format = 'mixed')
impact_data['End time']      = pd.to_datetime(impact_data['End time'], format = 'mixed')
impact_data['duration(min)'] = (impact_data['End time'] - impact_data['Start time']).dt.total_seconds() / 60
durations_GRT                = impact_data['duration(min)']

impact_data.to_csv(f'{ticker}_{identifier}_decay.csv', index = False)
b2.put_file(f'{ticker}_{identifier}_decay.csv', 'test_data')

2023-01-03 00:00:00
2023-01-04 00:00:00
2023-01-05 00:00:00
2023-01-06 00:00:00
2023-01-09 00:00:00
2023-01-10 00:00:00
2023-01-11 00:00:00
2023-01-12 00:00:00
2023-01-13 00:00:00
2023-01-16 00:00:00
2023-01-17 00:00:00
2023-01-18 00:00:00
2023-01-19 00:00:00
2023-01-20 00:00:00
2023-01-23 00:00:00
2023-01-24 00:00:00
2023-01-25 00:00:00
2023-01-26 00:00:00
2023-01-27 00:00:00
2023-01-30 00:00:00
2023-01-31 00:00:00
2023-02-01 00:00:00
2023-02-02 00:00:00
2023-02-03 00:00:00
2023-02-06 00:00:00
2023-02-07 00:00:00
2023-02-08 00:00:00
2023-02-09 00:00:00
2023-02-10 00:00:00
2023-02-13 00:00:00
2023-02-14 00:00:00
2023-02-15 00:00:00
2023-02-16 00:00:00
2023-02-17 00:00:00
2023-02-20 00:00:00
2023-02-21 00:00:00
2023-02-22 00:00:00
2023-02-23 00:00:00
2023-02-24 00:00:00
2023-02-27 00:00:00
2023-02-28 00:00:00
2023-03-01 00:00:00
2023-03-02 00:00:00
2023-03-03 00:00:00
2023-03-06 00:00:00
2023-03-07 00:00:00
2023-03-08 00:00:00
2023-03-09 00:00:00
2023-03-10 00:00:00
2023-03-13 00:00:00


In [13]:
# Post-execution impact profile: for every metaorder with at least 10 child
# orders, sample the mid-price on a grid of rescaled times
# z = (t - start) / duration in [1, z_max] and express it as signed log-return
# relative to the mid-price before the metaorder.
# Takes about 8 minutes.
decay_frames = []   # one frame per metaorder; concatenated into `decay_df` below

z_max = 2
z_grid = np.linspace(1, z_max, 200)

for date, day_D in impact_data.groupby('Date', sort = True):

    print(date)
    day_D = day_D.sort_values(['Start time'])
    num_metaorders = day_D.shape[0]

    for i in range(num_metaorders):

        # `i` is a position within this day's rows, so every lookup must go
        # through day_D. Indexing the full impact_data table with it would
        # re-read the first rows of the whole table for every date.
        if day_D['number child orders'].iloc[i] < 10:
            continue

        # Wall-clock query times corresponding to the z grid (z = 1 is the metaorder's end).
        delta_t = pd.to_timedelta(z_grid * day_D['duration(min)'].iloc[i], unit = 'min')
        t_grid  = day_D['Start time'].iloc[i] + delta_t

        grid_df = pd.DataFrame({'query time': t_grid.astype('datetime64[us]'),
                                'z': z_grid}).sort_values('query time')

        # Quotes inside the query window; each query time takes the latest quote at or before it.
        # NOTE(review): quotes before the first query time are excluded, so grid points that
        # precede the first quote update inside the window get no match and are dropped below
        # instead of using the prevailing quote — confirm this is intended.
        candidate_df = stock_mid_prices[(stock_mid_prices['LocalTimestamp'] >= grid_df['query time'].iloc[0]) &
                                        (stock_mid_prices['LocalTimestamp'] <= grid_df['query time'].iloc[-1])]
        merged = pd.merge_asof(grid_df, candidate_df, right_on = 'LocalTimestamp', left_on = 'query time', direction = 'backward', allow_exact_matches = True)
        merged = merged.dropna(subset = ['BidPrice1', 'AskPrice1'])

        # Prepend the two anchor points: z = 0 (before the metaorder) and z = 1 (immediately after it).
        mid_prices_array     = np.full(merged.shape[0] + 2, np.nan)
        mid_prices_array[0]  = day_D['Mid-price before'].iloc[i]
        mid_prices_array[1]  = day_D['Mid-price after(immediate)'].iloc[i]
        mid_prices_array[2:] = merged['Mid-price']

        z                    = np.full(merged.shape[0] + 2, np.nan)
        z[0]                 = 0
        z[1]                 = 1
        z[2:]                = merged['z']

        Q                    = day_D['volume traded'].iloc[i]
        sigma                = day_D['20 AD volatility'].iloc[i]
        volume               = day_D['20 AD volume'].iloc[i]

        # Signed log-return relative to the pre-trade mid, then the two rescalings.
        impact                  = day_D['trade sign'].iloc[i] * (np.log(mid_prices_array) - np.log(mid_prices_array[0]))
        scaled_impact_decay     = impact / (np.sqrt(Q) * sigma)
        scaled_impact_decay_vol = impact / (np.sqrt(Q / volume) * sigma)

        decay_df_day = pd.DataFrame({'Date': date, 'z': z, 'Q': Q, 'sigma': sigma, 'impact': impact, 'scaled impact': scaled_impact_decay,
                                     'scaled impact(with volume)': scaled_impact_decay_vol})
        decay_frames.append(decay_df_day)

decay_df = pd.concat(decay_frames, ignore_index = True)


decay_df.to_csv(f'{ticker}_impact_profile(post)_{identifier}.csv', index = False)
b2.put_file(f'{ticker}_impact_profile(post)_{identifier}.csv', 'test_data')

2023-01-03 00:00:00
2023-01-04 00:00:00
2023-01-05 00:00:00
2023-01-06 00:00:00
2023-01-09 00:00:00
2023-01-10 00:00:00
2023-01-11 00:00:00
2023-01-12 00:00:00
2023-01-13 00:00:00
2023-01-16 00:00:00
2023-01-17 00:00:00
2023-01-18 00:00:00
2023-01-19 00:00:00
2023-01-20 00:00:00
2023-01-23 00:00:00
2023-01-24 00:00:00
2023-01-25 00:00:00
2023-01-26 00:00:00
2023-01-27 00:00:00
2023-01-30 00:00:00
2023-01-31 00:00:00
2023-02-01 00:00:00
2023-02-02 00:00:00
2023-02-03 00:00:00
2023-02-06 00:00:00
2023-02-07 00:00:00
2023-02-08 00:00:00
2023-02-09 00:00:00
2023-02-10 00:00:00
2023-02-13 00:00:00
2023-02-14 00:00:00
2023-02-15 00:00:00
2023-02-16 00:00:00
2023-02-17 00:00:00
2023-02-20 00:00:00
2023-02-21 00:00:00
2023-02-22 00:00:00
2023-02-23 00:00:00
2023-02-24 00:00:00
2023-02-27 00:00:00
2023-02-28 00:00:00
2023-03-01 00:00:00
2023-03-02 00:00:00
2023-03-03 00:00:00
2023-03-06 00:00:00
2023-03-07 00:00:00
2023-03-08 00:00:00
2023-03-09 00:00:00
2023-03-10 00:00:00
2023-03-13 00:00:00
