In [5]:
import pandas as pd
from datetime import datetime


# Function to convert time strings to datetime objects
def convert_to_datetime(time_str):
    """
    Parse a time-of-day string into a datetime object.

    Accepts 'HH:MM:SS.ffffff' (fractional seconds, 1-6 digits) and, as a
    fallback, plain 'HH:MM:SS' for timestamps that fall on a whole second.
    Because the string carries no date, strptime fills in its default date
    of 1900-01-01.

    Parameters
    ----------
    time_str : str
        Time of day, e.g. '09:55:55.002' or '09:55:55'.

    Returns
    -------
    datetime.datetime
        The parsed time on 1900-01-01.

    Raises
    ------
    ValueError
        If the string matches neither format.
    """
    try:
        return datetime.strptime(time_str, "%H:%M:%S.%f")
    except ValueError:
        # '%f' is mandatory in the format above, so a whole-second string
        # like '09:30:00' would be rejected; retry without the fraction.
        return datetime.strptime(time_str, "%H:%M:%S")

In [6]:
# Read the raw trade and quote CSVs (paths are relative to the working directory)
trade_df, quote_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/trade.csv', 'data/quote.csv')
)

In [7]:
# Parse each frame's 'time' strings and use the parsed timestamps as the index.
# The original 'time' column is kept; only the helper 'datetime' column moves to the index.
trade_df = (
    trade_df
    .assign(datetime=trade_df['time'].apply(convert_to_datetime))
    .set_index('datetime')
)
quote_df = (
    quote_df
    .assign(datetime=quote_df['time'].apply(convert_to_datetime))
    .set_index('datetime')
)

In [8]:
# Make a local checkout of hft-playground importable by adding it to sys.path.
# NOTE(review): this is an absolute, machine-specific path; the import below will not
# resolve on another machine unless the same directory exists there. Consider a
# configurable or relative path.
import sys
sys.path.append('/Users/chuanlin/Documents/GitHub/hft-playground')
# NOTE(review): plot_toolbox is never referenced by name in the cells visible here.
# Presumably importing it is what makes DataFrame.hplot / DataFrame.plotDistrib
# (used in later cells) available — confirm against the hft_play package.
from hft_play import plot_toolbox

In [9]:
# Outer-join quotes and trades on their timestamp index so rows from both sides are kept
df = pd.merge(
    quote_df,
    trade_df,
    how='outer',
    left_index=True,
    right_index=True,
)
# Plot bid, ask and trade price together (hplot comes from the project plotting helpers)
df.hplot(['bid_price', 'ask_price', 'price'], output_name='px.html')
# Open the file px.html to inspect the resulting chart

In [10]:
# check time diff

def cal_time_diff(df):
    """
    Calculate time difference between rows, also consider noon break
    """
    df = df.copy()
    df['time_diff'] = 0

    # morning trading
    trading_morning = (
        (df.index.time >= datetime.strptime('09:31:00', '%H:%M:%S').time()) 
        & (df.index.time <= datetime.strptime('12:00:00', '%H:%M:%S').time())
    )
    df.loc[trading_morning, 'time_diff'] = df[trading_morning].index.to_series().diff().dt.total_seconds()
    
    # afternoon
    trading_afternoon = (
        (df.index.time >= datetime.strptime('13:01:00', '%H:%M:%S').time()) 
        & (df.index.time <= datetime.strptime('16:00:00', '%H:%M:%S').time())
    )
    df.loc[trading_afternoon, 'time_diff'] = df[trading_afternoon].index.to_series().diff().dt.total_seconds()
    
    return df['time_diff']

# Attach the within-session gap (in seconds) between consecutive quote rows.
quote_df['time_diff'] = cal_time_diff(quote_df)
# plotDistrib is not a stock pandas method; presumably it is provided by
# hft_play.plot_toolbox and draws the distribution of the column in 100 bins — confirm.
quote_df.plotDistrib(
    'time_diff', bins=100,
)
# Summary statistics of the gaps. Rows outside both sessions carry 0 and are
# included; the NaN first row of each session is excluded from the count.
print(quote_df['time_diff'].describe())

count    123982.000000
mean          0.158708
std           0.330063
min           0.000000
25%           0.000000
50%           0.016000
75%           0.163000
max           6.567000
Name: time_diff, dtype: float64


In [11]:
# Quote rows whose gap since the previous update exceeds 3 seconds
quote_df.loc[quote_df['time_diff'].gt(3)]

datetime,time,bid_price,ask_price,bid_size,ask_size,time_diff
1900-01-01 09:55:55.002,09:55:55.002,176.1,176.2,3300.0,3700.0,3.159
1900-01-01 10:18:41.839,10:18:41.839,179.9,180.0,28800.0,15500.0,3.009
1900-01-01 10:20:10.179,10:20:10.179,179.7,179.8,400.0,14400.0,3.409
1900-01-01 10:22:55.142,10:22:55.142,178.5,178.6,100.0,29700.0,3.724
1900-01-01 10:27:45.770,10:27:45.770,179.7,179.8,11400.0,7800.0,4.919
...,...,...,...,...,...,...
1900-01-01 15:18:50.410,15:18:50.410,182.6,182.7,23600.0,2400.0,3.264
1900-01-01 15:21:10.218,15:21:10.218,182.2,182.4,16800.0,12300.0,3.608
1900-01-01 15:23:18.694,15:23:18.694,182.1,182.2,1500.0,8700.0,3.148
1900-01-01 15:34:22.546,15:34:22.546,182.2,182.3,5800.0,15100.0,3.352


In [19]:
# Average quote gap per 5-minute bucket, to see how it evolves through the day
(
    quote_df
    .resample("5min")[['time_diff']]
    .mean()
    .hplot('time_diff')
)

So, for quote updates, the gap between consecutive updates grows the closer we get to the noon break.

However, explaining the underlying cause needs more data, and we would also need to know how the BBO order book is rebuilt.

It might simply be because market participants are not active, or because of latency in the machine's memory.

In [20]:
# Distribution of the gap between consecutive trades (the per-time-of-day view is in the next cell)
# Attach the within-session gap (in seconds) between consecutive trade rows.
trade_df['time_diff'] = cal_time_diff(trade_df)
# plotDistrib is not a stock pandas method; presumably it is provided by
# hft_play.plot_toolbox and draws the distribution of the column in 100 bins — confirm.
trade_df.plotDistrib(
    'time_diff', bins=100,
)
# Summary statistics of the gaps. Rows outside both sessions carry 0 and are
# included; the NaN first row of each session is excluded from the count.
print(trade_df['time_diff'].describe())

count    66270.000000
mean         0.296921
std          0.785663
min          0.000000
25%          0.000000
50%          0.000000
75%          0.160000
max         13.264000
Name: time_diff, dtype: float64


In [21]:
# Average trade gap per 5-minute bucket, to see how it evolves through the day
(
    trade_df
    .resample("5min")[['time_diff']]
    .mean()
    .hplot('time_diff')
)
