In [99]:
import kagglehub

# Fetch the latest version of the limit-order-book dataset through kagglehub and
# remember where it was stored; later cells read the data from `path`.
DATASET_HANDLE = "martinsn/high-frequency-crypto-limit-order-book-data"
path = kagglehub.dataset_download(DATASET_HANDLE)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/martinsn/high-frequency-crypto-limit-order-book-data?dataset_version_number=1...


 71%|███████   | 706M/993M [00:17<00:07, 41.6MB/s] 


KeyboardInterrupt: 

In [67]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd



In [None]:
# kagglehub.dataset_download hands back the folder the dataset was stored in, not a
# CSV file, so pick the file to load from inside that folder. If `path` was set by
# hand to a concrete file it is used as-is.
# NOTE(review): "BTC_1sec.csv" is the assumed name of the 1-second BTC book file --
# confirm against the downloaded folder (the error below lists what is there).
DATA_FILE = "BTC_1sec.csv"

data_path = Path(path)
if data_path.is_dir():
    matches = sorted(data_path.rglob(DATA_FILE))
    if not matches:
        available = sorted(p.name for p in data_path.rglob("*.csv"))
        raise FileNotFoundError(
            f"{DATA_FILE} not found under {data_path}; CSV files available: {available}"
        )
    data_path = matches[0]

df = pd.read_csv(data_path)

# Check for nulls before the rolling windows below introduce NaNs of their own.
if df.isnull().values.any():
    raise Exception("null values detected")
#seems that if no volume exists for a level it is cast to 0, idk



In [71]:
df["date"] = pd.to_datetime(df["system_time"])
#trades go on 24/7, so could normalize by day or by rolling window of last observations (except for first day, which I normalize to day 0)
#I imagine for not having to recompute everything we would just use half day intervals or something as we have latency?

# Day index = whole calendar days elapsed since the first date in the data (first day is 0).
# Differencing calendar dates rather than day-of-year numbers keeps the index
# increasing when the data crosses a year boundary; within a single year the two agree.
# Time-zone info is dropped first (wall-clock time is kept) so consecutive midnights
# are always exactly 24h apart and .dt.days cannot be thrown off by a DST shift.
wall_clock_day = df["date"].dt.tz_localize(None).dt.normalize()
df["trading_day"] = (wall_clock_day - wall_clock_day.min()).dt.days

In [72]:
# Rolling volatility of the midpoint: standard deviation of log returns over a
# trailing window of rows. The window is one day's worth of rows if the data has
# one row per second (24 hours * 3600 seconds).
SECONDS_PER_DAY = 24 * 3600
window = SECONDS_PER_DAY

# "log_midprice" holds the log of the ratio between each midpoint and the previous one.
previous_midpoint = df["midpoint"].shift(1)
df["log_midprice"] = np.log(df["midpoint"] / previous_midpoint)
df["rolling_midprice_vol"] = df["log_midprice"].rolling(window=window).std()
#I don't need to annualize this or make it weighted to daily std?
#TODO if I made index time I could use timedelta to be sure.



In [73]:
#weighted midprice? (from stack exchange)

# The definition of micro-price is
# S = Pa * Vb / (Va + Vb) + Pb * Va / (Va + Vb)
# where Pa is the ask price, Va is the ask volume, Pb is the bid price, and Vb is the bid volume.
# Note the cross-weighting: the ask price is weighted by the BID size and the bid
# price by the ASK size.
#for speed I compute the best ask price, perhaps I weight the other distances???
#should I just add all of the volumes for the 1 level lmao
#also, for computing Va do I add the market/limits or just the limits?

# Level-0 prices reconstructed from the midpoint and the relative distance columns.
df["asks_price_0"] = df["midpoint"] * (1 + df["asks_distance_0"])
df["bids_price_0"] = df["midpoint"] * (1 + df["bids_distance_0"])

# Level-0 limit notionals stand in for Va / Vb in the formula above.
ask_size_0 = df["asks_limit_notional_0"]
bid_size_0 = df["bids_limit_notional_0"]
df["microprice_0"] = ((df["asks_price_0"] * bid_size_0 + df["bids_price_0"] * ask_size_0)
                      / (ask_size_0 + bid_size_0))

#sanity check that bids/asks match spreads.
spread_matches = np.isclose(df["asks_price_0"] - df["bids_price_0"], df["spread"], rtol = 1e-5)
if not spread_matches.all():
    raise Exception("spread doesn't match asks/bids")




In [74]:
# Exponential moving averages (EMA) of the micro-price and of the midpoint.
# Each result is stored next to its source as "ema_<source column>".
window = 180 #span in rows: 180 seconds, i.e., 3 minutes, if one row per second. can be changed.

for source_col in ("microprice_0", "midpoint"):
    df["ema_" + source_col] = df[source_col].ewm(span=window).mean()

In [77]:
#percent changes of midpoint
# Row-over-row fractional change of the midpoint (pandas pct_change with its default
# settings), stored as its own feature column.
df["midpoint_pct_change"] = df["midpoint"].pct_change()

In [75]:
# Relative spread aggregated over book levels 1..n: the summed ask-minus-bid distance
# divided by the summed (2 + ask distance - bid distance).
# The distance columns are expressed relative to the midprice, so no price is needed.
# NOTE(review): the loop starts at level 1, so level 0 is not included, and the
# denominator uses "- bids_distance" -- confirm both are intended.
n = 5
numer = 0
denom = 0
for level in range(1, n + 1):
    ask_dist = df[f"asks_distance_{level}"]
    bid_dist = df[f"bids_distance_{level}"]
    numer = numer + (ask_dist - bid_dist)
    denom = denom + (2 + ask_dist - bid_dist)
df[f"rel_spread_{n}"] = numer / denom

In [79]:
# Total limit notional per side, accumulated level by level over the first n levels
# (levels 0 .. n-1) into "asks_limit_volume" and "bids_limit_volume".
n = 15
for side in ("asks", "bids"):
    # start from a zeroed copy of level 0 so the running total has the right index
    running_total = df[side + "_limit_notional_0"] * 0
    for level in range(n):
        running_total = running_total + df[f"{side}_limit_notional_{level}"]
    df[side + "_limit_volume"] = running_total

In [83]:
#finally, make a couple of different normalizing functions
def normalize_per_day(df, columns): #df is modified in place, no copy is made
    '''
    Z-score each column against the mean and std of the previous trading day.

    For every row on trading day d, (value - mean) / std is written to
    "<column>_norm", where mean and std (population std, ddof=0) are taken over all
    rows with trading_day == d - 1. Rows whose previous trading day has no
    observations get NaN: that is always the first day, and also any day that
    directly follows a gap in the day index. Every row of "<column>_norm" is
    rewritten, so values left over from an earlier normalization never survive.
    A previous day with zero spread (e.g. a single row) divides by zero; no
    epsilon is added here.

    df: a dataframe with a "date" column and an integer "trading_day" column.
    columns: a list of column names

    returns: a list of the new columns created.
    raises: Exception if "date", "trading_day" or any requested column is missing.
            All checks run before anything is written, so df is untouched on error.
    '''
    if "date" not in df.columns:
        raise Exception("no date")
    if "trading_day" not in df.columns:
        raise Exception("no trading_day")
    for column in columns:
        if column not in df.columns:
            raise Exception("You have selected a column, " + column + " not in the dataframe")

    new_columns = []
    for column in columns:
        per_day = df.groupby("trading_day")[column]
        prev_mean = per_day.mean()
        prev_std = per_day.std(ddof=0)
        # re-key each day's statistics by the day that consumes them (d - 1 -> d)
        prev_mean.index = prev_mean.index + 1
        prev_std.index = prev_std.index + 1
        # days with no entry in the re-keyed statistics map to NaN
        df[column + "_norm"] = ((df[column] - df["trading_day"].map(prev_mean))
                                / df["trading_day"].map(prev_std))
        new_columns.append(column + "_norm")
    return new_columns
#example: normalize_per_day(df, ["midpoint"]) adds df["midpoint_norm"]


def normalize_last_observs(df, columns, window = 20, suffix = "_norm", eps = 1e-8): #window counts rows, could modify to time delta
    '''
    Z-score each column against a trailing rolling window of its own values.

    For every column, (value - rolling mean) / (rolling std + eps) is written to
    "<column><suffix>". Both statistics cover the current row and the window - 1
    rows before it, so the first window - 1 rows come out as NaN.
    df is modified in place.

    df: a dataframe containing the requested columns.
    columns: a list of column names
    window: number of rows the rolling window looks back (seconds, if the data has
            one row per second).
    suffix: appended to each column name to form the output column. The default
            "_norm" is also what normalize_per_day writes to, so pass something
            like "_norm_20" to keep both results side by side.
    eps: small constant added to the rolling std for numerical stability.

    returns: a list of the new columns created.
    '''
    new_columns = []
    for column in columns:
        trailing = df[column].rolling(window = window)
        normed_name = column + suffix
        df[normed_name] = (df[column] - trailing.mean()) / (trailing.std() + eps)
        new_columns.append(normed_name)
    return new_columns



In [94]:
levels_to_keep = 5

# Distance columns for levels 0..levels_to_keep inclusive, bid then ask for each level.
# NOTE(review): this covers one more level than the raw columns kept in the export
# cell (levels 0..4) -- confirm the extra level is intended.
cols_to_normalize = []
for level in range(levels_to_keep + 1):
    cols_to_normalize += ["bids_distance_" + str(level), "asks_distance_" + str(level)]

cols_to_normalize += [
    "midpoint", #idk about this one, shrugs
    "midpoint_pct_change",
    "asks_limit_volume",
    "bids_limit_volume",
]

# rolling window of 3600*6 rows: a quarter of a day if the data has one row per second
new_columns = normalize_last_observs(df, cols_to_normalize, 3600*6)

In [96]:
# Columns that go into the exported feature file: the normalized features, a few
# un-normalized price/volatility series, the timestamp, and the raw distance and
# notional columns for levels 0..4.
columns_to_keep = list(new_columns)  # copy, so appending below does not modify new_columns
columns_to_keep += ["midpoint", "ema_midpoint",
                    "microprice_0", "ema_microprice_0",
                    "rolling_midprice_vol", "date"]
for i in range(5):
    columns_to_keep += ["bids_distance_" + str(i),
                        "asks_distance_" + str(i),
                        "asks_limit_notional_" + str(i),
                        "bids_limit_notional_" + str(i)]

# A misspelled name would otherwise be ignored silently by the difference below
# and the column would simply vanish from the export.
missing_columns = [name for name in columns_to_keep if name not in df.columns]
if missing_columns:
    raise Exception("columns to keep are not in the dataframe: " + ", ".join(missing_columns))

# Find columns to drop
columns_to_drop = df.columns.difference(columns_to_keep)

# Drop the columns not in the keep list (the remaining columns keep their order)
df = df.drop(columns=columns_to_drop)



In [98]:
# Write the feature table to a CSV in the current working directory.
# to_csv is called with its defaults, so the dataframe index is written as well.
df.to_csv("btc_1s_normed_features.csv")