In [1]:
from typing import *
from datetime import datetime, timedelta, date
from core.time_utils import Bounds
from tqdm import tqdm

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
import polars as pl

# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the imported libraries.
warnings.simplefilter("ignore")

<p>In this notebook we validate that the features were calculated correctly.</p>

<h4>Fetch data from Hive structure</h4>

In [None]:
# Validation window, plus a copy padded by five minutes on each side so that
# the first and last sampling intervals of the window do not have missing values.
bounds: Bounds = Bounds.for_days(date(2025, 1, 1), date(2025, 1, 2))
expanded_bounds: Bounds = bounds.expand_bounds(
    lb_timedelta=timedelta(minutes=5),
    rb_timedelta=timedelta(minutes=5),
)

# NOTE(review): the path below is absolute and machine-specific; consider
# reading it from shared configuration so the notebook runs elsewhere.
# NOTE(review): `is_between` is inclusive on both ends by default, so a tick
# stamped exactly at `end_exclusive` is kept here -- confirm that is intended.
df_ticks = (
    pl.scan_parquet(r"D:\data\transformed\trades")
    .filter(
        pl.col("date").is_between(expanded_bounds.day0, expanded_bounds.day1)
        & (pl.col("symbol") == "BTC-USDT")
        & pl.col("trade_time").is_between(
            expanded_bounds.start_inclusive, expanded_bounds.end_exclusive
        )
    )
    .collect()
)

In [None]:
# Show the padded bounds that were used to filter the tick data.
print(expanded_bounds)

<h4>Make sure data is sorted by trade_time</h4>

In [None]:
# Order the ticks chronologically (ascending) and fail fast if the ordering
# does not hold afterwards.
df_ticks = df_ticks.sort("trade_time")
assert df_ticks["trade_time"].is_sorted()

In [None]:
# Derive per-tick notional and direction, then the signed variants of both.
# The second step reads columns created by the first, so the two
# `with_columns` calls cannot be merged into one.
df_ticks = (
    df_ticks
    .with_columns(
        # Notional value of the tick in the quote asset.
        (pl.col("price") * pl.col("quantity")).alias("quote_abs"),
        # -1 if SELL, 1 if BUY (is_buyer_maker true -> -1, false -> +1).
        (1 - 2 * pl.col("is_buyer_maker")).alias("side"),
    )
    .with_columns(
        (pl.col("quote_abs") * pl.col("side")).alias("quote_sign"),
        (pl.col("quantity") * pl.col("side")).alias("quantity_sign"),
    )
)

<h4>Aggregate ticks into trades on trade_time</h4>

In [None]:
# Ticks have to be in chronological order so that first()/last() below pick
# the earliest and the latest tick recorded for each timestamp.
df_ticks = df_ticks.sort("trade_time")

# Collapse all ticks sharing a trade_time into a single trade row and flag
# its direction (a non-negative signed quote amount counts as long).
df_trades: pl.DataFrame = (
    df_ticks
    .group_by("trade_time", maintain_order=True)
    .agg(
        # price_first and price_last differ when the trade had price impact.
        pl.col("price").first().alias("price_first"),
        pl.col("price").last().alias("price_last"),
        # Amount spent in the quote asset, unsigned and signed.
        pl.col("quote_abs").sum().alias("quote_abs"),
        pl.col("quote_sign").sum().alias("quote_sign"),
        # Amount of base asset transacted, signed and unsigned.
        pl.col("quantity_sign").sum().alias("quantity_sign"),
        pl.col("quantity").sum().alias("quantity_abs"),
        # Number of ticks that make up the trade.
        pl.col("price").count().alias("num_ticks"),
    )
    .with_columns(is_long=pl.col("quote_sign") >= 0)
)

df_trades

In [None]:
# compute slippage
df_trades = df_trades.with_columns(
    quote_slippage_abs=(pl.col("quote_abs") - pl.col("price_first") * pl.col("quantity_abs")).abs()
)
df_trades = df_trades.with_columns(
    quote_slippage_sign=pl.col("quote_slippage_abs") * pl.col("quantity_sign").sign()
)

<h4>Add lags of time and price to help when resampling</h4>

In [None]:
# Attach to every trade the previous trade's last price and time (shift by +1)
# and the next trade's first price and time (shift by -1).
df_trades = df_trades.with_columns(
    pl.col("price_last").shift(1).alias("price_last_lag"),
    pl.col("trade_time").shift(1).alias("trade_time_lag"),
    pl.col("price_first").shift(-1).alias("price_first_neg_lag"),
    pl.col("trade_time").shift(-1).alias("trade_time_neg_lag"),
)

In [None]:
# Rolling features: a 5-second window evaluated every 500 ms, closed and
# labelled on its right edge, then restricted to the validation bounds.
# NOTE(review): `is_between` is inclusive on both ends by default, so a window
# labelled exactly at `bounds.end_exclusive` survives the filter -- confirm.
sampled_features: pl.DataFrame = (
    df_trades
    .group_by_dynamic(
        index_column="trade_time",
        every=timedelta(milliseconds=500),
        period=timedelta(seconds=5),
        closed="right",
        label="right",
        include_boundaries=True,
    )
    .agg(
        # Return in basis points from the last price before the window to the
        # first price after it.
        (
            (pl.col("price_first_neg_lag").last() / pl.col("price_last_lag").first() - 1)
            * 1e4
        ).alias("asset_return"),
        # Seconds elapsed between those same two surrounding trades.
        (
            (pl.col("trade_time_neg_lag").last() - pl.col("trade_time_lag").first())
            .dt.total_nanoseconds()
            / 1e9
        ).alias("asset_hold_time"),
        (pl.col("quote_sign").sum() / pl.col("quote_abs").sum()).alias("flow_imbalance"),
        (
            pl.col("quote_slippage_sign").sum() / pl.col("quote_slippage_abs").sum()
        ).alias("slippage_imbalance"),
        # 1 + n / sum(log(x / x_min)) over the trade sizes in the window.
        (
            1 + pl.len() / (pl.col("quote_abs") / pl.col("quote_abs").min()).log().sum()
        ).alias("powerlaw_alpha"),
        (pl.col("is_long").sum() / pl.len()).alias("share_of_longs"),
        pl.len().alias("num_agg_trades"),
    )
    .filter(
        pl.col("trade_time").is_between(bounds.start_inclusive, bounds.end_exclusive)
    )
)

In [None]:
# Regular 500 ms sampling grid over [start_inclusive, end_exclusive).
# `pd.date_range` includes both endpoints by default, which would add a row at
# `end_exclusive` itself; `inclusive="left"` keeps the right edge out.
index: pd.DatetimeIndex = pd.date_range(
    bounds.start_inclusive,
    bounds.end_exclusive,
    freq=timedelta(milliseconds=500),
    inclusive="left",
)
df_index: pl.DataFrame = pl.DataFrame({"sampled_time": index})

# Left join onto the desired time index so that every grid point is present
# (grid points without a matching sampled row get nulls) and the dimensions
# are correct.
df_features: pl.DataFrame = df_index.join(
    sampled_features, left_on="sampled_time", right_on="trade_time", how="left"
)
# The label names the aggregation window. The features joined above are
# computed with `period=timedelta(seconds=5)`; 500 ms is only the sampling
# step (`every`), so the window is "5S", not "500MS".
df_features = df_features.with_columns(
    window=pl.lit("5S")
)

In [None]:
# Feature columns produced by the resampling step that are to be validated.
# NOTE(review): the stored features displayed at the end of this notebook use
# the prefix "share_of_long_trades", not "share_of_longs" -- confirm the name
# mapping before comparing column by column.
cols = [
    "asset_return", "asset_hold_time",
    "flow_imbalance", "slippage_imbalance",
    "powerlaw_alpha", "share_of_longs",
]

<h4>Validate that everything is computed correctly</h4>

In [2]:
from core.paths import FEATURE_DIR
from core.time_utils import Bounds
from datetime import date
from feature_writer.utils import to_wide_format
from core.currency_pair import CurrencyPair


import polars as pl


# NOTE(review): this rebinds `bounds`, shadowing the value defined earlier in
# the notebook, and uses 2024 dates whereas the recomputation above uses 2025
# dates -- confirm both sides of the validation cover the same period.
bounds: Bounds = Bounds.for_days(date(2024, 1, 1), date(2024, 1, 4))

# Lazily scan the stored feature dataset, keeping only BTC-USDT rows whose
# partition date falls inside the bounds.
hive: pl.LazyFrame = pl.scan_parquet(FEATURE_DIR)

data = (
    hive
    .filter(
        pl.col("date").is_between(bounds.day0, bounds.day1)
        & (pl.col("currency_pair") == CurrencyPair.from_string("BTC-USDT").name)
    )
    .collect()
)

In [4]:
# Display the stored features via the project's `to_wide_format` helper; the
# rendered output has one column per feature/window combination.
to_wide_format(data)

sampled_time,currency_pair,asset_hold_time_10S,asset_hold_time_120S,asset_hold_time_1S,asset_hold_time_2S,asset_hold_time_300S,asset_hold_time_30S,asset_hold_time_500MS,asset_hold_time_5S,asset_hold_time_60S,asset_return_10S,asset_return_120S,asset_return_1S,asset_return_2S,asset_return_300S,asset_return_30S,asset_return_500MS,asset_return_5S,asset_return_60S,flow_imbalance_10S,flow_imbalance_120S,flow_imbalance_1S,flow_imbalance_2S,flow_imbalance_300S,flow_imbalance_30S,flow_imbalance_500MS,flow_imbalance_5S,flow_imbalance_60S,powerlaw_alpha_10S,powerlaw_alpha_120S,powerlaw_alpha_1S,powerlaw_alpha_2S,powerlaw_alpha_300S,powerlaw_alpha_30S,powerlaw_alpha_500MS,powerlaw_alpha_5S,powerlaw_alpha_60S,share_of_long_trades_10S,share_of_long_trades_120S,share_of_long_trades_1S,share_of_long_trades_2S,share_of_long_trades_300S,share_of_long_trades_30S,share_of_long_trades_500MS,share_of_long_trades_5S,share_of_long_trades_60S,slippage_imbalance_10S,slippage_imbalance_120S,slippage_imbalance_1S,slippage_imbalance_2S,slippage_imbalance_300S,slippage_imbalance_30S,slippage_imbalance_500MS,slippage_imbalance_5S,slippage_imbalance_60S
datetime[ns],str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2024-01-01 00:00:00,"""BTC-USDT""",,,,,,,,,,,,,,,,,,,,,,,,,-1.0,,,,,,,,,inf,,,,,,,,,0.0,,,,,,,,,,,
2024-01-01 00:00:00.500,"""BTC-USDT""",,,,,,,0.916,,,,,,,,,0.002365,,,,,-0.031516,,,,-0.006088,,,,,1.830844,,,,1.771817,,,,,0.416667,,,,0.454545,,,,,0.59792,,,,0.59792,,
2024-01-01 00:00:01,"""BTC-USDT""",,,1.001,,,,0.505,,,,,0.002365,,,,0.002365,,,,,-0.561143,,,,-0.857143,,,,,1.635442,,,,1.779743,,,,,0.461538,,,,0.5,,,,,0.59792,,,,,,
2024-01-01 00:00:01.500,"""BTC-USDT""",,,1.03,,,,0.609,,,,,0.0,,,,0.0,,,,,-0.4819,-0.467584,,,-0.457954,,,,,1.242083,1.270381,,,1.244701,,,,,0.230769,0.289474,,,0.208333,,,,,1.0,0.819455,,,1.0,,
2024-01-01 00:00:02,"""BTC-USDT""",,,1.221,2.138,,,0.663,,,,,0.0,0.0,,,-0.002365,,,,,0.10062,0.066155,,,0.81596,,,,,1.253723,1.267455,,,1.349468,,,,,0.230769,0.288462,,,0.266667,,,,,1.0,0.993349,,,1.0,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
2024-01-03 23:59:57.500,"""BTC-USDT""",10.595,120.46,1.431,2.55,300.438,31.054,0.907,5.531,61.099,0.0,6.90869,0.0,0.0,5.4131,3.581612,-0.002334,0.0,3.670367,-0.186899,0.056985,0.010181,-0.828636,-0.063079,0.585288,-0.958087,-0.442614,-0.005005,1.388809,1.178424,1.3705,1.369208,1.176213,1.302363,1.469951,1.340354,1.31637,0.364865,0.49247,0.333333,0.185185,0.41762,0.444444,0.25,0.222222,0.449153,1.508689,0.845417,,-1.0,0.892037,1.0,,0.76201,0.837502
2024-01-03 23:59:58,"""BTC-USDT""",10.56,120.937,1.384,2.461,300.915,31.531,0.893,5.466,60.661,0.002334,6.911026,0.0,0.002334,5.415436,3.583947,0.002334,0.002334,3.675038,-0.186774,0.056963,-0.959075,-0.642649,-0.063086,0.585154,-1.0,-0.434697,-0.00351,1.389407,1.178502,1.514974,1.443734,1.176243,1.303497,inf,1.348445,1.315907,0.364865,0.491729,0.2,0.136364,0.417381,0.441558,0.0,0.222222,0.452991,1.508689,0.845417,,-1.0,0.892037,1.0,,0.76201,0.837502
2024-01-03 23:59:58.500,"""BTC-USDT""",10.904,120.696,1.701,2.716,300.682,30.669,1.285,6.069,60.713,0.0,6.90869,0.0,0.0,5.4131,3.581612,0.0,-0.002334,3.670367,-0.100852,0.05545,-0.577982,-0.002844,-0.063005,0.588987,-0.178571,-0.435784,-0.00335,1.401425,1.178794,3.508759,1.487043,1.176345,1.306232,6.539965,1.355507,1.317332,0.383562,0.491729,0.333333,0.333333,0.41762,0.444444,0.5,0.222222,0.452991,1.508689,0.845417,,,0.892037,1.0,,0.76201,0.837502
2024-01-03 23:59:59,"""BTC-USDT""",10.449,120.696,1.285,2.192,300.682,30.322,,5.207,60.644,0.0,6.90869,0.0,-0.002334,5.4131,3.581612,,-0.002334,3.670367,-0.100019,0.05545,-0.178571,-0.940104,-0.063005,0.58837,,-0.361121,-0.00351,1.399912,1.178794,6.539965,1.648385,1.176345,1.305689,,1.355963,1.317259,0.388889,0.491729,0.5,0.285714,0.41762,0.44,,0.225,0.451429,1.508689,0.845417,,,0.892037,1.0,,0.76201,0.837502
