In [1]:
from typing import *
from datetime import datetime, timedelta, date
from core.time_utils import Bounds
from tqdm import tqdm

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
import polars as pl

# Suppress every Python warning for the rest of the kernel session
# (this includes library deprecation notices and numeric RuntimeWarnings).
warnings.simplefilter("ignore")

<p>In this notebook we recompute the sampled trade features from raw ticks so that they can be validated against the values stored in the feature hive.</p>

<h4>Fetch data from Hive structure</h4>

In [2]:
# Interval under validation. Judging by the preview of df_ticks, this spans
# 2025-01-01 00:00 up to (but excluding) 2025-01-02 00:00 -- TODO confirm
# against Bounds.for_days.
bounds: Bounds = Bounds.for_days(date(2025, 1, 1), date(2025, 1, 2))
# Expand bounds such that start and end of the daily interval will not have missing values
expanded_bounds: Bounds = bounds.expand_bounds(
    lb_timedelta=timedelta(minutes=5),
    rb_timedelta=timedelta(minutes=5),
)

# NOTE(review): absolute local path; consider moving it to a configurable data directory.
df_ticks = (
    pl.scan_parquet(r"D:\data\transformed\trades")
    .filter(
        # Day-level filter on `date`, both days inclusive (presumably the hive
        # partition key, which would let whole partitions be skipped -- confirm).
        pl.col("date").is_between(expanded_bounds.day0, expanded_bounds.day1)
        & (pl.col("symbol") == "BTC-USDT")
        # `is_between` defaults to closed="both"; the upper bound is named
        # end_exclusive, so close only the left side to honour that contract.
        & pl.col("trade_time").is_between(
            expanded_bounds.start_inclusive,
            expanded_bounds.end_exclusive,
            closed="left",
        )
    )
    .collect()
)

In [3]:
# Preview the collected ticks (bare last expression -> rich table display).
df_ticks

price,quantity,trade_time,is_buyer_maker,date,symbol
f64,f64,datetime[ns],bool,date,str
93646.97,0.0007,2024-12-31 23:55:00.041,false,2024-12-31,"""BTC-USDT"""
93646.97,0.00071,2024-12-31 23:55:00.410,false,2024-12-31,"""BTC-USDT"""
93646.96,0.00342,2024-12-31 23:55:00.975,true,2024-12-31,"""BTC-USDT"""
93646.97,0.00015,2024-12-31 23:55:01.336,false,2024-12-31,"""BTC-USDT"""
93646.97,0.01067,2024-12-31 23:55:02.135,false,2024-12-31,"""BTC-USDT"""
…,…,…,…,…,…
94473.64,0.00006,2025-01-02 00:04:59.909962,true,2025-01-02,"""BTC-USDT"""
94473.64,0.00006,2025-01-02 00:04:59.909962,true,2025-01-02,"""BTC-USDT"""
94473.3,0.00006,2025-01-02 00:04:59.909962,true,2025-01-02,"""BTC-USDT"""
94473.3,0.00006,2025-01-02 00:04:59.909962,true,2025-01-02,"""BTC-USDT"""


<h4>Make sure data is sorted by trade_time</h4>

In [None]:
# Stable sort: polars' sort does not preserve the relative order of equal keys
# unless maintain_order=True. Ticks sharing a trade_time must keep their
# original order, because the per-trade first()/last() prices depend on it.
df_ticks = df_ticks.sort(by="trade_time", descending=False, maintain_order=True)
assert df_ticks["trade_time"].is_sorted()

In [None]:
# Two passes are needed: the signed columns are built from `quote_abs` and
# `side`, which only exist once the first with_columns has run.
df_ticks = (
    df_ticks
    .with_columns(
        # Notional traded in the quote asset.
        (pl.col("price") * pl.col("quantity")).alias("quote_abs"),
        # is_buyer_maker == True -> -1 (SELL), False -> +1 (BUY)
        (1 - 2 * pl.col("is_buyer_maker")).alias("side"),
    )
    .with_columns(
        (pl.col("quote_abs") * pl.col("side")).alias("quote_sign"),
        (pl.col("quantity") * pl.col("side")).alias("quantity_sign"),
    )
)

<h4>Aggregate ticks into trades on trade_time</h4>

In [None]:
# Stable sort (maintain_order=True) so that ticks with an identical trade_time
# keep their original relative order; price_first / price_last below are
# order-dependent and would otherwise be non-deterministic for multi-tick trades.
df_ticks = df_ticks.sort(by="trade_time", descending=False, maintain_order=True)

# All ticks sharing one trade_time are collapsed into a single aggregated trade.
df_trades: pl.DataFrame = (
    df_ticks
    .group_by("trade_time", maintain_order=True)
    .agg(
        price_first=pl.col("price").first(),  # if someone placed a trade with price impact, then price_first
        price_last=pl.col("price").last(),  # and price_last will differ
        # Amount spent in quote asset for the trade
        quote_abs=pl.col("quote_abs").sum(),
        quote_sign=pl.col("quote_sign").sum(),
        quantity_sign=pl.col("quantity_sign").sum(),
        # Amount of base asset transacted
        quantity_abs=pl.col("quantity").sum(),
        num_ticks=pl.col("price").count(),  # number of ticks for each trade
    )
)
# Create boolean indicating if the trade was long or short.
# A net signed quote amount of exactly zero is classified as long (>= 0).
df_trades = df_trades.with_columns(
    (pl.col("quote_sign") >= 0).alias("is_long")
)

df_trades

In [None]:
# compute slippage
df_trades = df_trades.with_columns(
    quote_slippage_abs=(pl.col("quote_abs") - pl.col("price_first") * pl.col("quantity_abs")).abs()
)
df_trades = df_trades.with_columns(
    quote_slippage_sign=pl.col("quote_slippage_abs") * pl.col("quantity_sign").sign()
)

<h4>Add lags and leads of time and price to help when resampling</h4>

In [None]:
# shift(1) pulls the value from the previous trade (lag); shift(-1) pulls the
# value from the next trade (the "neg_lag" columns, i.e. a lead).
df_trades = df_trades.with_columns(
    pl.col("price_last").shift(1).alias("price_last_lag"),
    pl.col("trade_time").shift(1).alias("trade_time_lag"),
    pl.col("price_first").shift(-1).alias("price_first_neg_lag"),
    pl.col("trade_time").shift(-1).alias("trade_time_neg_lag"),
)

In [None]:
# Rolling feature windows: a 1 s window evaluated every 500 ms, closed on the
# right edge and labelled by that right edge.
sampled_features: pl.DataFrame = (
    df_trades
    .group_by_dynamic(
        "trade_time",
        every=timedelta(milliseconds=500),
        period=timedelta(seconds=1),
        include_boundaries=True,
        closed="right",
        label="right",
    )
    .agg(
        # Return from the first to the last trade in the window, scaled by 1e4 (basis points).
        ((pl.col("price_last").last() / pl.col("price_first").first() - 1) * 1e4)
        .alias("asset_return"),
        # Seconds elapsed between the first and the last trade in the window.
        ((pl.col("trade_time").last() - pl.col("trade_time").first()).dt.total_nanoseconds() / 1e9)
        .alias("asset_hold_time"),
        # Net signed quote volume relative to total quote volume.
        (pl.col("quote_sign").sum() / pl.col("quote_abs").sum())
        .alias("flow_imbalance"),
        (pl.col("quote_slippage_sign").sum() / pl.col("quote_slippage_abs").sum())
        .alias("slippage_imbalance"),
        # alpha = 1 + n / sum(ln(x_i / x_min)) with x = quote_abs
        # (the form of the maximum-likelihood power-law exponent estimate).
        (1 + pl.len() / (pl.col("quote_abs") / pl.col("quote_abs").min()).log().sum())
        .alias("powerlaw_alpha"),
        (pl.col("is_long").sum() / pl.len()).alias("share_of_longs"),
        pl.len().alias("num_agg_trades"),
    )
    # Keep only window labels inside the unexpanded bounds.
    # NOTE(review): is_between is closed on both sides by default, so a label equal
    # to bounds.end_exclusive is retained -- confirm this is the intended convention.
    .filter(
        pl.col("trade_time").is_between(bounds.start_inclusive, bounds.end_exclusive)
    )
)

In [None]:
# Display-only check (the result is not assigned back): rescale each window's
# return to a one-second holding period; windows whose hold time is not
# strictly positive get 0 instead of a division by zero.
sampled_features.with_columns(
    pl.when(pl.col("asset_hold_time").gt(0))
    .then(
        pl.col("asset_return")
        * timedelta(seconds=1).total_seconds()
        / pl.col("asset_hold_time")
    )
    .otherwise(0)
    .alias("asset_return_adj")
)

In [None]:
# NumPy cross-check of the one-second rescaling of asset_return.
asset_return = sampled_features["asset_return"].to_numpy()
hold_time_sec = sampled_features["asset_hold_time"].to_numpy()

# Divide only where the hold time is strictly positive and leave 0 elsewhere.
# This mirrors the `when(asset_hold_time > 0) ... otherwise(0)` rule of the polars
# version, instead of silently producing inf/nan for zero hold times.
# NaN hold times compare False against 0 and therefore also map to 0.
np.divide(
    asset_return * timedelta(seconds=1).total_seconds(),
    hold_time_sec,
    out=np.zeros_like(hold_time_sec, dtype=float),
    where=hold_time_sec > 0,
)

In [None]:
# Regular 500 ms grid over the unexpanded bounds. pd.date_range includes both
# endpoints by default, so bounds.end_exclusive itself is part of the grid.
index: pd.DatetimeIndex = pd.date_range(
    start=bounds.start_inclusive,
    end=bounds.end_exclusive,
    freq=timedelta(milliseconds=500),
)
df_index: pl.DataFrame = pl.DataFrame({"sampled_time": index})

# Left join onto the grid so every grid timestamp yields exactly one row
# (windows without trades come out as nulls), then tag the sampling window.
df_features: pl.DataFrame = (
    df_index
    .join(sampled_features, how="left", left_on="sampled_time", right_on="trade_time")
    .with_columns(pl.lit("500MS").alias("window"))
)

In [None]:
# Names of the feature columns produced by the window aggregation
# (num_agg_trades is intentionally not listed here, as in the original).
cols: List[str] = [
    "asset_return",
    "asset_hold_time",
    "flow_imbalance",
    "slippage_imbalance",
    "powerlaw_alpha",
    "share_of_longs",
]

<h4>Validate that everything is computed correctly</h4>

In [None]:
from core.paths import FEATURE_DIR
from core.time_utils import Bounds
from datetime import date
from feature_writer.utils import to_wide_format
from core.currency_pair import CurrencyPair


import polars as pl


# Lazily scan the stored feature hive; nothing is read until .collect().
hive: pl.LazyFrame = pl.scan_parquet(FEATURE_DIR)

# NOTE(review): this rebinds `bounds` (used by the cells above) and selects
# 2024-01-01..2024-01-04, whereas the features recomputed above cover
# 2025-01-01 -- confirm the date range before comparing the two.
bounds: Bounds = Bounds.for_days(date(2024, 1, 1), date(2024, 1, 4))

# Restrict to the BTC-USDT pair and to the days covered by `bounds` (both inclusive).
data = (
    hive
    .filter(
        (pl.col("currency_pair") == CurrencyPair.from_string("BTC-USDT").name)
        & pl.col("date").is_between(bounds.day0, bounds.day1)
    )
    .collect()
)

In [None]:
# Display the hive features after reshaping with the project helper
# to_wide_format (presumably one column per feature -- confirm in feature_writer.utils).
to_wide_format(data)