In [5]:
from typing import *
from datetime import datetime, timedelta, date
from core.time_utils import Bounds
from tqdm import tqdm

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import numpy as np
import polars as pl

# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation warnings raised by libraries — consider a narrower filter.
warnings.simplefilter("ignore")

In [17]:
# Analysis window built from two calendar dates.
# NOTE(review): `Bounds` comes from core.time_utils; the attribute names suggest a
# half-open [start_inclusive, end_exclusive) interval — confirm against its definition.
bounds: Bounds = Bounds.for_days(
    date(2025, 1, 1), date(2025, 1, 2)
)

# Lazily scan the trades dataset and materialise only the BTC-USDT ticks inside `bounds`.
df_ticks = (
    pl.scan_parquet(r"D:\data\transformed\trades")
    .filter(
        # NOTE(review): both day endpoints are kept here (is_between default) — confirm day1 is meant to be inclusive
        (pl.col("date").is_between(bounds.day0, bounds.day1)) &
        (pl.col("symbol") == "BTC-USDT") &
        # is_between defaults to closed="both"; closed="left" keeps the upper bound exclusive,
        # so a tick stamped exactly at end_exclusive is not loaded
        (pl.col("trade_time").is_between(bounds.start_inclusive, bounds.end_exclusive, closed="left"))
    )
    .collect()
)

In [20]:
# Put the ticks in chronological (ascending) order and verify the ordering took effect.
df_ticks = df_ticks.sort("trade_time")
assert df_ticks.get_column("trade_time").is_sorted()

In [21]:
# Per-tick notional value and aggressor direction, followed by their signed counterparts.
df_ticks = (
    df_ticks
    .with_columns(
        (pl.col("price") * pl.col("quantity")).alias("quote_abs"),
        # is_buyer_maker=True -> -1 (SELL), is_buyer_maker=False -> +1 (BUY)
        (1 - 2 * pl.col("is_buyer_maker")).alias("side"),
    )
    # separate pass: these expressions read the columns created just above
    .with_columns(
        (pl.col("quote_abs") * pl.col("side")).alias("quote_sign"),
        (pl.col("quantity") * pl.col("side")).alias("quantity_sign"),
    )
)

In [22]:
# Collapse all ticks that share a timestamp into a single aggregated trade.
# The frame is sorted first so that first()/last() inside each group follow time order.
df_ticks = df_ticks.sort("trade_time")

df_trades: pl.DataFrame = (
    df_ticks
    .group_by("trade_time", maintain_order=True)
    .agg(
        # first and last fill price differ when the trade had price impact
        pl.col("price").first().alias("price_first"),
        pl.col("price").last().alias("price_last"),
        # quote-asset amount spent: absolute and signed
        pl.col("quote_abs").sum().alias("quote_abs"),
        pl.col("quote_sign").sum().alias("quote_sign"),
        # base-asset amount transacted: signed and absolute
        pl.col("quantity_sign").sum().alias("quantity_sign"),
        pl.col("quantity").sum().alias("quantity_abs"),
        # number of ticks merged into this trade
        pl.col("price").count().alias("num_ticks"),
    )
    # a trade counts as long when its net signed quote amount is non-negative
    .with_columns(is_long=pl.col("quote_sign") >= 0)
)

df_trades

trade_time,price_first,price_last,quote_abs,quote_sign,quantity_sign,quantity_abs,num_ticks,is_long
datetime[ns],f64,f64,f64,f64,f64,f64,u32,bool
2025-01-01 00:00:00.010866,93576.0,93576.0,127.26336,-127.26336,-0.00136,0.00136,1,false
2025-01-01 00:00:00.074095,93576.0,93576.0,342.48816,-342.48816,-0.00366,0.00366,2,false
2025-01-01 00:00:00.091046,93576.0,93576.0,735.50736,-735.50736,-0.00786,0.00786,2,false
2025-01-01 00:00:00.266955,93576.01,93576.01,170.308338,170.308338,0.00182,0.00182,1,true
2025-01-01 00:00:00.299766,93576.0,93576.0,30007.01592,-30007.01592,-0.32067,0.32067,11,false
…,…,…,…,…,…,…,…,…
2025-01-01 23:59:58.462629,94591.78,94591.78,454.986462,-454.986462,-0.00481,0.00481,1,false
2025-01-01 23:59:58.814181,94591.78,94591.78,92.699944,-92.699944,-0.00098,0.00098,1,false
2025-01-01 23:59:58.941706,94591.78,94591.78,32.161205,-32.161205,-0.00034,0.00034,1,false
2025-01-01 23:59:59.438082,94591.79,94591.79,87.024447,87.024447,0.00092,0.00092,1,true


In [23]:
# Attach neighbouring trades' prices and timestamps to each row:
# shift(1) reads from the previous row, shift(-1) from the next row.
df_trades = df_trades.with_columns(
    pl.col("price_last").shift(1).alias("price_last_lag"),
    pl.col("trade_time").shift(1).alias("trade_time_lag"),
    pl.col("price_first").shift(-1).alias("price_first_neg_lag"),
    pl.col("trade_time").shift(-1).alias("trade_time_neg_lag"),
)

In [24]:
# Slippage: absolute gap between the quote amount actually traded and what the
# same quantity would have cost at the trade's first fill price.
df_trades = (
    df_trades
    .with_columns(
        (pl.col("quote_abs") - pl.col("price_first") * pl.col("quantity_abs"))
        .abs()
        .alias("quote_slippage_abs")
    )
    # give the slippage the sign of the trade's net signed quantity
    .with_columns(
        (pl.col("quote_slippage_abs") * pl.col("quantity_sign").sign()).alias("quote_slippage_sign")
    )
)

In [38]:
# Regular 500 ms sampling grid over the analysis window.
# pd.date_range includes both endpoints by default; inclusive="left" keeps the grid
# half-open so `bounds.end_exclusive` itself is not sampled.
index = pd.date_range(
    bounds.start_inclusive,
    bounds.end_exclusive,
    freq=timedelta(milliseconds=500),
    inclusive="left",
)

df_index: pl.DataFrame = pl.DataFrame({
    "sampled_time": index
})

# NOTE(review): `features` is assigned by the group_by_dynamic cell that appears later in
# this notebook; on a fresh Restart & Run All this line raises NameError unless that cell
# is moved above this one.
# Left join keeps every grid point; grid points with no matching feature row get nulls.
df_s = df_index.join(features, left_on="sampled_time", right_on="trade_time", how="left")

In [39]:
# Display the sampled feature table as the cell's output.
df_s

sampled_time,asset_return,asset_hold_time,flow_imbalance,slippage_imbalance,powerlaw_alpha,share_of_longs,num_agg_trades
datetime[ns],f64,f64,f64,f64,f64,f64,u32
2025-01-01 00:00:00,,,,,,,
2025-01-01 00:00:00.500,,,,,,,
2025-01-01 00:00:01,,,,,,,
2025-01-01 00:00:01.500,,,,,,,
2025-01-01 00:00:02,,,,,,,
…,…,…,…,…,…,…,…
2025-01-01 23:59:57.500,-0.256887,16.043331,0.377136,-0.99515,1.244561,0.375,88
2025-01-01 23:59:58,-0.256887,16.043331,0.377136,-0.99515,1.244561,0.375,88
2025-01-01 23:59:58.500,-0.256887,15.438697,0.365185,-0.99515,1.247207,0.376344,93
2025-01-01 23:59:59,-0.256887,15.767716,0.363557,-0.99515,1.245323,0.347826,92


In [27]:
# Rolling trade-flow features: a 15 s window re-evaluated every 500 ms.
# Windows are right-closed and each output row is labelled with its window's right edge.
features = (
    df_trades
    .group_by_dynamic(
        index_column="trade_time",
        every=timedelta(milliseconds=500),
        period=timedelta(seconds=15),
        closed="right",
        label="right",
    )
    .agg(
        # price change, scaled by 1e4, from the trade preceding the window's first trade
        # to the trade following the window's last trade.
        # NOTE(review): price_first_neg_lag is the NEXT trade's price (shift(-1)), so this
        # reads past the window's right edge — confirm the look-ahead is intended.
        asset_return=(pl.col("price_first_neg_lag").last() / pl.col("price_last_lag").first() - 1) * 1e4,
        # seconds elapsed between those same two reference trades
        asset_hold_time=(pl.col("trade_time_neg_lag").last() - pl.col("trade_time_lag").first()).dt.total_nanoseconds() / 1e9,
        # net signed quote amount as a fraction of the total quote amount
        flow_imbalance=pl.col("quote_sign").sum() / pl.col("quote_abs").sum(),
        # net signed slippage as a fraction of total slippage
        slippage_imbalance=pl.col("quote_slippage_sign").sum() / pl.col("quote_slippage_abs").sum(),
        # 1 + n / sum(log(x / x_min)) over the window's quote amounts
        # NOTE(review): looks like a maximum-likelihood power-law exponent — confirm
        powerlaw_alpha=1 + pl.len() / (pl.col("quote_abs") / pl.col("quote_abs").min()).log().sum(),
        # fraction of aggregated trades flagged is_long
        share_of_longs=pl.col("is_long").sum() / pl.len(),
        # number of aggregated trades in the window
        num_agg_trades=pl.len()
    )
    .filter(
        # is_between defaults to closed="both"; closed="left" honours the exclusive upper
        # bound so a window labelled exactly at end_exclusive is not kept
        pl.col("trade_time").is_between(bounds.start_inclusive, bounds.end_exclusive, closed="left")
    )
)

In [29]:
# Display the rolling feature table as the cell's output.
features

trade_time,asset_return,asset_hold_time,flow_imbalance,slippage_imbalance,powerlaw_alpha,share_of_longs,num_agg_trades
datetime[ns],f64,f64,f64,f64,f64,f64,u32
2025-01-01 00:00:15,,,-0.745073,-1.0,1.32021,0.459716,211
2025-01-01 00:00:15.500,-2.97619,15.128935,-0.707661,-1.0,1.325113,0.473684,209
2025-01-01 00:00:16,-2.97619,15.305759,-0.692978,-1.0,1.321661,0.504854,206
2025-01-01 00:00:16.500,-4.115371,15.28254,-0.727626,-1.0,1.315989,0.453333,225
2025-01-01 00:00:17,-4.114302,15.212003,-0.721735,-1.0,1.315476,0.460177,226
…,…,…,…,…,…,…,…
2025-01-01 23:59:57.500,-0.256887,16.043331,0.377136,-0.99515,1.244561,0.375,88
2025-01-01 23:59:58,-0.256887,16.043331,0.377136,-0.99515,1.244561,0.375,88
2025-01-01 23:59:58.500,-0.256887,15.438697,0.365185,-0.99515,1.247207,0.376344,93
2025-01-01 23:59:59,-0.256887,15.767716,0.363557,-0.99515,1.245323,0.347826,92


In [None]:
# Upsample `features` along trade_time at a 500 ms step; the result is only displayed, not stored.
features.upsample("trade_time", every=timedelta(milliseconds=500))

In [None]:
# 2 per second x 3600 s x 24 h = 172800.
# NOTE(review): presumably the expected number of 500 ms samples in one day — confirm.
2 * 3600 * 24

In [None]:
# Walk `features` in one-day windows and print each group's (rows, columns) shape.
# The loop variables `dt` and `group` stay bound after the loop finishes; a later
# cell reads `group`.
for dt, group in features.group_by_dynamic(
    index_column="trade_time", every=timedelta(days=1), period=timedelta(days=1), include_boundaries=True
):
    print(group.shape)


In [None]:
# Display `group`, i.e. whatever the last iteration of the daily loop left bound.
group