<h4>Features 11-04-2024</h4>

In [3]:
from pathlib import Path
from datetime import datetime, timedelta
from dataclasses import dataclass
from typing import *


import polars as pl
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt


# Project root: the parent of the directory the notebook is run from.
ROOT_DIR = Path.cwd().parent

In [2]:
@dataclass
class PumpEvent:
    """A single pump event on one exchange.

    ``time`` is passed in as a string and is replaced by a ``pd.Timestamp``
    right after construction, so every later use sees a timestamp.
    """

    pump_id: int
    ticker: str
    time: str  # converted to pd.Timestamp in __post_init__
    exchange: str

    def __post_init__(self):
        # Parse once so callers can use Timestamp methods (floor, round, ...).
        self.time = pd.Timestamp(self.time)

    def __str__(self):
        when = str(self.time)
        return f"Pump event: {self.ticker} - {when} on {self.exchange}"

<h4>Load data</h4>

In [3]:
# The pump event under study. Its trade data is stored as one parquet file per day.
pump = PumpEvent(
    pump_id=1,
    ticker="DNTBTC",
    time="2021-02-03 19:00:06",
    exchange="binance",
)

def load_data(pump: PumpEvent, lookback_delta: timedelta) -> pd.DataFrame:
    """Load the tick trades that precede a pump event.

    Trades are stored as one parquet file per calendar day under
    ``ROOT_DIR/data/trades_parquet/<exchange>/<ticker>/``. Every daily file
    that overlaps the requested window is read, and the result is trimmed to
    ``[start, end]``, where ``end`` is the pump time floored to the hour and
    ``start = end - lookback_delta``.

    Args:
        pump: Event whose ``ticker``, ``exchange`` and ``time`` select the files.
        lookback_delta: Length of history to load before the floored pump hour.
            Must not be negative.

    Returns:
        Trades with ``start <= time <= end``, with ``time`` converted from
        epoch milliseconds to timestamps and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``lookback_delta`` is negative.

    Errors raised by ``pd.read_parquet`` (e.g. for a missing daily file)
    propagate unchanged.
    """
    if lookback_delta < timedelta(0):
        raise ValueError(
            f"lookback_delta must be non-negative, got {lookback_delta}"
        )

    end: pd.Timestamp = pump.time.floor("1h")
    start: pd.Timestamp = end - lookback_delta

    # Iterate over calendar days, not over 24h steps from `start`: stepping
    # from a mid-day `start` skips the final day's file whenever `end` falls at
    # an earlier time of day than `start` (e.g. with a 20-hour lookback).
    days = pd.date_range(start=start.floor("D"), end=end.floor("D"), freq="D")

    data_dir: str = os.path.join(
        ROOT_DIR, f"data/trades_parquet/{pump.exchange}/{pump.ticker}"
    )

    # Read every daily file first and concatenate once, instead of growing a
    # DataFrame inside the loop (quadratic copying, and seeding with an empty
    # frame can disturb column dtypes).
    daily_frames: List[pd.DataFrame] = [
        pd.read_parquet(
            os.path.join(data_dir, f"{pump.ticker}-trades-{day.date()}.parquet")
        )
        for day in days
    ]
    df: pd.DataFrame = pd.concat(daily_frames)

    # Raw files store the trade time as epoch milliseconds.
    df["time"] = pd.to_datetime(df["time"], unit="ms")

    in_window = (df["time"] >= start) & (df["time"] <= end)
    return df[in_window].reset_index(drop=True)

In [None]:
# Scratch check of the hour-rounding logic: show the raw pump timestamp first.
pump.time

In [None]:
# Nearest-hour rounding minus one hour, for comparison with the raw timestamp.
# Note: load_data uses floor("1h") for its window end, not round("1h").
pump.time.round("1h") - timedelta(hours=1)

In [4]:
# Load 30 days of ticks before the pump hour and add each tick's quote value
# (price * quantity).
df: pd.DataFrame = load_data(
    pump=pump,
    lookback_delta=timedelta(days=30),
).assign(quote=lambda ticks: ticks["price"] * ticks["qty"])
df.head()

Unnamed: 0,price,qty,time,isBuyerMaker,quote
0,1e-06,1882.0,2021-01-04 19:04:10.774,True,0.002691
1,1e-06,3873.0,2021-01-04 19:04:10.774,True,0.005538
2,1e-06,3337.0,2021-01-04 19:04:10.774,True,0.004772
3,1e-06,2289.0,2021-01-04 19:04:10.774,True,0.003273
4,1e-06,113.0,2021-01-04 19:05:13.068,False,0.000163


In [5]:
# Sanity check: the loaded window should hold no ticks at or after the pump time.
df.loc[df["time"] >= pump.time]

Unnamed: 0,price,qty,time,isBuyerMaker,quote


<h4>Create features</h4>

<p>Aggregate ticks that share the same timestamp into single trades.</p>

In [6]:
# Direction of each tick: -1 when the buyer is the maker, +1 otherwise.
side = 1 - 2 * df["isBuyerMaker"]
df["qty_sign"] = side * df["qty"]
df["quote_sign"] = side * df["quote"]

# One output row per timestamp: price extremes plus signed and unsigned volume.
tick_aggregations = {
    "price_first": ("price", "first"),
    "price_last": ("price", "last"),
    "price_max": ("price", "max"),
    "price_min": ("price", "min"),
    "qty_sign": ("qty_sign", "sum"),
    "qty_abs": ("qty", "sum"),
    "quote_sign": ("quote_sign", "sum"),
    "quote_abs": ("quote", "sum"),
    # TODO: add BTC slippage
}

df_trades: pd.DataFrame = df.groupby("time").agg(**tick_aggregations).reset_index()
df_trades.head(2)

Unnamed: 0,time,price_first,price_last,price_max,price_min,qty_sign,qty_abs,quote_sign,quote_abs
0,2021-01-04 19:04:10.774,1e-06,1e-06,1e-06,1e-06,-11381.0,11381.0,-0.016275,0.016275
1,2021-01-04 19:05:13.068,1e-06,1e-06,1e-06,1e-06,113.0,113.0,0.000163,0.000163


<h4>Calculate Exchange volume features</h4>

In [7]:
# CoinMarketCap snapshots: parse the YYYYMMDD `snapshot` field into a proper
# `date` column and order the rows chronologically.
df_cmc: pd.DataFrame = (
    pd.read_parquet(os.path.join(ROOT_DIR, "data/cmc/cmc_snapshots.parquet"))
    .assign(
        date=lambda snapshots: pd.to_datetime(
            snapshots["snapshot"], format="%Y%m%d"
        )
    )
    .sort_values("date")
)

df_cmc.head(2)

Unnamed: 0,name,symbol,slug,cmc_rank,mcap_usdt,mcap_btc,snapshot,trading_volume_usdt,trading_volume_btc,date
0,Bitcoin,BTC,bitcoin,1,16050410000.0,16655710.0,20170101,147775200.0,153348.114011,2017-01-01
420,Evotion,EVO,evotion,421,5026.011,5.215554,20170101,2.010798,0.002087,2017-01-01


In [8]:
# Keep only the CMC rows of the pumped coin for the 30 days before the pump day.
# The CMC symbol is the base asset of the pair, i.e. the exchange ticker without
# its quote suffix ("DNTBTC" -> "DNT"), so it follows `pump` instead of being
# hard-coded.
# NOTE(review): assumes BTC-quoted tickers; other quote assets need their own rule.
QUOTE_ASSET: str = "BTC"
cmc_symbol: str = (
    pump.ticker[: -len(QUOTE_ASSET)]
    if pump.ticker.endswith(QUOTE_ASSET) and len(pump.ticker) > len(QUOTE_ASSET)
    else pump.ticker
)
pump_day: pd.Timestamp = pump.time.floor("1d")

df_cmc_ticker: pd.DataFrame = df_cmc[
    (df_cmc["symbol"] == cmc_symbol)
    & (df_cmc["date"] < pump_day)
    & (df_cmc["date"] >= pump_day - timedelta(days=30))
].copy()

In [None]:
# Share of the coin's CMC-reported daily volume that was traded on this
# exchange, summarised as mean/std over several look-back windows that end the
# day before the pump.
daily_exchange_vol_features: Dict[str, float] = {}

pump_day: pd.Timestamp = pump.time.floor("1d")

df_trades["date"] = df_trades["time"].dt.floor("1d")

# Exchange quote volume per calendar day; the pump day itself is excluded.
df_daily_vol: pd.DataFrame = (
    df_trades.groupby("date")["quote_abs"].sum().to_frame().reset_index()
)
df_daily_vol = df_daily_vol[df_daily_vol["date"] < pump_day].copy()

# NOTE(review): when the load window starts mid-day, the first loaded day is
# only partially covered, so its share is understated. Confirm whether that
# day should be dropped from the longest window.
df_daily_vol = df_daily_vol.merge(
    df_cmc_ticker[["date", "trading_volume_btc"]], on="date", how="left"
)

# A zero CMC volume would make the share inf, and a single inf turns the window
# mean into inf and the std into NaN. Treat such days as missing instead.
cmc_volume: pd.Series = df_daily_vol["trading_volume_btc"]
df_daily_vol["daily_exchange_volume_share"] = (
    df_daily_vol["quote_abs"] / cmc_volume.where(cmc_volume > 0)
)

window_sizes: List[timedelta] = [
    timedelta(days=3),
    timedelta(days=7),
    timedelta(days=14),
    timedelta(days=30),
]

window_names: List[str] = ["3d", "7d", "14d", "30d"]

for window_size, window_name in zip(window_sizes, window_names):
    # Days in [pump_day - window, pump_day). The upper bound repeats the filter
    # above and is kept only as a safeguard.
    df_window: pd.DataFrame = df_daily_vol[
        (df_daily_vol["date"] >= pump_day - window_size)
        & (df_daily_vol["date"] < pump_day)
    ].copy()

    window_share: pd.Series = df_window["daily_exchange_volume_share"]

    daily_exchange_vol_features[
        f"daily_exchange_volume_share_{window_name}_mean"
    ] = window_share.mean()
    daily_exchange_vol_features[
        f"daily_exchange_volume_share_{window_name}_std"
    ] = window_share.std()

In [None]:
# Inspect the computed exchange-volume-share features.
daily_exchange_vol_features

In [None]:
# Tag every trade with its calendar day; the daily volume is aggregated from
# this column in a later cell.
df_trades["date"] = df_trades["time"].dt.floor("1d")
df_trades.head(2)

In [None]:
# For reference: the exact pump timestamp.
pump.time

In [None]:
# Total quote volume traded on the exchange per calendar day, keeping only the
# days strictly before the pump day.
df_daily_vol: pd.DataFrame = (
    df_trades.groupby("date")["quote_abs"]
    .sum()
    .reset_index()
    .loc[lambda daily: daily["date"] < pump.time.floor("1d")]
    .copy()
)

In [None]:
# Check the last rows: the most recent day should be the day before the pump.
df_daily_vol.tail()

In [None]:
# Attach the CMC-reported total daily volume to each exchange day.
# Drop any previously merged column first, so re-running this cell does not
# create suffixed duplicates (trading_volume_btc_x / trading_volume_btc_y).
df_daily_vol = df_daily_vol.drop(
    columns=["trading_volume_btc"], errors="ignore"
).merge(df_cmc_ticker[["date", "trading_volume_btc"]], on="date", how="left")

In [None]:
# Inspect the merged daily frame (exchange volume next to CMC volume).
df_daily_vol

In [None]:
# Exchange share of the coin's CMC-reported daily volume. Days with a zero (or
# negative) CMC volume are treated as missing, so the ratio never becomes inf.
cmc_volume: pd.Series = df_daily_vol["trading_volume_btc"]
df_daily_vol["daily_exchange_volume_share"] = (
    df_daily_vol["quote_abs"] / cmc_volume.where(cmc_volume > 0)
)

ax = df_daily_vol.plot(x="date", y="daily_exchange_volume_share")

# Mark the pump day with a red vertical line.
ax.axvline(x=pump.time.floor("1d"), color="red")
plt.show()

In [None]:
# Inspect the seven days of daily volume immediately before the pump day.
df_daily_vol.loc[
    lambda daily: (daily["date"] < pump.time.floor("1d"))
    & (daily["date"] >= pump.time.floor("1d") - timedelta(days=7))
]

<h4>BTC slippage</h4>

$$\text{BTC\_lost\_to\_slippage} = \underbrace{\sum_{i=1}^{N}{\text{qty\_sign}_i \cdot P_i}}_{\text{Quote actually spent}} - \underbrace{\sum_{i=1}^{N}{\text{qty\_sign}_i \cdot P_0}}_{\text{Quote that would have been spent if every fill were at the first price } P_0}$$

In [None]:
# Per-trade slippage: the signed quote actually exchanged minus the signed
# quantity valued at the trade's first fill price.
df_trades["quote_slippage"] = df_trades["quote_sign"].sub(
    df_trades["qty_sign"].mul(df_trades["price_first"])
)

# Running total of slippage over the whole loaded window.
df_trades["quote_slippage"].cumsum().plot()
plt.show()

In [None]:
# Slippage imbalance ratio per look-back window ending at the floored pump hour:
#   (slippage on net-buy trades - slippage on net-sell trades) / total |slippage|
# The ratio lies in [-1, 1]; it is NaN when the window has no slippage at all.
window_names: List[str] = ["1d", "7d", "14d", "30d"]
window_sizes: List[timedelta] = [
    timedelta(days=1),
    timedelta(days=7),
    timedelta(days=14),
    timedelta(days=30),
]

slippage_features: Dict[str, float] = {}

window_end: pd.Timestamp = pump.time.floor("1h")

for window_size, window_name in zip(window_sizes, window_names):
    df_window: pd.DataFrame = df_trades[
        (df_trades["time"] >= window_end - window_size)
        & (df_trades["time"] <= window_end)
    ]

    # Slippage signed by the trade's net direction (+ for buys, - for sells).
    quote_slippage_net: float = (
        df_window["quote_slippage"] * np.sign(df_window["qty_sign"])
    ).sum()
    # Normalise by the absolute slippage: a plain sum lets negative entries
    # cancel out and can push the ratio outside [-1, 1].
    quote_slippage_abs: float = df_window["quote_slippage"].abs().sum()

    # Guard the empty / zero-slippage window instead of dividing by zero.
    slippage_features[f"slippage_imbalance_ratio_{window_name}"] = (
        quote_slippage_net / quote_slippage_abs
        if quote_slippage_abs > 0
        else np.nan
    )