<h4>Features 11-04-2024</h4>

In [7]:
from pathlib import Path
from datetime import datetime, timedelta
from dataclasses import dataclass
from typing import *


import polars as pl
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt


ROOT_DIR = Path(os.getcwd()).parent

In [8]:
@dataclass
class PumpEvent:
    pump_id: int
    ticker: str
    time: str
    exchange: str

    def __post_init__(self):
        self.time: pd.Timestamp = pd.Timestamp(self.time)

    def __str__(self):
        return f"Pump event: {self.ticker} - {str(self.time)} on {self.exchange}"

<h4>Load data</h4>

In [9]:
# data is organized by days
pump = PumpEvent(
    pump_id=1, ticker="NXSBTC", time="2019-10-02 18:00:06", exchange="binance"
)

def load_data(pump_event: PumpEvent, lookback_delta: timedelta) -> pd.DataFrame:

    start: pd.Timestamp = pump_event.time.round("1h") - lookback_delta - timedelta(hours=1)
    end: pd.Timestamp = pump_event.time.round("1h") - timedelta(hours=1)

    date_range: List[pd.Timestamp] = pd.date_range(
        start=start,
        end=end,
        freq="D",
    ).tolist()

    df: pd.DataFrame = pd.DataFrame()

    for date in date_range:
        file_name: str = f"{pump_event.ticker}-trades-{date.date()}.parquet"
        df_date: pd.DataFrame = pd.read_parquet(
            os.path.join(
                ROOT_DIR, f"data/trades_parquet/{pump_event.exchange}/{pump_event.ticker}", file_name
            )
        )
        df = pd.concat([df, df_date])

    df["time"] = pd.to_datetime(df["time"], unit="ms")

    df = df[
        (df["time"] >= start) & (df["time"] <= end)
    ].reset_index(drop=True)

    return df

In [10]:
(pump.time - pd.Timestamp("2022-01-01")).days

-822

In [11]:
df: pd.DataFrame = load_data(pump_event=pump, lookback_delta=timedelta(days=60))
df["quote"] = df["price"] * df["qty"]
df.head()

Unnamed: 0,price,qty,time,isBuyerMaker,quote
0,2.6e-05,0.3,2019-08-03 17:04:10.179,True,8e-06
1,2.6e-05,500.0,2019-08-03 17:04:10.179,True,0.0128
2,2.6e-05,4.27,2019-08-03 17:04:10.179,True,0.000109
3,2.6e-05,22.28,2019-08-03 17:04:10.179,True,0.00057
4,2.6e-05,31.13,2019-08-03 17:04:32.735,True,0.000797


<h4>Create features</h4>

<p>Create trades from ticks by aggregation by time </p>

In [12]:
df["qty_sign"] = (1 - 2 * df["isBuyerMaker"]) * df["qty"]
df["quote_sign"] = (1 - 2 * df["isBuyerMaker"]) * df["quote"]

df_trades: pd.DataFrame = df.groupby("time").agg(
    price_first=("price", "first"),
    price_last=("price", "last"),
    price_max=("price", "max"),
    price_min=("price", "min"),
    qty_sign=("qty_sign", "sum"),
    qty_abs=("qty", "sum"),
    quote_sign=("quote_sign", "sum"),
    quote_abs=("quote", "sum")

)

df_trades = df_trades.reset_index()
df_trades.head(2)

Unnamed: 0,time,price_first,price_last,price_max,price_min,qty_sign,qty_abs,quote_sign,quote_abs
0,2019-08-03 17:04:10.179,2.6e-05,2.6e-05,2.6e-05,2.6e-05,-526.85,526.85,-0.013487,0.013487
1,2019-08-03 17:04:32.735,2.6e-05,2.6e-05,2.6e-05,2.6e-05,-31.13,31.13,-0.000797,0.000797


<h4>Calculate Imbalance ratios</h4>

In [13]:
window_sizes: List[timedelta] = [
    timedelta(days=days) for days in [1, 7, 14, 30]
]
window_names: List[str] = ["1d", "7d", "14d", "30d"]

imbalance_features = {}

for window_size, window_name in zip(window_sizes, window_names):
    df_trades_window: pd.DataFrame = df_trades[
        df_trades["time"] >= pump.time.round("1h") - timedelta(hours=1) - window_size
    ]
    imbalance_ratio: float = df_trades_window["qty_sign"].sum() / df_trades_window["qty_abs"].sum()
    imbalance_features[f"imbalance_ratio_{window_name}"] = imbalance_ratio

In [14]:
imbalance_features

{'imbalance_ratio_1d': -0.010126946693593111,
 'imbalance_ratio_7d': -0.000642763609068722,
 'imbalance_ratio_14d': -0.09333978800139883,
 'imbalance_ratio_30d': -0.08067029554323966}

<h4>Hourly Log returns features</h4>

In [15]:
df_trades_1h: pd.DataFrame = (
    df_trades
    .resample(
        rule="1h", on="time", label="right", closed="right"
    )
    .agg(
        qty_abs=("qty_abs", "sum"),
        price_first=("price_first", "first"),
        price_last=("price_last", "last"),
        num_trades=("qty_sign", "count"),
    )
)

df_trades_1h.head(2)

Unnamed: 0_level_0,qty_abs,price_first,price_last,num_trades
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-08-03 18:00:00,11830.9,2.6e-05,2.5e-05,26
2019-08-03 19:00:00,137946.18,2.5e-05,2.7e-05,242


In [16]:
# calculate log returns
df_trades_1h["price_last"] = df_trades_1h["price_last"].ffill()

df_trades_1h["log_return_1h"] = np.log(
    df_trades_1h["price_last"] / df_trades_1h["price_last"].shift(1)
)

df_trades_1h = df_trades_1h[df_trades_1h["log_return_1h"].notna()]
df_trades_1h = df_trades_1h.reset_index()
df_trades_1h.head(2)

Unnamed: 0,time,qty_abs,price_first,price_last,num_trades,log_return_1h
0,2019-08-03 19:00:00,137946.18,2.5e-05,2.7e-05,242,0.053653
1,2019-08-03 20:00:00,22686.68,2.7e-05,2.7e-05,65,0.003724


<p>Calculate means, sds of Log returns using different window sizes prior to the pump</p>

In [17]:
log_returns_features = {}

for window_size, window_name in zip(window_sizes, window_names):
    df_window: pd.DataFrame = df_trades_1h[
        df_trades_1h["time"] >= pump.time.round("1h") - timedelta(hours=1) - window_size
    ]
    # Add features
    log_returns_features[f"log_return_1h_mean_{window_name}"] = df_window["log_return_1h"].mean()
    log_returns_features[f"log_return_1h_std_{window_name}"] = df_window["log_return_1h"].std()

log_returns_features

{'log_return_1h_mean_1d': 0.002390985720589208,
 'log_return_1h_std_1d': 0.007453618516977809,
 'log_return_1h_mean_7d': 0.0009148972913921206,
 'log_return_1h_std_7d': 0.008615054913220678,
 'log_return_1h_mean_14d': 0.00047130606987897745,
 'log_return_1h_std_14d': 0.009844051093317421,
 'log_return_1h_mean_30d': 0.00012720556017540194,
 'log_return_1h_std_30d': 0.009435235729309311}

<h4>Number of trades features</h4>

In [18]:
num_trades_features = {}

for window_size, window_name in zip(window_sizes, window_names):
    df_window: pd.DataFrame = df_trades_1h[
        df_trades_1h["time"] >= pump.time.round("1h") - timedelta(hours=1) - window_size
    ]
    # Add features
    num_trades_features[f"num_returns_mean_{window_name}"] = df_window["num_trades"].mean()
    num_trades_features[f"num_trades_std_{window_name}"] = df_window["num_trades"].std()

num_trades_features

{'num_returns_mean_1d': 35.2,
 'num_trades_std_1d': 62.42662359389088,
 'num_returns_mean_7d': 24.224852071005916,
 'num_trades_std_7d': 36.36582118042705,
 'num_returns_mean_14d': 24.96439169139466,
 'num_trades_std_14d': 35.163545309750596,
 'num_returns_mean_30d': 22.74479889042996,
 'num_trades_std_30d': 32.96818843378162}

<h4>Quote asset features</h4>

In [19]:
quote_features = {}

window_sizes = [
    timedelta(days=day) for day in [1, 7, 14, 30]
]
window_names = ["1d", "7d", "14d", "30d"]

for window_size, window_name in zip(window_sizes, window_names):
    df_window: pd.DataFrame = df_trades[
        df_trades["time"] >= pump.time.round("1h") - timedelta(hours=1) - window_size
    ]
    # Add features
    quote_features[f"quote_mean_{window_name}"] = df_window["quote_abs"].mean()
    quote_features[f"quote_std_{window_name}"] = df_window["quote_abs"].std()

# quote_features

quote_features

{'quote_mean_1d': 0.01774565036754891,
 'quote_std_1d': 0.059383074769567015,
 'quote_mean_7d': 0.01666711783518202,
 'quote_std_7d': 0.05586982872384882,
 'quote_mean_14d': 0.0184559460769957,
 'quote_std_14d': 0.07372494467037488,
 'quote_mean_30d': 0.017850210055144408,
 'quote_std_30d': 0.06940183127477087}

<h4>Exchange ticker volume</h4>

In [20]:
pump.ticker

'NXSBTC'

In [23]:
df_cmc = pd.read_parquet(
    os.path.join(ROOT_DIR, "data/cmc/cmc_snapshots.parquet")
)

df_cmc["snapshot"] = pd.to_datetime(
    df_cmc["snapshot"], format="%Y%m%d"
)

df_cmc_ticker: pd.DataFrame = df_cmc[
    df_cmc["symbol"] == "NAS"
].copy()

In [25]:
df_trades["date"] = df_trades["time"].dt.floor("1d")

df_vol = df_trades.groupby("date")["quote_abs"].sum().to_frame().reset_index()
df_vol = df_vol[
    df_vol["date"] < pump.time.floor("1d")
].copy()

df_vol: pd.DataFrame = df_vol.merge(
    df_cmc_ticker[["snapshot", "trading_volume_btc"]], left_on="date", right_on="snapshot", how="left"
)

df_vol.head()

Unnamed: 0,date,quote_abs,snapshot,trading_volume_btc
0,2019-08-03,6.20881,2019-08-03,956.671008
1,2019-08-04,12.75603,2019-08-04,534.394802
2,2019-08-05,13.109295,2019-08-05,765.709136
3,2019-08-06,17.245884,2019-08-06,488.178229
4,2019-08-07,9.139646,2019-08-07,435.306469


In [34]:
daily_exchange_vol_features = {}


for window_size, window_name in zip(window_sizes, window_names):
    df_window: pd.DataFrame = df_vol[
        df_vol["date"] >= pump.time.floor("1d") - window_size
    ].copy()

    df_window["daily_exchange_vol_ratio"] = df_window["quote_abs"] / df_window["trading_volume_btc"]
    daily_exchange_vol_features[f"daily_exchange_vol_ratio_{window_name}_mean"] = (
        df_window["daily_exchange_vol_ratio"].mean() 
    )
    daily_exchange_vol_features[f"daily_exchange_vol_ratio_{window_name}_std"] = (
        df_window["daily_exchange_vol_ratio"].std() 
    )

In [36]:
pump.time

Timestamp('2019-10-02 18:00:06')

In [37]:
df_vol.tail()

Unnamed: 0,date,quote_abs,snapshot,trading_volume_btc
55,2019-09-27,3.455373,2019-09-27,379.346921
56,2019-09-28,4.939172,2019-09-28,420.595658
57,2019-09-29,8.21449,2019-09-29,450.348867
58,2019-09-30,7.250531,2019-09-30,665.522217
59,2019-10-01,21.746591,2019-10-01,397.240576
