<h4>Mironov Mikhail. Master Thesis. Main research notebook</h4>

In [1]:
import warnings
from datetime import date, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from core.currency import Currency
from core.exchange import Exchange
from core.time_utils import Bounds
from core.utils import configure_logging
from ml_base.features import FeatureFilter, get_importance_file_path
from ml_base.metrics import log_lgbm_iteration_to_stdout
from typing import *

warnings.simplefilter("ignore")

configure_logging()

<h4>Display how raw data looks</h4>

In [None]:
from core.paths import BINANCE_SPOT_HIVE_TRADES
from core.time_utils import Bounds
from datetime import date

import pandas as pd
import polars as pl


# Pick a single day of trades to preview.
bounds: Bounds = Bounds.for_day(date(2025, 5, 1))

# Scan the hive-partitioned trades dataset and collect only the first five
# ADA-USDT rows for the chosen day.
df = (
    pl.scan_parquet(BINANCE_SPOT_HIVE_TRADES, hive_partitioning=True)
    .filter(
        (pl.col("symbol") == "ADA-USDT") &
        (pl.col("date") == bounds.day0)
    )
    .head(5)
    .collect()
)

# Make the preview the cell output: the section promises to *display* the raw
# data, but an assignment alone renders nothing.
df

<h4>Load most significant features</h4>

In [3]:
# Resolve the importance file for the reference day / exchange / horizon,
# then keep only the first 25 features listed in it.
importance_path = get_importance_file_path(
    day=date(2025, 5, 25),
    target_exchange=Exchange.BINANCE_SPOT,
    forecast_step=timedelta(seconds=3),
)
feature_filter: FeatureFilter = FeatureFilter.from_importance(
    importance_path, use_first=25
)

# Show the first ten retained feature names.
feature_filter.allowed_features[:10]

2025-05-27 13:50:53,255 | INFO | root | Loading feature importance file D:\microstructure_ranker\src\models\prediction\artifacts\feature_importances\BINANCE_SPOT\BINANCE_SPOT-importances-3S@20250525.csv


['ETH-asset_return-2S@BINANCE_SPOT',
 'ETH-asset_return-1S@BINANCE_SPOT',
 'SELF-flow_imbalance-2S@BINANCE_USDM',
 'currency_index',
 'ETH-asset_return-500MS@BINANCE_SPOT',
 'SELF-flow_imbalance-5S@BINANCE_USDM',
 'SELF-exchange_diff-BINANCE_SPOT-BINANCE_USDM-2S',
 'ETH-asset_return-5S@BINANCE_SPOT',
 'SELF-flow_imbalance-1S@BINANCE_USDM',
 'SELF-exchange_diff-BINANCE_SPOT-BINANCE_USDM-500MS']

<h4>Define bounds for TRAIN and TEST samples</h4>

In [None]:
# First and last calendar day handed to Bounds.for_days for the modelling sample.
train_start, train_end = date(2025, 4, 1), date(2025, 5, 5)
train_bounds: Bounds = Bounds.for_days(train_start, train_end)

<h4>Build model manually</h4>


<p>Using BuildDataset, read all features and split them into TRAIN and VALIDATION samples.</p>

In [None]:
from core.currency import get_target_currencies
from models.prediction.build_sample import BuildDataset
from ml_base.sample import SampleParams, Sample, MLDataset
from ml_base.enums import DatasetType

# Configure the dataset builder for the target exchange and forecast horizon.
# NOTE(review): feature_filter was loaded from the BINANCE_SPOT / 3-second
# importance file, while this builder targets OKX_SPOT / 5 seconds — confirm
# that reusing the filter across exchange and horizon is intended.
dataset_builder = BuildDataset(
    target_exchange=Exchange.OKX_SPOT,
    feature_filter=feature_filter,
    target_currencies=get_target_currencies(),
    forecast_step=timedelta(seconds=5),
)

# 80 % of the sample goes to TRAIN, 20 % to VALIDATION.
split_params = SampleParams(train_share=.8, validation_share=.2)

sample: Sample = dataset_builder.create_sample(
    bounds=train_bounds,
    sample_params=split_params,
)

<h4>Visualize data</h4>

<p>We can get DataFrame from MLDataset</p>

In [None]:
# Fetch one frame per split from the sample.
df_train, df_val = (
    sample.get_data(ds_type=split)
    for split in (DatasetType.TRAIN, DatasetType.VALIDATION)
)

# Shapes of the TRAIN and VALIDATION frames.
df_train.shape, df_val.shape

<p>MLDataset.eval_fields stores asset_hold_time. Here we check the actual time between the trades used to compute returns for the different time horizons.</p>

In [None]:
# Fetch the evaluation data of each split via Sample.get_eval_data
# (per the markdown above, this presumably includes asset_hold_time — confirm).
train_eval: pd.DataFrame = sample.get_eval_data(ds_type=DatasetType.TRAIN)
val_eval: pd.DataFrame = sample.get_eval_data(ds_type=DatasetType.VALIDATION)

In [None]:
currencies: List[Currency] = [Currency.BTC, Currency.ETH, Currency.HBAR]

# For each currency, average every evaluation column over the TRAIN rows whose
# currency_index equals that currency's enum value.
vals: List[pd.Series] = [
    train_eval[df_train["currency_index"] == currency.value].mean()
    for currency in currencies
]

In [None]:
# Compare the averaged evaluation fields across currencies, e.g. to see how a
# less traded currency like HBAR differs: one column per currency.
currency_names = [currency.name for currency in currencies]

df_liquidity = pd.DataFrame(vals).transpose()
df_liquidity.columns = currency_names

df_liquidity

In [None]:
# Summarise a random subset of at most 5M TRAIN rows.
# The size is capped at len(df_train) because DataFrame.sample raises a
# ValueError when asked for more rows than exist with replace=False; the fixed
# random_state keeps the summary reproducible across re-runs.
DESCRIBE_SAMPLE_SIZE: int = 5_000_000

df_train.sample(
    n=min(DESCRIBE_SAMPLE_SIZE, len(df_train)),
    replace=False,
    random_state=42,
).describe().T

<h4>Train the model with early stopping on the validation sample</h4>

In [None]:
# Now train the model using LightGBM
from lightgbm import Booster, record_evaluation
from typing import *
import lightgbm as lgb
import os

# os.cpu_count() may return None, and on a single-core machine "count - 1"
# would be 0. Leave one core free when possible, but never request fewer than
# one thread.
_NUM_THREADS: int = max(1, (os.cpu_count() or 1) - 1)

# NOTE(review): per the LightGBM parameter docs, bagging is only applied when
# bagging_freq (alias subsample_freq) is > 0, so "subsample" alone presumably
# has no effect here — confirm whether row subsampling was intended.
_BASE_PARAMS: Dict[str, Any] = {
    "objective": "mse",
    "max_depth": 10,
    "learning_rate": 0.05,
    "n_estimators": 120,
    "subsample": 0.7,
    "num_threads": _NUM_THREADS,
    "verbose": -1
}

# Filled by record_evaluation: evals_result[<valid name>][<metric>] -> values per iteration.
evals_result: Dict[str, Dict[str, List[float]]] = {}

train: lgb.Dataset = sample.get_lgb_dataset(ds_type=DatasetType.TRAIN)
validation: lgb.Dataset = sample.get_lgb_dataset(ds_type=DatasetType.VALIDATION)

# Both splits are evaluated each iteration so their curves can be plotted;
# training stops early after 50 rounds without improvement.
booster: Booster = lgb.train(
    params=_BASE_PARAMS,
    train_set=train,
    valid_sets=[train, validation],
    valid_names=["train", "validation"],
    callbacks=[
        record_evaluation(evals_result),
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        log_lgbm_iteration_to_stdout
    ]
)

In [None]:
# Train and validation L2 live on different scales, so each gets its own
# y-axis; the axis labels are coloured like their curves so the two scales
# cannot be confused.
fig, ax1 = plt.subplots()

ax1.set_xlabel('Iteration')
ax1.set_ylabel('Train L2', color="red")
ax1.plot(evals_result["train"]["l2"], label='Train L2', color="red")

# create a second y-axis sharing the same x
ax2 = ax1.twinx()
ax2.set_ylabel('Validation L2', color="blue")
ax2.plot(evals_result["validation"]["l2"], label='Validation L2', color="blue")

ax1.set_title("L2 loss per boosting iteration")
fig.legend()
fig.tight_layout()

<h4>Evaluate the model</h4>

In [None]:
# Predict on the VALIDATION split, using the booster's best_iteration
# (the training cell enables early stopping).
val_ds: MLDataset = sample.get_dataset(ds_type=DatasetType.VALIDATION)
y_pred: np.ndarray = booster.predict(val_ds.data, num_iteration=booster.best_iteration)

In [None]:
from sklearn.metrics import r2_score, classification_report, accuracy_score

# Out-of-sample R2 of the predictions on the VALIDATION split.
r2_score(y_true=val_ds.label, y_pred=y_pred)

In [None]:
# Reduce the regression task to direction: 1 when the value is strictly
# positive, 0 otherwise.
y_true_binary: np.ndarray = (val_ds.label > 0).astype(int)
y_pred_binary: np.ndarray = (y_pred > 0).astype(int)

direction_report = classification_report(y_true=y_true_binary, y_pred=y_pred_binary)
print(direction_report)

In [None]:
# Share of validation rows whose predicted sign matches the realised sign.
accuracy_score(y_true=y_true_binary, y_pred=y_pred_binary)

In [None]:
# display r2 by currency
from ml_base.metrics import compute_metrics


# Evaluate the trained booster on the VALIDATION dataset for the target currencies.
compute_metrics(
    booster=booster,
    dataset=val_ds,
    target_currencies=get_target_currencies(),
)

<h4>Load feature importances and statistics</h4>

In [None]:
from models.prediction.horizon import get_statitics_path, HORIZONS
from core.time_utils import get_seconds_slug
from ml_base.features import get_importance_file_path
from pathlib import Path
import pandas as pd


# Exchange and reference day used to locate the saved statistics / importance files below.
target_exchange: Exchange = Exchange.BINANCE_SPOT
ref_day: date = date(2025, 5, 25)


def load_stats(target_exchange: Exchange, ref_day: date) -> pd.DataFrame:
    """Read the per-horizon statistics CSVs of one exchange and stack them.

    For every forecast step in HORIZONS the CSV located by get_statitics_path
    is read and tagged with a ``forecast_seconds`` column holding
    ``get_seconds_slug(td=forecast_step)``.

    Args:
        target_exchange: exchange whose statistics files are read.
        ref_day: day passed to get_statitics_path to locate the files.

    Returns:
        Row-wise concatenation of all per-horizon frames; the original row
        index of each CSV is kept as-is.
    """

    def _read_horizon(forecast_step) -> pd.DataFrame:
        # One CSV per forecast horizon.
        stats_path: Path = get_statitics_path(
            target_exchange=target_exchange, forecast_step=forecast_step, day=ref_day
        )
        horizon_stats: pd.DataFrame = pd.read_csv(stats_path)
        horizon_stats["forecast_seconds"] = get_seconds_slug(td=forecast_step)
        return horizon_stats

    return pd.concat([_read_horizon(step) for step in HORIZONS])

In [None]:
# Statistics for each exchange, all taken at the same reference day.
binance_spot: pd.DataFrame = load_stats(Exchange.BINANCE_SPOT, ref_day)
binance_usdm: pd.DataFrame = load_stats(Exchange.BINANCE_USDM, ref_day)
okx_spot: pd.DataFrame = load_stats(Exchange.OKX_SPOT, ref_day)

In [None]:
# Build one wide table: rows are (currency, forecast_seconds), columns are
# grouped under a top level named after the exchange.
#
# The per-exchange frames are NOT rebound here, so the cell can be re-run
# safely (rebinding them to their indexed versions made a second run fail with
# a KeyError, because the index columns were already gone).
_STATS_INDEX: List[str] = ["currency", "forecast_seconds"]

# Passing a dict to pd.concat(axis=1) uses the keys as the outer column level
# and aligns the blocks on the shared row index.
df_combined = pd.concat(
    {
        "BINANCE_SPOT": binance_spot.set_index(_STATS_INDEX),
        "BINANCE_USDM": binance_usdm.set_index(_STATS_INDEX),
        "OKX_SPOT": okx_spot.set_index(_STATS_INDEX),
    },
    axis=1,
).sort_index(level=_STATS_INDEX)

In [None]:
# Render the combined per-exchange statistics table.
df_combined

In [None]:
# Inspect the first level ("currency") of the first index entry — a quick check of the index layout.
df_combined.index[0][0]

In [None]:
# Round for presentation; float_format below keeps exactly three decimals.
df_combined = df_combined.round(3)

# Derive the LaTeX column spec from the frame: one left-aligned column per
# index level and one right-aligned column per data column. A hard-coded spec
# goes stale as soon as the number of exchanges or statistics changes.
column_format: str = "l" * df_combined.index.nlevels + "r" * df_combined.shape[1]

latex = df_combined.to_latex(
    index=True,
    multicolumn=True,
    multirow=True,
    float_format="%.3f",
    column_format=column_format,
)

# Underscores are replaced with dashes before printing the LaTeX source.
print(latex.replace("_", "-"))

In [None]:
# Plot R2 against the forecast horizon, one line per currency, for one exchange.
target_exchange: Exchange = Exchange.OKX_SPOT
df_stats: pd.DataFrame = load_stats(target_exchange=target_exchange, ref_day=ref_day)

fig, ax = plt.subplots(figsize=(9, 5))

for currency in df_stats["currency"].unique():
    currency_rows = df_stats.loc[df_stats["currency"] == currency]
    currency_rows.plot(x="forecast_seconds", y="R2", ax=ax, label=currency)

ax.set_ylabel("R2")
ax.set_title(f"R2 by currency at {target_exchange.name} against forecast horizon")
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc='center left', bbox_to_anchor=(1.0, 0.5))
fig.tight_layout()
fig.savefig(f"r2_by_currency@{target_exchange.name.lower()}.png")

<h4>Study feature importances</h4>

In [None]:
def load_importances(target_exchange: Exchange, ref_day: date) -> pd.DataFrame:
    """Read the per-horizon feature-importance CSVs of one exchange and stack them.

    For every forecast step in HORIZONS the CSV located by
    get_importance_file_path is read and tagged with a ``forecast_seconds``
    column holding ``forecast_step.total_seconds()``.

    Args:
        target_exchange: exchange whose importance files are read.
        ref_day: day passed to get_importance_file_path to locate the files.

    Returns:
        Row-wise concatenation of all per-horizon frames; the original row
        index of each CSV is kept as-is.
    """
    per_horizon: List[pd.DataFrame] = [
        pd.read_csv(
            get_importance_file_path(
                target_exchange=target_exchange, forecast_step=step, day=ref_day
            )
        ).assign(forecast_seconds=step.total_seconds())
        for step in HORIZONS
    ]
    return pd.concat(per_horizon)

In [None]:
# Feature importances across all horizons for the two Binance venues.
df_spot: pd.DataFrame = load_importances(Exchange.BINANCE_SPOT, ref_day)
df_usdm: pd.DataFrame = load_importances(Exchange.BINANCE_USDM, ref_day)

In [None]:
import pandas as pd

# Side-by-side ranking of the most important features at the 3-second horizon
# for BINANCE_SPOT and BINANCE_USDM, exported as LaTeX.
TOP_N: int = 20
HORIZON_SECONDS: float = 3


def _top_features(df_importances: pd.DataFrame, horizon_seconds: float, top_n: int) -> pd.Series:
    """Return the names of the ``top_n`` most important features at one horizon.

    Args:
        df_importances: frame with ``feature``, ``importance`` and
            ``forecast_seconds`` columns.
        horizon_seconds: value of ``forecast_seconds`` to keep.
        top_n: number of ranks in the result.

    Returns:
        Series indexed by rank 1..top_n holding feature names in descending
        importance. Ranks for which no feature exists are filled with ``""``,
        so fewer than ``top_n`` features no longer raises a length mismatch.
    """
    ranked = (
        df_importances[df_importances["forecast_seconds"] == horizon_seconds]
        .sort_values("importance", ascending=False)
        .head(top_n)
    )
    names = pd.Series(ranked["feature"].to_numpy(), index=range(1, len(ranked) + 1), dtype=object)
    return names.reindex(range(1, top_n + 1)).fillna("")


# `result_3s` is a TOP_N×2 DataFrame:
#      index = rank 1…TOP_N
#      columns = BINANCE_SPOT, BINANCE_USDM
#      values = feature names sorted by importance for the 3s horizon
result_3s = pd.DataFrame(
    {
        "BINANCE_SPOT": _top_features(df_spot, HORIZON_SECONDS, TOP_N),
        "BINANCE_USDM": _top_features(df_usdm, HORIZON_SECONDS, TOP_N),
    }
)

print(result_3s.to_latex().replace("_", "-"))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Number of features shown in the heatmap; used for both the slice and the
# title so the two cannot drift apart.
HEATMAP_TOP_N: int = 20

# 1) Pivot to feature × horizon. The BINANCE_SPOT importances (df_spot) are
#    used, matching the output file name; `df_importances` exists only as a
#    local inside load_importances, so referencing it here raised a NameError
#    on a fresh kernel.
df_wide = (
    df_spot
    .pivot(index="feature", columns="forecast_seconds", values="importance")
    .fillna(0)
)

# 2) Normalize within each horizon (optional, so columns sum to 1)
df_norm = df_wide.div(df_wide.sum(axis=0), axis=1)

# 3) Sort features by *mean* importance across all horizons
feature_order = df_norm.mean(axis=1).sort_values(ascending=False).index
df_norm = df_norm.reindex(feature_order)

plt.figure(figsize=(10, 5))
sns.heatmap(
    df_norm.iloc[:HEATMAP_TOP_N],
    cmap="viridis",
    cbar_kws={"label": "Relative importance"},
    linewidths=0.5,
)
plt.xlabel("Forecast horizon")
plt.ylabel("Feature")
plt.title(f"Top {HEATMAP_TOP_N} features by normalized importance")
plt.tight_layout()
plt.savefig("feature_importances_against_all_horizons@binance_spot.png")
plt.show()