<h4>Mironov Mikhail. Master Thesis. Main research notebook</h4>

In [1]:
from datetime import date, datetime, timedelta
from core.time_utils import Bounds
from core.exchange import Exchange
from core.currency import Currency
from core.utils import configure_logging
from ml_base.features import FeatureFilter, get_importance_file_path
from ml_base.metrics import log_lgbm_iteration_to_stdout

from lightgbm import Booster

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every Python warning for the rest of the kernel session.
warnings.simplefilter(action="ignore")

# Project-level logging setup (helper imported from core.utils).
configure_logging()

In [2]:
# Keep only the first 25 entries of the saved feature-importance file
# (the most impactful features).
importance_file = get_importance_file_path(
    day=date(2025, 5, 25),
    target_exchange=Exchange.BINANCE_SPOT,
)
feature_filter: FeatureFilter = FeatureFilter.from_importance(importance_file, use_first=25)

# Show the first ten selected feature names as the cell output.
feature_filter.allowed_features[:10]

2025-05-26 12:42:00,082 | INFO | root | Loading feature importance file D:\microstructure_ranker\src\models\prediction\feature_importances\BINANCE_SPOT-importances@20250525.csv


['SELF-flow_imbalance-2S@BINANCE_USDM',
 'ETH-asset_return-5S@BINANCE_SPOT',
 'SELF-asset_return-5S@BINANCE_SPOT',
 'currency_index',
 'SELF-flow_imbalance-2S@BINANCE_SPOT',
 'SELF-flow_imbalance-1S@BINANCE_USDM',
 'BTC-asset_return-1S@BINANCE_SPOT',
 'BTC-asset_return-500MS@BINANCE_SPOT',
 'SELF-share_of_long_trades-1S@BINANCE_USDM',
 'ETH-asset_return-2S@BINANCE_SPOT']

<h4>Define bounds for TRAIN and TEST samples</h4>

In [3]:
# The two calendar days handed to Bounds.for_days for the training sample.
# NOTE(review): whether the end day is inclusive is decided inside Bounds.for_days — confirm.
train_start_day: date = date(2025, 4, 1)
train_end_day: date = date(2025, 5, 5)
train_bounds: Bounds = Bounds.for_days(train_start_day, train_end_day)

<h4>Build model manually</h4>


<p>Use <code>BuildDataset</code> to read all features and split them into TRAIN and VALIDATION samples.</p>

In [None]:
from core.currency import get_target_currencies
from models.prediction.build_sample import BuildDataset
from ml_base.sample import MLDataset, SampleParams, Sample
from ml_base.enums import DatasetType


# Arguments for the builder: the forecast step passed to BuildDataset and the
# train/validation shares passed to create_sample.
forecast_step: timedelta = timedelta(seconds=5)
split_params: SampleParams = SampleParams(train_share=0.8, validation_share=0.2)

# Build the sample over `train_bounds` using only the features kept by `feature_filter`.
sample: Sample = BuildDataset(
    target_exchange=Exchange.BINANCE_SPOT,
    feature_filter=feature_filter,
    target_currencies=get_target_currencies(),
    forecast_step=forecast_step,
).create_sample(bounds=train_bounds, sample_params=split_params)

<h4>Visualize data</h4>

In [8]:
# Pull the TRAIN and VALIDATION frames out of the sample, in that order.
df_train, df_val = (
    sample.get_data(ds_type=split)
    for split in (DatasetType.TRAIN, DatasetType.VALIDATION)
)

# Display both shapes as the cell output.
(df_train.shape, df_val.shape)

((84599286, 25), (21149831, 25))

In [10]:
# Summary statistics on a random subset of the training frame.
# - random_state pins the draw so the table is reproducible across kernel restarts.
# - min() keeps the draw valid if the frame ever has fewer than 5M rows
#   (sampling without replacement raises when n exceeds the row count).
# NOTE(review): the previously saved output shows inf / NaN statistics for the
# powerlaw_alpha columns — worth checking those features for infinite values.
(
    df_train
    .sample(n=min(5_000_000, len(df_train)), replace=False, random_state=42)
    .describe()
    .T
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SELF-flow_imbalance-500MS@BINANCE_USDM,2720210.0,0.006388,0.913442,-1.0,-1.0,0.023482,1.0,1.0
SELF-flow_imbalance-500MS@OKX_SPOT,861171.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
SELF-powerlaw_alpha-500MS@BINANCE_SPOT,1730774.0,inf,,-1.216945e+32,1.703604,,,inf
SELF-powerlaw_alpha-500MS@BINANCE_USDM,2720210.0,inf,,-5.40432e+16,1.460614,2.185907,,inf
SELF-powerlaw_alpha-500MS@OKX_SPOT,861171.0,inf,,-8.106479e+16,1.884378,,,inf
SELF-slippage_imbalance-500MS@BINANCE_SPOT,3286281.0,0.066737,0.918628,-1.0,-1.0,0.363254,1.0,1.0
SELF-slippage_imbalance-500MS@BINANCE_USDM,4577068.0,0.01742,0.843338,-1.0,-0.947078,0.049272,0.965924,1.0
SELF-asset_return-1S@BINANCE_SPOT,5000000.0,-0.001015,3.651142,-2524.376,0.0,0.0,0.0,1940.784341
SELF-flow_imbalance-1S@BINANCE_SPOT,2503719.0,0.041468,0.916828,-1.0,-1.0,0.207924,1.0,1.0
SELF-flow_imbalance-1S@BINANCE_USDM,3513471.0,0.00662,0.875194,-1.0,-0.995842,0.016392,0.998263,1.0


<h4>Train the model with early stopping on the validation sample</h4>

In [None]:
# Train a LightGBM model on TRAIN, monitoring TRAIN and VALIDATION each iteration.
from lightgbm import Booster, record_evaluation
from typing import Any, Dict
import lightgbm as lgb
import os


# os.cpu_count() may return None, and on a one-core host "count - 1" would be 0;
# leave one core free but never request fewer than one thread.
_NUM_THREADS: int = max(1, (os.cpu_count() or 2) - 1)

_BASE_PARAMS: Dict[str, Any] = {
    "objective": "mse",
    "max_depth": 5,
    "learning_rate": 0.02,
    "n_estimators": 120,
    "num_threads": _NUM_THREADS,
    "verbose": -1
}

# Filled in by the record_evaluation callback; read back below under the
# "train" / "validation" names given in `valid_names`.
evals_result: Dict[str, Any] = {}

train: lgb.Dataset = sample.get_lgb_dataset(ds_type=DatasetType.TRAIN)
validation: lgb.Dataset = sample.get_lgb_dataset(ds_type=DatasetType.VALIDATION)

booster: Booster = lgb.train(
    params=_BASE_PARAMS,
    train_set=train,
    valid_sets=[train, validation],
    valid_names=["train", "validation"],
    callbacks=[
        # Keep the per-iteration metric history for plotting.
        record_evaluation(evals_result),
        # Stop once the monitored metric has not improved for 50 rounds.
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        # Project callback that reports each iteration.
        log_lgbm_iteration_to_stdout
    ]
)

In [None]:
# Learning curves recorded during training. Both series are labelled so they can
# be told apart, and the trailing semicolon suppresses the repr of the last call.
loss_fig, loss_ax = plt.subplots()
loss_ax.plot(evals_result["train"]["l2"], label="train")
loss_ax.plot(evals_result["validation"]["l2"], label="validation")
loss_ax.set(title="LightGBM learning curve", xlabel="Boosting iteration", ylabel="l2")
loss_ax.legend();

In [None]:
from sklearn.metrics import r2_score, classification_report, accuracy_score

# No visible cell assigns `y_pred` or `dataset`, so this cell raised NameError on a
# fresh kernel. Score the validation sample with the trained booster instead.
# NOTE(review): assumes `df_val` holds exactly the rows and feature columns behind
# the `validation` lgb.Dataset; the assert below catches a row-count mismatch.
# NOTE(review): the validation rows also drove early stopping, so this is not an
# out-of-sample score — swap in a held-out TEST sample once one is built.
y_pred: np.ndarray = booster.predict(df_val)
y_true: np.ndarray = np.asarray(validation.get_label())
assert y_pred.shape == y_true.shape, "predictions and labels are misaligned"

r2_score(y_true=y_true, y_pred=y_pred)

In [None]:
# Reduce predictions and labels to direction-only classes: 1 when strictly positive, else 0.
# Expects `y_pred` (model predictions) from an earlier cell.
# NOTE(review): `dataset` is not assigned in any visible cell of this notebook — confirm its origin.
y_pred_binary: np.ndarray = (y_pred > 0).astype(int)
y_true_binary: np.ndarray = (dataset.label > 0).astype(int)

# Build the per-class report text first, then print it.
direction_report = classification_report(y_true=y_true_binary, y_pred=y_pred_binary)
print(direction_report)

In [None]:
# Accuracy of the binarised predictions against the binarised labels.
accuracy_score(y_true=y_true_binary, y_pred=y_pred_binary)

In [None]:
# Summary statistics of the label values, shown as a one-column table.
# NOTE(review): `dataset` is not assigned in any visible cell of this notebook — confirm its origin.
label_stats = dataset.label.describe()
label_stats.to_frame()

In [None]:
# The raw importance array carries no feature names, so pair each value with its
# feature and rank them to make the output readable. Uses feature_importance()'s
# default importance type, exactly as the original call did.
(
    pd.Series(booster.feature_importance(), index=booster.feature_name(), name="importance")
    .sort_values(ascending=False)
)

In [None]:
# Boolean mask of rows whose "currency_index" equals the BTC enum value.
# NOTE(review): `dataset` is not assigned in any visible cell of this notebook — confirm its origin.
currency_index_col = dataset.data["currency_index"]
is_btc = currency_index_col == Currency.BTC.value

# Preview the first ten rows as the cell output.
dataset.data.head(10)

In [None]:
import plotly.graph_objects as go


# Start from an empty figure, then attach a single scatter trace.
# NOTE(review): `data` is not assigned in any visible cell, and the trace has no
# y values — confirm the intended series before relying on this plot.
fig: go.Figure = go.Figure()
scatter_trace = go.Scatter(x=data)
fig.add_trace(scatter_trace)