In [2]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier

from typing import Union
from numba import njit

# Time-unit multipliers, each expressed as a count of nanoseconds.
NANOSECOND = 1
MICROSECOND = 1_000 * NANOSECOND
MILLISECOND = 1_000 * MICROSECOND
SECOND = 1_000 * MILLISECOND

from typing import Union


def trades_balance(trades_df: pd.DataFrame, window: Union[str, int]) -> pd.Series:
    """Rolling normalized difference between ask-side and bid-side traded amounts.

    Parameters:
    - trades_df: frame with ``ask_amount`` and ``bid_amount`` columns.
    - window: forwarded unchanged to ``Series.rolling`` (``min_periods=1``).

    Returns:
    - pd.Series equal to ``(asks - bids) / (asks + bids + 1e-8)``, where
      ``asks`` / ``bids`` are the rolling sums of the two columns.
    """

    def _rolling_total(column: str) -> pd.Series:
        # min_periods=1 yields a value from the very first row of the window.
        return trades_df[column].rolling(window=window, min_periods=1).sum()

    ask_total = _rolling_total("ask_amount")
    bid_total = _rolling_total("bid_amount")

    net_flow = ask_total - bid_total
    # The small constant keeps the denominator non-zero when both sums are 0.
    gross_flow = ask_total + bid_total + 1e-8
    return net_flow / gross_flow


def calc_imbalance(lobs):
    """
    Top-of-book amount imbalance.

    Parameters:
    - lobs: object indexable by the keys "bids[0].amount" and
      "asks[0].amount" (e.g. a pd.DataFrame of order books, or a single row).

    Returns:
    - (bid - ask) / (bid + ask) computed from those two entries; the result
      has whatever type the arithmetic on the looked-up values produces.
    """
    best_bid = lobs["bids[0].amount"]
    best_ask = lobs["asks[0].amount"]
    return (best_bid - best_ask) / (best_bid + best_ask)


def vwap(books_df: pd.DataFrame, lvl_count: int) -> pd.Series:
    """Volume-weighted average book price, expressed relative to the mid price.

    Parameters:
    - books_df: frame with ``asks[i].price`` / ``asks[i].amount`` and
      ``bids[i].price`` / ``bids[i].amount`` columns for ``i < lvl_count``,
      plus a ``mid_price`` column.
    - lvl_count: number of book levels (starting at level 0) to include.

    Returns:
    - pd.Series: sum(price * amount) over both sides divided by sum(amount),
      then divided by ``mid_price``.
    """
    # Accumulators start at 0 exactly like the built-in sum() would.
    ask_notional = 0
    ask_size = 0
    for i in range(lvl_count):
        ask_notional = ask_notional + books_df[f"asks[{i}].price"] * books_df[f"asks[{i}].amount"]
    for i in range(lvl_count):
        ask_size = ask_size + books_df[f"asks[{i}].amount"]

    bid_notional = 0
    bid_size = 0
    for i in range(lvl_count):
        bid_notional = bid_notional + books_df[f"bids[{i}].price"] * books_df[f"bids[{i}].amount"]
    for i in range(lvl_count):
        bid_size = bid_size + books_df[f"bids[{i}].amount"]

    book_vwap = (ask_notional + bid_notional) / (ask_size + bid_size)
    return book_vwap / books_df["mid_price"]


class Predictor:
    def __init__(self, full_model_path: Union[str, list[str]]):
        self.model = CatBoostClassifier()
        self.model.load_model(full_model_path, format="cbm")
        # Retrieve all parameters
        params = self.model.get_all_params()
        # Print each parameter and its value
        for param, value in params.items():
            print(f"{param}: {value}")

    @staticmethod
    def model_name() -> Union[str, list[str]]:
        return "20241120-134956_model_20241120-033256_model_baseline.cbm"

    def predict(self, features: pd.DataFrame) -> pd.Series:
        """
        Method is called once every time new submission received
            Params:
                Your features returned from `calc_features` method

            Returns: pd.Series[float]
                Array of predicted returns (price_{n + 1} / price_{n} - 1).
                One value must be generated for every bbo dataframe timestamp
                so that len(Series) == len(bbos)
        """

        predict = pd.Series(self.model.predict_proba(features)[:, 1])

        return predict

    def calc_features(
        self,
        lobs: pd.DataFrame | None,
        agg_trades: pd.DataFrame | None,
        lobs_embedding: pd.DataFrame | None,
        target_data: pd.DataFrame | None,
    ) -> pd.DataFrame:
        """
        Calculates features using provided functions and aligns them with target_data.

        Parameters:
        - lobs: pd.DataFrame of limit orderbooks.
        - agg_trades: pd.DataFrame of aggregated trades.
        - lobs_embedding: pd.DataFrame of embedding over limit orderbooks.
        - target_data: pd.DataFrame with target timestamps.

        Returns:
        - features: pd.DataFrame with features aligned to target_data.index.
        """
        lobs["mid_price"] = (lobs["asks[0].price"] + lobs["bids[0].price"]) / 2

        btcusdt_mid_price = lobs_embedding[lobs_embedding["instrument"] == "BTCUSDT"][
            "mid_price"
        ]
        ethusdt_mid_price = lobs_embedding[lobs_embedding["instrument"] == "ETHUSDT"][
            "mid_price"
        ]

        main_btcusdt_dev = (
            lobs["mid_price"] / (btcusdt_mid_price.asof(lobs.index) + 1e-6)
        ).asof(target_data.index) * target_data.side
        main_btcusdt_dev.name = "main_btcusdt_dev"

        main_ethusdt_dev = (
            lobs["mid_price"] / (ethusdt_mid_price.asof(lobs.index) + 1e-6)
        ).asof(target_data.index) * target_data.side
        main_ethusdt_dev.name = "main_ethusdt_dev"

        distance_to_mid_price = (
            target_data.price / (lobs["mid_price"].asof(target_data.index) + 1e-6) - 1
        ) * target_data.side
        distance_to_mid_price.name = "distance_to_mid_price"

        imbalance_series = (
            calc_imbalance(lobs).asof(target_data.index) * target_data.side
        )
        imbalance_series.name = "imbalance"

        depth = 5
        vwap_series = vwap(lobs, depth).asof(target_data.index) * target_data.side
        vwap_series.name = "vwap"

        solusdt_agg_trades = agg_trades[agg_trades["instrument"] == "SOLUSDT"]
        solusdt_agg_trades.index = pd.to_datetime(solusdt_agg_trades.index)
        trades_ratio_series = (
            trades_balance(solusdt_agg_trades, 10 * SECOND).asof(target_data.index)
            * target_data.side
        )
        trades_ratio_series.name = "trades_ratio"

        return pd.concat(
            [
                target_data.side,
                vwap_series,
                trades_ratio_series,
                distance_to_mid_price,
                main_ethusdt_dev,
                main_btcusdt_dev,
            ],
            axis=1,
        )

In [6]:
from catboost import CatBoostClassifier
from pathlib import Path

# The models folder sits next to the current working directory's parent
# (i.e. one level above where the notebook is run from).
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "models"

# Start from an empty classifier and fill it from the serialized .cbm file.
model = CatBoostClassifier()
model.load_model(f'{DATA_DIR}/model_20241120-033256_model_baseline.cbm', format='cbm')

# get_params() reports the parameters recorded as explicitly set for this
# model; list them one "name: value" per line.
params = model.get_params()
for param, value in params.items():
    print(f"{param}: {value}")


use_best_model: True
eval_metric: Logloss
verbose: 50
iterations: 1200
thread_count: 13
loss_function: Logloss
l2_leaf_reg: 50
task_type: GPU
depth: 5
learning_rate: 0.01


In [7]:
from catboost import CatBoostClassifier
from pathlib import Path

# The models folder sits next to the current working directory's parent
# (i.e. one level above where the notebook is run from).
BASE_DIR = Path.cwd().parent
DATA_DIR = BASE_DIR / "models"

# Start from an empty classifier and fill it from the serialized .cbm file.
model = CatBoostClassifier()
model.load_model(f'{DATA_DIR}/model_20241120-033256_model_baseline.cbm', format='cbm')

# get_all_params() also includes parameters that were left at their defaults;
# list them one "name: value" per line.
params = model.get_all_params()
for param, value in params.items():
    print(f"{param}: {value}")


nan_mode: Min
gpu_ram_part: 0.95
eval_metric: Logloss
iterations: 1200
leaf_estimation_method: Newton
observations_to_bootstrap: TestOnly
random_score_type: NormalWithModelSizeDecrease
grow_policy: SymmetricTree
penalties_coefficient: 1
boosting_type: Plain
feature_border_type: GreedyLogSum
bayesian_matrix_reg: 0.1000000015
devices: -1
eval_fraction: 0
pinned_memory_bytes: 104857600
force_unit_auto_pair_weights: False
l2_leaf_reg: 50
random_strength: 1
rsm: 1
boost_from_average: False
gpu_cat_features_storage: GpuRam
fold_size_loss_normalization: False
model_size_reg: 0.5
pool_metainfo_options: {'tags': {}}
use_best_model: True
meta_l2_frequency: 0
class_names: [0, 1]
random_seed: 0
depth: 5
border_count: 128
min_fold_size: 100
data_partition: DocParallel
bagging_temperature: 1
classes_count: 0
auto_class_weights: None
leaf_estimation_backtracking: AnyImprovement
best_model_min_trees: 1
min_data_in_leaf: 1
add_ridge_penalty_to_loss_function: False
loss_function: Logloss
learning_rate: 

In [11]:
from datetime import datetime, timedelta, timezone

# Fixed UTC+03:00 offset and the current wall-clock time in that offset.
gmt_plus_3 = timezone(timedelta(hours=3))
now_gmt_plus_3 = datetime.now(tz=gmt_plus_3)

# Minute-resolution stamp, e.g. "2024-Dec-08_17-57".
timestamp = f"{now_gmt_plus_3:%Y-%b-%d_%H-%M}"

# File name under which the model is to be saved.
model_name = f"model_with_tscv_weights_no_preprocessed_{timestamp}.cbm"

print(model_name)

model_with_tscv_weights_no_preprocessed_2024-Dec-08_17-57.cbm
