In [None]:
from __future__ import annotations

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sqlalchemy import Float, cast, func, select

# Use the parent of the current working directory as the repository root and
# append it to sys.path (once) so that the `backend` package can be imported.
# NOTE(review): assumes the kernel is started one level below the repo root — confirm.
REPO_ROOT = Path.cwd().resolve().parent
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

from backend.app.db.session import SessionLocal
from backend.app.models import TradeLabel

plt.style.use("seaborn-v0_8-darkgrid")

ModuleNotFoundError: No module named 'matplotlib'

: 

In [None]:
def load_label_summary(secids=None, timeframe="1m", label_set="basic_v1") -> pd.DataFrame:
    """Aggregate trade labels per (secid, timeframe, horizon_minutes).

    Args:
        secids: Optional collection of security ids to keep. A falsy value
            (``None`` or an empty collection) disables this filter.
        timeframe: Timeframe to keep. A falsy value disables this filter.
        label_set: Label set the rows must belong to.

    Returns:
        A DataFrame ordered by secid, timeframe and horizon_minutes with the
        columns ``rows``, ``min_time``, ``max_time``,
        ``avg_forward_return_pct``, ``long_hit_rate`` and ``short_hit_rate``.
        The two hit rates are averages of the label flags cast to float.
    """
    group_keys = (
        TradeLabel.secid,
        TradeLabel.timeframe,
        TradeLabel.horizon_minutes,
    )
    aggregates = (
        func.count().label("rows"),
        func.min(TradeLabel.signal_time).label("min_time"),
        func.max(TradeLabel.signal_time).label("max_time"),
        func.avg(TradeLabel.forward_return_pct).label("avg_forward_return_pct"),
        func.avg(cast(TradeLabel.label_long, Float)).label("long_hit_rate"),
        func.avg(cast(TradeLabel.label_short, Float)).label("short_hit_rate"),
    )

    # The label-set filter always applies; the other two are optional.
    conditions = [TradeLabel.label_set == label_set]
    if secids:
        conditions.append(TradeLabel.secid.in_(secids))
    if timeframe:
        conditions.append(TradeLabel.timeframe == timeframe)

    query = (
        select(*group_keys, *aggregates)
        .where(*conditions)
        .group_by(*group_keys)
        .order_by(*group_keys)
    )
    with SessionLocal() as db_session:
        return pd.read_sql(query, db_session.bind)


def load_label_slice(secid, timeframe, horizon_minutes, label_set="basic_v1", limit=5000) -> pd.DataFrame:
    """Load row-level labels for one (secid, timeframe, horizon, label_set).

    Args:
        secid: Security id to select.
        timeframe: Timeframe to select.
        horizon_minutes: Label horizon to select.
        label_set: Label set the rows must belong to.
        limit: Maximum number of rows; rows are taken in ascending
            ``signal_time`` order, so the earliest ones are returned.

    Returns:
        A DataFrame with ``signal_time``, ``forward_return_pct``,
        ``max_runup_pct``, ``max_drawdown_pct``, ``label_long`` and
        ``label_short``.
    """
    selected_columns = (
        TradeLabel.signal_time,
        TradeLabel.forward_return_pct,
        TradeLabel.max_runup_pct,
        TradeLabel.max_drawdown_pct,
        TradeLabel.label_long,
        TradeLabel.label_short,
    )
    slice_conditions = (
        TradeLabel.secid == secid,
        TradeLabel.timeframe == timeframe,
        TradeLabel.horizon_minutes == horizon_minutes,
        TradeLabel.label_set == label_set,
    )
    query = select(*selected_columns).where(*slice_conditions)
    query = query.order_by(TradeLabel.signal_time.asc()).limit(limit)

    with SessionLocal() as db_session:
        return pd.read_sql(query, db_session.bind)


# Per-horizon label summary for SBER and GAZP on the 1m timeframe
# (default label set); the bare name on the last line displays the frame.
summary_df = load_label_summary(secids=["SBER", "GAZP"], timeframe="1m")
summary_df

Покрытие по SBER/GAZP: видим периоды и базовые hit-rate'ы для long/short по каждому горизонту. Ниже — взгляд на распределение forward_return для конкретного тикера/горизонта.

In [None]:
# SBER labels on the 1m timeframe with a 60-minute horizon; both label
# columns are cast to bool in a single step.
sample_df = load_label_slice("SBER", "1m", 60).astype(
    {"label_long": bool, "label_short": bool}
)
sample_df.head()

In [None]:
# Two side-by-side panels with independent y-axes: the histogram's y-axis is a
# row count while the box plot's y-axis is a percentage, so sharing the axis
# would squash the box plot and hide its tick labels.
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=False)

# Left panel: distribution of forward returns, with a dashed marker at zero.
sample_df["forward_return_pct"].hist(ax=axes[0], bins=40, color="steelblue")
axes[0].set_title("Forward return % (SBER, 60m)")
axes[0].set_xlabel("return %")
axes[0].set_ylabel("rows")
axes[0].axvline(0, color="black", linestyle="--", linewidth=1)

# Right panel: spread of max run-up vs. max drawdown.
sample_df[["max_runup_pct", "max_drawdown_pct"]].plot(kind="box", ax=axes[1], color="darkorange")
axes[1].set_title("Runup vs drawdown (SBER, 60m)")
axes[1].set_ylabel("%")
plt.tight_layout()
plt.show()

In [None]:
# Share of rows flagged long / short in the sample: cast the two label
# columns to int and average each one.
label_columns = ["label_long", "label_short"]
label_rates = sample_df[label_columns].astype(int).mean().rename("rate")
label_rates

Набросок baseline-пайплайна: 
1. Вытащить за тот же интервал `FeatureNumeric` и сделать пивот в широкую таблицу.
2. Сопоставить по `feature_windows.window_end == trade_labels.signal_time`.
3. Обучить и оценить baseline-классификатор (например, LightGBM/RandomForest) для бинарной цели `label_long`.

Следующие шаги в коде/исследованиях: реализовать merge features+labels, добавить дополнительные графики (scatter runup/drawdown vs forward_return) и запланировать baseline-эксперимент.

## Baseline: merge features + labels

Ниже загружаем подготовленный CSV (см. `backend/scripts/ml/prepare_baseline_dataset.py`), делим выборку по времени и обучаем простой RandomForest по цели `label_long`. Это первый ориентир по качеству, на который можно опираться при дальнейшем развитии ML.

In [None]:
# Location of the prepared baseline CSV, relative to REPO_ROOT.
DATASET_PATH = REPO_ROOT / "data" / "training" / "baseline_dataset.csv"
# signal_time is parsed as a datetime column on load.
baseline_df = pd.read_csv(DATASET_PATH, parse_dates=["signal_time"])
baseline_df.head()

In [None]:
# Identifier columns and label/outcome columns. Every other CSV column is
# treated as a model feature.
META_COLS = {
    "secid",
    "timeframe",
    "feature_set",
    "label_set",
    "signal_time",
    "horizon_minutes",
    "forward_return_pct",
    "max_runup_pct",
    "max_drawdown_pct",
    "label_long",
    "label_short",
}

target_col = "label_long"
feature_cols = [col for col in baseline_df.columns if col not in META_COLS]

if baseline_df.empty:
    raise ValueError("baseline_df is empty; cannot build a train/test split")

# Stable sort keeps rows that share a signal_time in a reproducible order.
# The target is cast to int under whatever name target_col holds.
baseline_df = (
    baseline_df
    .sort_values("signal_time", kind="stable")
    .assign(**{target_col: lambda frame: frame[target_col].astype(int)})
)

# Chronological ~70/30 split. The boundary is a timestamp, not a row position:
# several rows can share one signal_time, and a positional cut could put the
# same timestamp into both train and test.
# NOTE(review): labels presumably look horizon_minutes ahead, so train rows
# just before the boundary may still overlap the test period — consider an
# embargo gap.
split_idx = int(len(baseline_df) * 0.7)
split_time = baseline_df["signal_time"].iloc[split_idx]
in_train = baseline_df["signal_time"] < split_time
train_df = baseline_df.loc[in_train].copy()
test_df = baseline_df.loc[~in_train].copy()

# Missing feature values are replaced with 0 before modelling.
X_train = train_df[feature_cols].fillna(0)
X_test = test_df[feature_cols].fillna(0)
y_train = train_df[target_col].astype(int)
y_test = test_df[target_col].astype(int)

len(train_df), len(test_df), len(feature_cols)

In [None]:
# NOTE(review): these imports live in this cell rather than with the
# notebook's top-level imports.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Baseline random forest; the fixed random_state keeps the fit reproducible.
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
)
rf_model.fit(X_train, y_train)

# Column 1 of predict_proba is used as the positive-class probability.
# NOTE(review): assumes both classes 0 and 1 occur in y_train — confirm.
y_proba = rf_model.predict_proba(X_test)[:, 1]
# Hard predictions at a 0.5 probability threshold.
y_pred = (y_proba >= 0.5).astype(int)

print(classification_report(y_test, y_pred, digits=4))
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.4f}")

In [None]:
# Ten features with the highest importance reported by the fitted forest.
importance_by_feature = pd.Series(rf_model.feature_importances_, index=feature_cols)
feature_importance = importance_by_feature.sort_values(ascending=False).head(10)
feature_importance.to_frame(name="importance")

## Diagnostic look at the merged dataset

Быстрые срезы по CSV помогают контролировать тайминги и дисбаланс классов перед тем, как запускать более тяжёлые эксперименты.

In [None]:
# One-column overview of the merged dataset: size, time span, which
# secids/timeframes/horizons are present, and the mean of each label column.
signal_times = baseline_df["signal_time"]
summary_values = {
    "rows": len(baseline_df),
    "time_start": signal_times.min().isoformat(),
    "time_end": signal_times.max().isoformat(),
    "secids": ", ".join(sorted(baseline_df["secid"].unique())),
    "timeframes": ", ".join(sorted(baseline_df["timeframe"].unique())),
    "horizons": ", ".join(
        str(horizon) for horizon in sorted(baseline_df["horizon_minutes"].unique())
    ),
    "long_rate": baseline_df["label_long"].mean(),
    "short_rate": baseline_df["label_short"].mean(),
}
dataset_summary = pd.Series(summary_values).to_frame(name="value")
dataset_summary

In [None]:
# Row count and mean long/short label per (secid, horizon_minutes).
group_keys = ["secid", "horizon_minutes"]
label_aggregations = {
    "rows": ("label_long", "size"),
    "long_rate": ("label_long", "mean"),
    "short_rate": ("label_short", "mean"),
}
label_distribution = (
    baseline_df.groupby(group_keys)
    .agg(**label_aggregations)
    .reset_index()
    .sort_values(group_keys)
)
label_distribution.head(10)

In [None]:
# Long hit-rate per secid, one bar per horizon; the legend sits outside the
# axes on the right.
label_pivot = label_distribution.pivot(
    index="secid",
    columns="horizon_minutes",
    values="long_rate",
)
ax = label_pivot.plot.bar(figsize=(10, 4))
ax.set(
    ylabel="long hit-rate",
    xlabel="secid",
    title="Label_long rate by secid × horizon",
)
ax.legend(title="horizon (min)", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()