In [None]:
# 13_feature_ablation_fast_gate_v11.ipynb

# =============================
# 0. Импорты и базовые настройки
# =============================
import json
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_fscore_support,
    confusion_matrix,
)

from catboost import CatBoostClassifier, Pool

# Widen pandas console output so the wide feature table stays readable.
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 200)

PROJECT_ROOT = Path(".")
DATA_ROOT = PROJECT_ROOT / "data"
PROC_PATH = DATA_ROOT / "processed"

FEATURES_PARQUET = PROC_PATH / "features_offline_v11.parquet"

# Defined before the first use so the column name lives in exactly one place.
TARGET_COL = "target"

# =============================
# 1. Load data (with a CSV fallback)
# =============================

if not FEATURES_PARQUET.exists():
    raise FileNotFoundError(f"Файл не найден: {FEATURES_PARQUET}. Сначала запусти ноутбук с build_features_offline.")

print("Пробуем прочитать как Parquet:", FEATURES_PARQUET)

try:
    df = pd.read_parquet(FEATURES_PARQUET)
except Exception as e:
    # Deliberate best-effort: any Parquet failure triggers the CSV fallback.
    print("❌ Не удалось прочитать как Parquet:", repr(e))
    print("Пробуем прочитать как CSV (вдруг файл был сохранён как CSV с расширением .parquet)...")
    # If the file was written with ';' as the separator, pass sep=';' here.
    try:
        df = pd.read_csv(FEATURES_PARQUET)
    except Exception as e2:
        print("❌ Не удалось прочитать и как CSV:", repr(e2))
        # Chain the cause so the traceback keeps the underlying CSV error.
        raise RuntimeError(
            "Файл features_offline_v11.parquet повреждён или не является табличным. "
            "Лучше всего пересоздать его, ещё раз запустив ноутбук build_features_offline_v11."
        ) from e2
    else:
        print("✅ Успешно прочитан как CSV.")
else:
    print("✅ Успешно прочитан как Parquet.")

print("Всего строк:", len(df))
print("Колонок:", df.shape[1])
print("Первые колонки:", list(df.columns)[:25], "...")

if TARGET_COL not in df.columns:
    raise ValueError("В загруженном df нет колонки 'target'. Проверь, что ты загрузил именно Offline Feature Store.")

print("\nРаспределение target по всему датасету:")
print(df[TARGET_COL].value_counts(dropna=False))
print("Доля фрода:", df[TARGET_COL].mean())

# =============================
# 2. Base feature list (everything except service columns)
# =============================

# Identifier / timestamp / label columns that must never reach the model.
DROP_COLS = [
    TARGET_COL,
    "docno", "row_id", "cst_dim_id",
    "direction",
    "transdate", "transdatetime",
]

all_feature_cols = [column for column in df.columns if column not in DROP_COLS]

print("\nВсего фич (all_feature_cols):", len(all_feature_cols))


def _is_categorical(column):
    """Tell whether a df column has an object or category dtype."""
    return str(df[column].dtype) in ("object", "category")


# Categorical features, detected once for the whole dataset.
cat_feature_global = [column for column in all_feature_cols if _is_categorical(column)]

print("Глобальные категориальные фичи:", cat_feature_global)

# CatBoost expects categorical values as strings, so cast them up front.
for cat_column in cat_feature_global:
    df[cat_column] = df[cat_column].astype(str)

# Integer label vector shared by every ablation variant.
y = df[TARGET_COL].astype(int).values

# =============================
# 3. Split features into groups
# =============================

# 3.1. Session / behavioral features (columns prefixed with "sess_")
sess_cols = [c for c in df.columns if c.startswith("sess_")]

# 3.2. Node2Vec embeddings
emb_cst_cols = sorted([c for c in df.columns if c.startswith("emb_cst_")])
emb_dir_cols = sorted([c for c in df.columns if c.startswith("emb_dir_")])

# 3.3. Graph features (hand-written list, filtered by presence in df)
graph_candidates = [
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "one_to_many_flag",
    "many_to_one_flag",
]
graph_cols = [c for c in graph_candidates if c in df.columns]

# 3.4. Base features = everything else (no sess / graph / node2vec).
# The excluded names are collected into a set once, instead of concatenating
# four lists again for every column being tested.
_non_base_cols = set(sess_cols) | set(emb_cst_cols) | set(emb_dir_cols) | set(graph_cols)
base_cols = [c for c in all_feature_cols if c not in _non_base_cols]

print("\nРазмеры групп фич:")
print(f"  base_cols: {len(base_cols)}")
print(f"  graph_cols: {len(graph_cols)}")
print(f"  sess_cols: {len(sess_cols)}")
print(f"  emb_cst_cols: {len(emb_cst_cols)}")
print(f"  emb_dir_cols: {len(emb_dir_cols)}")

# =============================
# 4. Define the ablation variants
# =============================

# Both Node2Vec embedding groups are always added or removed together.
_node2vec_cols = emb_cst_cols + emb_dir_cols


def _all_features_except(excluded):
    """Return the sorted full feature list with the `excluded` names removed."""
    return sorted([c for c in all_feature_cols if c not in excluded])


# NOTE: C and E resolve to the same feature set, because base_cols is defined
# as "all features minus sess / graph / node2vec"; both are kept so the summary
# table keeps its original rows.
variants = [
    # A: base features only
    ("A_base", sorted(base_cols)),
    # B: base + graph
    ("B_base+graph", sorted(base_cols + graph_cols)),
    # C: base + graph + Node2Vec (no session features)
    ("C_base+graph+node2vec", sorted(base_cols + graph_cols + emb_cst_cols + emb_dir_cols)),
    # D: full = everything, as in v11
    ("D_full_with_session", sorted(all_feature_cols)),
    # E: full without session features
    ("E_full_without_session", _all_features_except(sess_cols)),
    # F: full without Node2Vec
    ("F_full_without_node2vec", _all_features_except(_node2vec_cols)),
    # G: full without graph features
    ("G_full_without_graph", _all_features_except(graph_cols)),
]

print("\nВарианты абляции:")
for name, cols in variants:
    print(f"  {name}: {len(cols)} фич")

# =============================
# 5. Shared K-fold split (identical for every variant)
# =============================

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Materialise the (train_idx, valid_idx) pairs once so each variant is scored
# on exactly the same partition.
folds = [(train_part, valid_part) for train_part, valid_part in skf.split(df, y)]
print(f"\nБудет использовано {N_SPLITS}-fold Stratified CV (одинаковое разбиение для всех вариантов).")

# =============================
# 6. Вспомогательные функции
# =============================

def eval_at_threshold(y_true, y_proba, threshold: float):
    """Compute binary classification metrics at a fixed decision threshold.

    Samples with ``y_proba >= threshold`` are predicted positive. Labels are
    expected to be binary 0/1: anything not equal to 1 counts as negative.

    The counts are taken directly with NumPy, so the function also works when
    only one class is present (where unpacking a confusion matrix built
    without explicit labels into four values fails), and it avoids
    per-threshold scikit-learn overhead when called on a dense threshold grid.

    Args:
        y_true: Array-like of true 0/1 labels.
        y_proba: Array-like of positive-class scores, same length as ``y_true``.
        threshold: Decision threshold applied to ``y_proba``.

    Returns:
        dict with keys ``threshold``, ``precision``, ``recall``, ``f1``,
        ``tp``, ``fp``, ``fn``, ``tn``. Precision, recall and F1 are 0.0 when
        their denominator is zero.
    """
    actual_pos = np.asarray(y_true) == 1
    predicted_pos = np.asarray(y_proba) >= threshold

    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    fp = int(np.count_nonzero(~actual_pos & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & ~predicted_pos))
    tn = int(np.count_nonzero(~actual_pos & ~predicted_pos))

    # Zero denominators map to 0.0, matching zero_division=0 semantics.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    if precision + recall:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return {
        "threshold": threshold,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp": tp,
        "fp": fp,
        "fn": fn,
        "tn": tn,
    }


def run_variant(name: str, feature_list, verbose_training: bool = True):
    """
    Train CatBoost with k-fold CV on the given features and compute OOF metrics.

    Relies on module-level state: ``df``, ``y``, ``folds``,
    ``cat_feature_global`` and ``N_SPLITS``.

    Args:
        name: Variant label used in the log output and in the returned summary.
        feature_list: Column names of ``df`` to train on. Must support
            ``.index()`` (a list), since categorical positions are looked up in it.
        verbose_training: If True, print per-fold details and let CatBoost
            report progress every 200 iterations.

    Returns:
        dict with the variant name, feature count, mean per-fold ROC-AUC and
        PR-AUC, OOF ROC-AUC and PR-AUC, the positive-class rate
        (``baseline_pr``), and the best-F1 threshold with its precision,
        recall and F1.

    Raises:
        ValueError: If a training fold contains no positive samples.
    """
    print("\n" + "=" * 80)
    print(f"Вариант: {name}")
    print(f"Число фич: {len(feature_list)}")
    print("=" * 80)

    X_var = df[feature_list]

    # Categorical features present in this variant, and their positions in feature_list
    cat_names_var = [c for c in feature_list if c in cat_feature_global]
    cat_indices_var = [feature_list.index(c) for c in cat_names_var]

    print("Категориальные фичи варианта:", cat_names_var)
    print("Индексы категориальных фичей:", cat_indices_var)

    # Out-of-fold predictions: every row is filled by the fold that held it out.
    oof_pred = np.zeros(len(df), dtype=float)
    fold_metrics = []

    fold_idx = 0
    for train_idx, valid_idx in folds:
        fold_idx += 1
        X_train, X_valid = X_var.iloc[train_idx], X_var.iloc[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        n_pos = int(y_train.sum())
        n_neg = int((1 - y_train).sum())
        if n_pos == 0:
            raise ValueError(f"В фолде {fold_idx} нет фрода — что-то не так со стратификацией.")

        # Class-imbalance weight: negatives per positive in this training fold.
        scale_pos_weight = n_neg / n_pos

        if verbose_training:
            print(f"\n=== Fold {fold_idx}/{N_SPLITS} ===")
            print(f"  fold pos={n_pos}, neg={n_neg}, scale_pos_weight={scale_pos_weight:.2f}")

        train_pool = Pool(
            X_train,
            y_train,
            cat_features=cat_indices_var if cat_indices_var else None,
        )
        valid_pool = Pool(
            X_valid,
            y_valid,
            cat_features=cat_indices_var if cat_indices_var else None,
        )

        # The seed is shifted per fold, so folds use different random streams.
        cat_params = {
            "loss_function": "Logloss",
            "eval_metric": "AUC",
            "iterations": 2000,
            "depth": 6,
            "learning_rate": 0.05,
            "l2_leaf_reg": 3.0,
            "random_seed": 42 + fold_idx,
            "border_count": 254,
            "scale_pos_weight": scale_pos_weight,
            "bootstrap_type": "Bayesian",
            "bagging_temperature": 0.5,
            "use_best_model": True,
            "early_stopping_rounds": 100,
            "task_type": "CPU",
            "verbose": 200 if verbose_training else False,
        }

        # NOTE(review): valid_pool drives early stopping / best-model selection
        # and is also the fold being scored below, so the fold and OOF metrics
        # are likely optimistic — confirm whether a separate early-stopping
        # split is wanted.
        model_fold = CatBoostClassifier(**cat_params)
        model_fold.fit(train_pool, eval_set=valid_pool)

        y_valid_proba = model_fold.predict_proba(valid_pool)[:, 1]
        oof_pred[valid_idx] = y_valid_proba

        fold_roc = roc_auc_score(y_valid, y_valid_proba)
        fold_pr = average_precision_score(y_valid, y_valid_proba)

        if verbose_training:
            print(f"  Fold ROC-AUC={fold_roc:.4f}, PR-AUC={fold_pr:.4f}")

        fold_metrics.append({"fold": fold_idx, "roc_auc": fold_roc, "pr_auc": fold_pr})

    fold_metrics_df = pd.DataFrame(fold_metrics)
    print("\nCV-метрики по фолдам для варианта", name)
    print(fold_metrics_df)
    cv_roc_mean = fold_metrics_df["roc_auc"].mean()
    cv_pr_mean = fold_metrics_df["pr_auc"].mean()

    # OOF metrics: scored once over the pooled out-of-fold predictions
    roc_oof = roc_auc_score(y, oof_pred)
    pr_oof = average_precision_score(y, oof_pred)
    baseline_pr = y.mean()

    print("\n=== OOF-метрики для варианта", name, "===")
    print("ROC-AUC (OOF):", round(roc_oof, 4))
    print("PR-AUC (OOF):", round(pr_oof, 4))
    print("Baseline PR-AUC (random):", round(baseline_pr, 4))

    # Search for the best-F1 threshold on a 501-point grid over [0, 1]
    thresholds = np.linspace(0.0, 1.0, 501)
    metrics_list = [eval_at_threshold(y, oof_pred, t) for t in thresholds]
    metrics_df = pd.DataFrame(metrics_list)

    # Threshold 0.0 (everything predicted positive) is excluded from the search.
    valid_metrics = metrics_df[metrics_df["threshold"] > 0.0].copy()
    best_row = valid_metrics.loc[valid_metrics["f1"].idxmax()]

    print("\nЛучший F1 по OOF для варианта", name)
    print(
        "  threshold={:.3f}, precision={:.3f}, recall={:.3f}, f1={:.3f}".format(
            best_row["threshold"],
            best_row["precision"],
            best_row["recall"],
            best_row["f1"],
        )
    )

    # Plain Python floats keep the summary serialisable and DataFrame-friendly.
    summary = {
        "variant": name,
        "n_features": len(feature_list),
        "cv_roc_auc_mean": float(cv_roc_mean),
        "cv_pr_auc_mean": float(cv_pr_mean),
        "oof_roc_auc": float(roc_oof),
        "oof_pr_auc": float(pr_oof),
        "baseline_pr": float(baseline_pr),
        "best_f1_threshold": float(best_row["threshold"]),
        "best_f1_precision": float(best_row["precision"]),
        "best_f1_recall": float(best_row["recall"]),
        "best_f1": float(best_row["f1"]),
    }

    return summary


# =============================
# 7. Run the ablation over every variant
# =============================

# One summary dict per variant, in the order the variants were declared.
all_results = [
    run_variant(variant_name, variant_cols, verbose_training=True)
    for variant_name, variant_cols in variants
]

results_df = pd.DataFrame(all_results)
# Best variant first, ranked by out-of-fold PR-AUC.
results_df_sorted = results_df.sort_values("oof_pr_auc", ascending=False)

print("\n\n================ ИТОГОВАЯ СВОДКА ПО ВАРИАНТАМ (сортировка по OOF PR-AUC) ================")
display(results_df_sorted)

# Persist the summary table next to the processed features.
RESULTS_PATH = PROC_PATH / "fast_gate_feature_ablation_v11.parquet"
results_df_sorted.to_parquet(RESULTS_PATH, index=False)
print("\nСводка по абляции сохранена в", RESULTS_PATH)


Пробуем прочитать как Parquet: data/processed/features_offline_v11.parquet
✅ Успешно прочитан как Parquet.
Всего строк: 13113
Колонок: 192
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d', 'sess_burstiness_login_interval', 'sess_fano_login_interval', 'sess_z_login_interval_7d', 'sess_has_login_history', 'sess_last_phone_model'] ...

Распределение target по всему датасету:
target
0    12948
1      165
Name: count, dtype: int64
Доля фрода: 0.012582932967284374

Всего фич (all_feature_cols): 185
Глобальные категориальные фичи: ['sess_last_phone_model', 'sess_last_os']

Размеры групп фич:
  base_cols: 33
  

Unnamed: 0,variant,n_features,cv_roc_auc_mean,cv_pr_auc_mean,oof_roc_auc,oof_pr_auc,baseline_pr,best_f1_threshold,best_f1_precision,best_f1_recall,best_f1
5,F_full_without_node2vec,57,0.921481,0.482677,0.856438,0.402306,0.012583,0.74,0.601942,0.375758,0.462687
1,B_base+graph,39,0.912182,0.416345,0.880496,0.389857,0.012583,0.846,0.556818,0.29697,0.387352
3,D_full_with_session,185,0.908123,0.414177,0.81833,0.324677,0.012583,0.618,0.410853,0.321212,0.360544
4,E_full_without_session,167,0.900199,0.383819,0.827286,0.324272,0.012583,0.72,0.608696,0.254545,0.358974
2,C_base+graph+node2vec,167,0.900199,0.383819,0.827286,0.324272,0.012583,0.72,0.608696,0.254545,0.358974
6,G_full_without_graph,179,0.885841,0.336183,0.849655,0.318413,0.012583,0.73,0.633803,0.272727,0.381356
0,A_base,33,0.868892,0.160739,0.813932,0.123602,0.012583,0.672,0.132159,0.363636,0.193861



Сводка по абляции сохранена в data/processed/fast_gate_feature_ablation_v11.parquet


In [None]:
# ============================================
# 01_train_fast_gate_v11.ipynb
# Fast Gate v11 (вариант F_full_without_node2vec)
#  - фичи: base + graph + session
#  - без emb_cst_*, emb_dir_*
#  - 5-fold Stratified OOF + time-based holdout
#  - сохранение модели, схемы фич и порогов стратегий
# ============================================


!pip install catboost

import os
import json
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve
)

# -----------------------------
# Path configuration and constants
# -----------------------------
RANDOM_STATE = 42
N_FOLDS = 5

# Input: offline feature store.
FEATURES_PARQUET = "data/processed/features_offline_v11.parquet"

# Outputs: trained model, feature schema and per-strategy thresholds.
MODEL_PATH = "models/catboost_fast_gate_v11.cbm"
FEATURES_SCHEMA_PATH = "config/features_schema_v11.json"
STRATEGY_THRESHOLDS_PATH = "config/strategy_thresholds_v11.json"

# Make sure every directory written to or read from below exists.
for _required_dir in ("models", "config", "data/processed"):
    os.makedirs(_required_dir, exist_ok=True)

# -----------------------------
# 1. Load the v11 features
# -----------------------------
print(f"Пробуем прочитать Parquet: {FEATURES_PARQUET}")
df = pd.read_parquet(FEATURES_PARQUET)
print("✅ Успешно прочитан как Parquet.")

n_rows, n_columns = df.shape
print(f"Всего строк: {n_rows}")
print(f"Колонок: {n_columns}")
print("Первые колонки:", list(df.columns[:20]), "...")

# Make sure the target is a binary int column.
target_col = "target"
df[target_col] = df[target_col].astype(int)

target_series = df[target_col]
print("\nРаспределение target по всему датасету:")
print(target_series.value_counts())
print("Доля фрода:", target_series.mean())

# -----------------------------
# 2. Feature groups (same definition as in the ablation notebook)
# -----------------------------
# Service columns that must NOT be fed to the model
service_cols = [
    "transdatetime",
    "cst_dim_id",
    "transdate",
    "docno",
    "direction",
    "row_id"
]

# Session features (behavioral dataset, columns prefixed with sess_)
sess_cols = [c for c in df.columns if c.startswith("sess_")]

# Graph features: hand-written candidate list, filtered by presence in df
# (as the ablation notebook does), so a feature store built without one of
# them does not fail later with a KeyError when the columns are selected.
graph_candidates = [
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "one_to_many_flag",
    "many_to_one_flag",
]
graph_cols = [c for c in graph_candidates if c in df.columns]

# Node2Vec embeddings (these are REMOVED from Fast Gate v11)
emb_cst_cols = [c for c in df.columns if c.startswith("emb_cst_")]
emb_dir_cols = [c for c in df.columns if c.startswith("emb_dir_")]

# Base features: everything left after removing service columns, the target,
# sess_*, graph and emb_* columns
exclude_cols = (
    set(service_cols)
    | {target_col}
    | set(sess_cols)
    | set(graph_cols)
    | set(emb_cst_cols)
    | set(emb_dir_cols)
)
base_cols = [c for c in df.columns if c not in exclude_cols]

print("\nРазмеры групп фич:")
print("  base_cols:", len(base_cols))
print("  graph_cols:", len(graph_cols))
print("  sess_cols:", len(sess_cols))
print("  emb_cst_cols:", len(emb_cst_cols))
print("  emb_dir_cols:", len(emb_dir_cols))

# -----------------------------
# 3. Variant F_full_without_node2vec
#    = base + graph + session
# -----------------------------
feature_cols = base_cols + graph_cols + sess_cols

print("\nВариант F_full_without_node2vec:")
print(f"  Число фич: {len(feature_cols)}")

# Categorical features: session last_os and last_phone_model (when present)
categorical_feature_names = [
    col for col in ["sess_last_os", "sess_last_phone_model"] if col in feature_cols
]

cat_feature_indices = [
    feature_cols.index(c) for c in categorical_feature_names
]

print("Категориальные фичи:", categorical_feature_names)
print("Индексы категориальных фичей:", cat_feature_indices)

# Cast categorical columns to str, as the ablation notebook does: CatBoost
# rejects NaN / float values in categorical features. The cast is applied to
# df (not only to X) because the time-based holdout re-slices df directly.
for col in categorical_feature_names:
    df[col] = df[col].astype(str)

# -----------------------------
# 4. Prepare X, y
# -----------------------------
X = df[feature_cols].copy()
y = df[target_col].values

print("\nРазмер X:", X.shape)
print("Классы y:", np.bincount(y))

# -----------------------------
# 5. Функции для обучения и метрик
# -----------------------------
def train_oof_catboost(
    X,
    y,
    feature_cols,
    cat_indices,
    n_folds=5,
    random_state=42
):
    """
    Train CatBoost with stratified k-fold CV and collect out-of-fold predictions.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Label array indexed positionally; compared against 0 and 1.
        feature_cols: Not referenced in the body; kept in the signature only.
        cat_indices: Positions of categorical columns in ``X``; an empty value
            is passed to CatBoost as ``None``.
        n_folds: Number of stratified folds.
        random_state: Seed for the fold shuffle; fold ``k`` trains with
            ``random_state + k``.

    Returns:
        Tuple ``(models, oof_pred, fold_metrics, roc_oof, pr_oof)``:
          - list of fitted per-fold models
          - out-of-fold positive-class scores
          - list of per-fold metric dicts (``fold``, ``roc_auc``, ``pr_auc``)
          - OOF ROC-AUC and PR-AUC over the whole dataset
    """
    skf = StratifiedKFold(
        n_splits=n_folds,
        shuffle=True,
        random_state=random_state
    )

    # Every row is filled exactly once, by the fold that held it out.
    oof_pred = np.zeros(len(X), dtype=float)
    models = []
    fold_metrics = []

    print("\n================ OOF-обучение Fast Gate v11 (F_full_without_node2vec) ================")

    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n=== Fold {fold}/{n_folds} ===")
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y[tr_idx], y[val_idx]

        # Class-imbalance weight; max(pos, 1) guards against a fold without positives.
        pos = (y_tr == 1).sum()
        neg = (y_tr == 0).sum()
        scale_pos_weight = neg / max(pos, 1)

        print(f"  fold pos={pos}, neg={neg}, scale_pos_weight={scale_pos_weight:.2f}")

        train_pool = Pool(
            X_tr,
            label=y_tr,
            cat_features=cat_indices if cat_indices else None
        )
        valid_pool = Pool(
            X_val,
            label=y_val,
            cat_features=cat_indices if cat_indices else None
        )

        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            learning_rate=0.05,
            depth=6,
            l2_leaf_reg=5,
            iterations=2000,
            random_seed=random_state + fold,
            scale_pos_weight=scale_pos_weight,
            border_count=254,
            task_type="CPU",
            verbose=200,
            od_type="Iter",
            od_wait=100
        )

        # NOTE(review): valid_pool is used for overfitting detection /
        # best-model selection and is also the fold scored below, so the
        # reported metrics are likely optimistic — confirm this is acceptable.
        model.fit(
            train_pool,
            eval_set=valid_pool,
            use_best_model=True
        )

        val_pred = model.predict_proba(valid_pool)[:, 1]
        oof_pred[val_idx] = val_pred

        roc = roc_auc_score(y_val, val_pred)
        pr = average_precision_score(y_val, val_pred)
        print(f"  Fold ROC-AUC={roc:.4f}, PR-AUC={pr:.4f}")

        models.append(model)
        fold_metrics.append({"fold": fold, "roc_auc": roc, "pr_auc": pr})

    # Pooled metrics over all out-of-fold predictions.
    roc_oof = roc_auc_score(y, oof_pred)
    pr_oof = average_precision_score(y, oof_pred)

    print("\n=== OOF-метрики (по всему датасету) ===")
    print(f"ROC-AUC (OOF): {roc_oof:.4f}")
    print(f"PR-AUC  (OOF): {pr_oof:.4f}")
    # The positive-class rate is reported as the PR-AUC baseline.
    baseline_pr = y.mean()
    print(f"Baseline PR-AUC (random): {baseline_pr:.4f}")

    return models, oof_pred, fold_metrics, roc_oof, pr_oof


def compute_threshold_table(y_true, y_score, n_points=501):
    """
    Compute precision / recall / F1 on an evenly spaced threshold grid over [0, 1].

    A sample is predicted positive when ``y_score >= threshold``. Inputs are
    converted to NumPy arrays first, so plain lists are accepted and pandas
    Series are compared positionally instead of being aligned on their index.

    Args:
        y_true: Array-like of 0/1 labels.
        y_score: Array-like of scores, same length as ``y_true``.
        n_points: Number of grid points, endpoints 0 and 1 included.

    Returns:
        DataFrame with columns ``threshold``, ``precision``, ``recall`` and
        ``f1``, one row per threshold in ascending order. Each metric is 0.0
        when its denominator is zero.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score, dtype=float)

    # Loop-invariant masks and counts, computed once instead of per threshold.
    actual_pos = y_true == 1
    actual_neg = y_true == 0
    n_pos = actual_pos.sum()

    rows = []
    for t in np.linspace(0, 1, n_points):
        predicted_pos = y_score >= t
        tp = (actual_pos & predicted_pos).sum()
        fp = (actual_neg & predicted_pos).sum()
        fn = n_pos - tp

        precision = tp / (tp + fp) if tp + fp != 0 else 0.0
        recall = tp / (tp + fn) if tp + fn != 0 else 0.0
        if precision + recall == 0:
            f1 = 0.0
        else:
            f1 = 2 * precision * recall / (precision + recall)

        rows.append({
            "threshold": t,
            "precision": precision,
            "recall": recall,
            "f1": f1
        })

    return pd.DataFrame(rows)


def select_strategy_thresholds(df_th, y_true, y_score):
    """
    Pick thresholds for the Aggressive / Balanced / Friendly strategies.

    - Aggressive: the LARGEST threshold that still reaches recall >= 0.95.
      Recall only falls as the threshold rises and a grid starting at 0 always
      has recall 1.0 there, so taking the smallest qualifying threshold would
      always return 0.0 (flag every transaction). Falls back to the first row
      of ``df_th`` when no threshold reaches the recall target.
    - Balanced: the threshold with the maximum F1.
    - Friendly: the largest threshold with precision >= 0.98. Falls back to
      the last row of ``df_th`` when no threshold reaches that precision.

    Args:
        df_th: Threshold table with columns ``threshold``, ``precision``,
            ``recall`` and ``f1``.
        y_true: Unused; kept for interface compatibility.
        y_score: Unused; kept for interface compatibility.

    Returns:
        dict mapping strategy name to the selected row (a Series) of ``df_th``.
    """
    # Aggressive: highest threshold that keeps the recall target.
    df_aggr = df_th[df_th["recall"] >= 0.95]
    if len(df_aggr) > 0:
        row_aggr = df_aggr.loc[df_aggr["threshold"].idxmax()]
    else:
        row_aggr = df_th.iloc[0]

    # Balanced: best F1.
    row_bal = df_th.loc[df_th["f1"].idxmax()]

    # Friendly: highest threshold that keeps the precision target. Selecting
    # by threshold value rather than by position also works on an unsorted table.
    df_friendly = df_th[df_th["precision"] >= 0.98]
    if len(df_friendly) > 0:
        row_friend = df_friendly.loc[df_friendly["threshold"].idxmax()]
    else:
        row_friend = df_th.iloc[-1]

    strategies = {
        "Aggressive": row_aggr,
        "Balanced": row_bal,
        "Friendly": row_friend
    }

    print("\n=== Пороги стратегий по OOF ===")
    for name, row in strategies.items():
        print(
            f"{name}: threshold={row['threshold']:.3f}, "
            f"precision={row['precision']:.3f}, "
            f"recall={row['recall']:.3f}, "
            f"f1={row['f1']:.3f}"
        )

    return strategies


# -----------------------------
# 6. OOF training of CatBoost (Fast Gate v11, variant F)
# -----------------------------
_oof_result = train_oof_catboost(
    X=X,
    y=y,
    feature_cols=feature_cols,
    cat_indices=cat_feature_indices,
    n_folds=N_FOLDS,
    random_state=RANDOM_STATE,
)
models_cv, oof_pred, fold_metrics, roc_oof, pr_oof = _oof_result

# Threshold table over the OOF scores, then the best-F1 row for the log.
df_th = compute_threshold_table(y_true=y, y_score=oof_pred, n_points=501)
best_idx = df_th["f1"].idxmax()
best_row = df_th.loc[best_idx]
print("\nЛучший F1 по OOF:")
print(
    "  threshold={:.3f}, precision={:.3f}, recall={:.3f}, f1={:.3f}".format(
        best_row["threshold"],
        best_row["precision"],
        best_row["recall"],
        best_row["f1"],
    )
)

# Per-strategy threshold rows, later written to the thresholds config.
strategy_rows = select_strategy_thresholds(df_th, y_true=y, y_score=oof_pred)

# -----------------------------
# 7. Train the final model on the full dataset
# -----------------------------
pos_full = (y == 1).sum()
neg_full = (y == 0).sum()
# Negatives per positive; max(..., 1) avoids dividing by zero.
scale_pos_weight_full = neg_full / max(pos_full, 1)

print("\nПолный датасет:")
print("  positives: {} negatives: {}".format(pos_full, neg_full))
print("  scale_pos_weight (full): {:.2f}".format(scale_pos_weight_full))

# Use the average tree count of the CV models as the iteration budget.
best_tree_counts = [fold_model.tree_count_ for fold_model in models_cv]
_mean_trees = np.mean(best_tree_counts)
avg_trees = int(np.round(_mean_trees))
print("Среднее число деревьев по фолдам: {}".format(avg_trees))

# Never train fewer than 300 iterations.
final_iterations = max(avg_trees, 300)

full_pool = Pool(X, label=y, cat_features=cat_feature_indices or None)

print("\n=== Обучение финальной модели Fast Gate v11 на всём датасете ===")
_final_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "learning_rate": 0.05,
    "depth": 6,
    "l2_leaf_reg": 5,
    "iterations": final_iterations,
    "random_seed": RANDOM_STATE,
    "scale_pos_weight": scale_pos_weight_full,
    "border_count": 254,
    "task_type": "CPU",
    "verbose": 200,
}
model_final = CatBoostClassifier(**_final_params)

model_final.fit(full_pool)

# Persist the trained model.
print("\nСохранение модели CatBoost Fast Gate v11 в {}".format(MODEL_PATH))
model_final.save_model(MODEL_PATH)

# -----------------------------
# 8. Feature schema for the backend (features_schema_v11.json)
# -----------------------------
features_schema = {
    "version": "fast_gate_v11_F_full_without_node2vec",
    "n_features": len(feature_cols),
    "feature_names": feature_cols,
    "cat_feature_names": categorical_feature_names,
    "cat_feature_indices": cat_feature_indices,
    # One entry per model feature, in model input order.
    "features": [
        {
            "name": col,
            "dtype": str(df[col].dtype),
            "role": "categorical" if col in categorical_feature_names else "numeric"
        }
        for col in feature_cols
    ]
}

with open(FEATURES_SCHEMA_PATH, "w", encoding="utf-8") as f:
    json.dump(features_schema, f, ensure_ascii=False, indent=2)
# Reported only after the write succeeded, so the log never claims a file
# that was not actually written.
print(f"\nСхема фичей сохранена в {FEATURES_SCHEMA_PATH}")

# -----------------------------
# 9. Strategy thresholds (strategy_thresholds_v11.json)
# -----------------------------
# Human-readable strategy descriptions, built once instead of per iteration.
strategy_descriptions = {
    "Aggressive": "Максимальный recall, готовы терпеть больше ложных срабатываний.",
    "Balanced": "Компромисс между precision и recall (максимум F1 по OOF).",
    "Friendly": "Максимальный precision, минимизация FP, но меньше найденного фрода."
}

strategy_thresholds = {
    "version": "fast_gate_v11_F_full_without_node2vec",
    "metric": "risk_score_fast_gate_v11",
    "oof_metrics": {
        "roc_auc": float(roc_oof),
        "pr_auc": float(pr_oof),
        "baseline_pr": float(y.mean())
    },
    "strategies": {}
}

for name, row in strategy_rows.items():
    # float(...) turns NumPy scalars into JSON-serialisable Python floats.
    strategy_thresholds["strategies"][name] = {
        "threshold": float(row["threshold"]),
        "description": strategy_descriptions[name],
        "expected_metrics_oof": {
            "precision": float(row["precision"]),
            "recall": float(row["recall"]),
            "f1": float(row["f1"])
        }
    }

with open(STRATEGY_THRESHOLDS_PATH, "w", encoding="utf-8") as f:
    json.dump(strategy_thresholds, f, ensure_ascii=False, indent=2)
print(f"\nПороги стратегий сохранены в {STRATEGY_THRESHOLDS_PATH}")

# -----------------------------
# 10. Time-based holdout (as in v2), reported as demonstration metrics
# -----------------------------
print("\n================ Time-based holdout evaluation (demo) ================")
# NOTE(review): sort_values is called with its default sort kind, so rows with
# equal transdatetime may not keep a reproducible order — confirm whether a
# stable sort is needed for the split boundary.
df_sorted = df.sort_values("transdatetime").reset_index(drop=True)

# First 80% of the rows in time order are the training part; the tail is the holdout.
split_idx = int(len(df_sorted) * 0.8)
train_mask = np.zeros(len(df_sorted), dtype=bool)
train_mask[:split_idx] = True

X_tb_train = df_sorted.loc[train_mask, feature_cols]
y_tb_train = df_sorted.loc[train_mask, target_col].values

X_tb_val = df_sorted.loc[~train_mask, feature_cols]
y_tb_val = df_sorted.loc[~train_mask, target_col].values

# Class-imbalance weight from the training part only.
pos_tb = (y_tb_train == 1).sum()
neg_tb = (y_tb_train == 0).sum()
scale_pos_weight_tb = neg_tb / max(pos_tb, 1)

print(f"Сплит по времени:")
print(f"  split_idx: {split_idx}")
print(f"  train size: {len(X_tb_train)} valid size: {len(X_tb_val)}")
print("train target dist:")
print(pd.Series(y_tb_train).value_counts())
print("valid target dist:")
print(pd.Series(y_tb_val).value_counts())

pool_tb_train = Pool(
    X_tb_train,
    label=y_tb_train,
    cat_features=cat_feature_indices if cat_feature_indices else None
)
pool_tb_val = Pool(
    X_tb_val,
    label=y_tb_val,
    cat_features=cat_feature_indices if cat_feature_indices else None
)

model_tb = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5,
    iterations=2000,
    random_seed=RANDOM_STATE,
    scale_pos_weight=scale_pos_weight_tb,
    border_count=254,
    task_type="CPU",
    verbose=200,
    od_type="Iter",
    od_wait=100
)

print("\n=== Обучение модели на раннем отрезке и валидация на хвосте (time-based) ===")
# NOTE(review): the holdout is also the eval_set used for best-model
# selection, so the metrics below are likely optimistic — confirm this is
# acceptable for a "demo" figure.
model_tb.fit(
    pool_tb_train,
    eval_set=pool_tb_val,
    use_best_model=True
)

val_pred_tb = model_tb.predict_proba(pool_tb_val)[:, 1]
roc_tb = roc_auc_score(y_tb_val, val_pred_tb)
pr_tb = average_precision_score(y_tb_val, val_pred_tb)

print("\n=== Метрики time-based holdout (демонстрационные) ===")
print(f"ROC-AUC (time-based): {roc_tb:.4f}")
print(f"PR-AUC  (time-based): {pr_tb:.4f}")
print("Baseline PR-AUC (random): {:.4f}".format(y_tb_val.mean()))

# The best-F1 threshold is searched on the holdout scores themselves.
df_th_tb = compute_threshold_table(y_true=y_tb_val, y_score=val_pred_tb, n_points=501)
idx_best_tb = df_th_tb["f1"].idxmax()
row_best_tb = df_th_tb.loc[idx_best_tb]
print(
    "Лучший F1 (time-based): "
    f"threshold={row_best_tb['threshold']:.3f}, "
    f"precision={row_best_tb['precision']:.3f}, "
    f"recall={row_best_tb['recall']:.3f}, "
    f"f1={row_best_tb['f1']:.3f}"
)

print("\n✅ Fast Gate v11 (F_full_without_node2vec) обучен, модель и конфиги сохранены.")


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Пробуем прочитать Parquet: data/processed/features_offline_v11.parquet
✅ Успешно прочитан как Parquet.
Всего строк: 13113
Колонок: 192
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d'] ...

Распределение target по всему датасету:
