In [None]:
import os
import json
import numpy as np
import pandas as pd

from dataclasses import dataclass
from typing import List, Dict, Tuple

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_fscore_support

import warnings
warnings.filterwarnings("ignore")

try:
    from catboost import CatBoostClassifier, Pool
except ImportError:
    !pip install -q catboost
    from catboost import CatBoostClassifier, Pool

# Seed NumPy's global RNG; the same constant also seeds the CV splitter and CatBoost below.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Input feature store (it is overwritten at the end of this cell) and the output artifacts.
DATA_PARQUET = "data/processed/features_offline_v11.parquet"
MODEL_PATH = "models/graph_brain_v11.cbm"
FEATURES_SCHEMA_PATH = "config/graph_brain_features_v11.json"
THRESHOLDS_PATH = "config/graph_brain_thresholds_v11.json"
ABLATION_METRICS_PATH = "data/processed/graph_brain_ablation_v11.parquet"
FI_PATH = "data/processed/graph_brain_feature_importance_v11.parquet"

# Create the output directories up front so later writes cannot fail on a missing folder.
os.makedirs("models", exist_ok=True)
os.makedirs("config", exist_ok=True)
os.makedirs("data/processed", exist_ok=True)

print("DEVICE: cpu (Graph Brain v11)")


# Load the offline feature store and print a quick shape/column overview.
print(f"Пробуем прочитать Parquet: {DATA_PARQUET}")
df = pd.read_parquet(DATA_PARQUET)
print("Успешно прочитан как Parquet.")
print("Всего строк:", len(df))
print("Колонок:", df.shape[1])
print("Первые колонки:", df.columns[:25].tolist(), "...")

# The label column must be present; everything below depends on it.
target_col = "target"
assert target_col in df.columns, "Ожидаем колонку target в feature store"

# Class balance: value counts plus the mean of the label (printed as the fraud share).
print("\nРаспределение target по всему датасету:")
print(df[target_col].value_counts())
fraud_share = df[target_col].mean()
print("Доля фрода:", fraud_share)


all_cols = df.columns.tolist()

# Hand-picked graph features; only the ones actually present in the store are kept.
graph_core_candidates = [
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "one_to_many_flag",
    "many_to_one_flag",
    "user_new_dirs_60m",
    "dir_tx_60m",
    "dir_unique_senders_60m",
]

graph_simple_cols = [c for c in graph_core_candidates if c in all_cols]

# Embedding columns are discovered by prefix and sorted so the feature order is stable.
emb_cst_cols = sorted([c for c in all_cols if c.startswith("emb_cst_")])
emb_dir_cols = sorted([c for c in all_cols if c.startswith("emb_dir_")])

print("Размеры групп фич:")
print("  graph_simple_cols:", len(graph_simple_cols))
print("  emb_cst_cols:", len(emb_cst_cols))
print("  emb_dir_cols:", len(emb_dir_cols))

# Ablation variants: graph features only, graph + embeddings, embeddings only.
feature_sets: Dict[str, List[str]] = {
    "G1_graph_simple": graph_simple_cols,
    "G2_graph_simple+emb": graph_simple_cols + emb_cst_cols + emb_dir_cols,
    "G3_emb_only": emb_cst_cols + emb_dir_cols,
}

# Drop variants that ended up with no columns (e.g. the store has no embeddings).
feature_sets = {k: v for k, v in feature_sets.items() if len(v) > 0}

print("\nВарианты фич для Graph Brain:")
for name, cols in feature_sets.items():
    print(f"  {name}: {len(cols)} фич")

def compute_pr_table(
    y_true: np.ndarray,
    y_score: np.ndarray,
    n_thresholds: int = 101,
) -> pd.DataFrame:
    """
    Build a precision/recall/F1 table over an evenly spaced threshold grid.

    The grid is ``np.linspace(0.0, 1.0, n_thresholds)``; a row is predicted
    positive when its score is >= the threshold. Thresholds at which nothing
    is predicted positive get precision = recall = f1 = 0.0.

    Returns a DataFrame with one row per threshold and the columns
    ``threshold``, ``precision``, ``recall``, ``f1``.
    """

    def _metrics_at(cutoff):
        # Binarise the scores at this cutoff.
        flagged = (y_score >= cutoff).astype(int)
        if not flagged.any():
            # Nothing flagged: report zeros without calling sklearn.
            return 0.0, 0.0, 0.0
        precision, recall, f_score, _ = precision_recall_fscore_support(
            y_true, flagged, average="binary", zero_division=0
        )
        return precision, recall, f_score

    records = []
    for cutoff in np.linspace(0.0, 1.0, n_thresholds):
        precision, recall, f_score = _metrics_at(cutoff)
        records.append(
            dict(threshold=cutoff, precision=precision, recall=recall, f1=f_score)
        )
    return pd.DataFrame(records)


def pick_strategy_thresholds(
    y_true: np.ndarray,
    y_score: np.ndarray,
    aggressive_min_recall: float = 0.95,
) -> Dict[str, Dict[str, float]]:
    """
    Pick decision thresholds for three operating strategies.

    All candidates come from ``compute_pr_table`` on a 401-point grid.

    - aggressive: the highest threshold whose recall is still at least
      ``aggressive_min_recall`` (catch almost everything while flagging as
      little as possible). If no grid point reaches that recall, the point
      with the maximum recall is used.
    - balanced: the threshold with the maximum F1.
    - friendly: the threshold with the maximum precision among points with
      recall > 0; falls back to the balanced point if there are none.

    Args:
        y_true: binary ground-truth labels.
        y_score: predicted scores, compared against thresholds in [0, 1].
        aggressive_min_recall: recall floor for the aggressive strategy.

    Returns:
        ``{"aggressive" | "balanced" | "friendly": {"threshold", "precision",
        "recall", "f1"}}`` with plain Python floats (JSON-serialisable).
    """
    tbl = compute_pr_table(y_true, y_score, n_thresholds=401)

    def _as_payload(row) -> Dict[str, float]:
        # Convert NumPy scalars to plain floats so the result can go to json.dump.
        return {
            key: float(row[key])
            for key in ("threshold", "precision", "recall", "f1")
        }

    # Balanced: global F1 maximum.
    row_bal = tbl.loc[tbl["f1"].idxmax()]

    # Aggressive: recall does not increase as the threshold grows, so the LAST
    # row (ascending threshold) that still meets the recall floor is the
    # tightest threshold that keeps the required recall. Taking the first row
    # would always yield the lowest grid threshold, i.e. flag every sample.
    cand_aggr = tbl[tbl["recall"] >= aggressive_min_recall].sort_values("threshold")
    if len(cand_aggr) > 0:
        row_aggr = cand_aggr.iloc[-1]
    else:
        row_aggr = tbl.loc[tbl["recall"].idxmax()]

    # Friendly: maximum precision while still catching at least something.
    cand_friendly = tbl[tbl["recall"] > 0]
    if len(cand_friendly) > 0:
        row_fr = cand_friendly.loc[cand_friendly["precision"].idxmax()]
    else:
        row_fr = row_bal

    return {
        "aggressive": _as_payload(row_aggr),
        "balanced": _as_payload(row_bal),
        "friendly": _as_payload(row_fr),
    }


@dataclass
class VariantResult:
    """Out-of-fold summary for one feature-set variant of the ablation.

    Instances are collected in a list and converted to a DataFrame through
    ``vars()``, so the field names become the column names of the report.
    """

    # Key of the variant in `feature_sets`.
    name: str
    # Number of feature columns in the variant.
    n_features: int
    # ROC-AUC computed on the out-of-fold predictions.
    oof_roc_auc: float
    # Average precision (PR-AUC) computed on the out-of-fold predictions.
    oof_pr_auc: float
    # Threshold on the grid that maximises F1 on the out-of-fold predictions.
    best_f1_threshold: float
    # Precision, recall and F1 measured at `best_f1_threshold`.
    best_f1_precision: float
    best_f1_recall: float
    best_f1: float



# Ablation: for every feature-set variant run 5-fold stratified CV with CatBoost,
# collect out-of-fold (OOF) predictions and summarise them in a VariantResult.
X_full = df  # alias of the loaded frame (not a copy); columns are selected per variant below
y = df[target_col].values

n_samples = len(df)
print(f"Размер X_full: {X_full.shape}, целевая переменная: {np.bincount(y)}")

# One splitter shared by all variants; with a fixed random_state the folds are
# expected to be the same for every variant.
skf = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=RANDOM_STATE
)

# Per-variant outputs: summary rows, OOF score vectors and the fitted fold models.
variant_results: List[VariantResult] = []
variant_oof_scores: Dict[str, np.ndarray] = {}
variant_best_models_per_fold: Dict[str, List[CatBoostClassifier]] = {}

for variant_name, cols in feature_sets.items():
    print("\n" + "=" * 80)
    print(f"Вариант Graph Brain: {variant_name}")
    print(f"Число фич: {len(cols)}")

    # Missing values are replaced with 0.0 before the matrix is handed to CatBoost.
    X = X_full[cols].copy().fillna(0.0)
    X_values = X.values

    # Each row receives exactly one prediction: from the fold where it was held out.
    oof_pred = np.zeros(n_samples, dtype=float)
    fold_models: List[CatBoostClassifier] = []

    fold_metrics = []

    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_values, y), start=1):
        X_train, X_valid = X_values[train_idx], X_values[valid_idx]
        y_train, y_valid = y[train_idx], y[valid_idx]

        # Positive-class weight = negatives / positives on the training part;
        # max(..., 1) avoids division by zero if a fold has no positives.
        pos_train = (y_train == 1).sum()
        neg_train = (y_train == 0).sum()
        scale_pos_weight = neg_train / max(pos_train, 1)
        print(f"\n=== Fold {fold}/5 ({variant_name}) ===")
        print(f"  fold pos={pos_train}, neg={neg_train}, scale_pos_weight={scale_pos_weight:.2f}")

        train_pool = Pool(X_train, y_train)
        valid_pool = Pool(X_valid, y_valid)

        # The seed is shifted by the fold number so each fold model is seeded differently.
        model = CatBoostClassifier(
            loss_function="Logloss",
            eval_metric="AUC",
            iterations=1200,
            learning_rate=0.03,
            depth=6,
            l2_leaf_reg=5.0,
            random_seed=RANDOM_STATE + fold,
            bootstrap_type="Bayesian",
            bagging_temperature=0.25,
            border_count=128,
            class_weights=[1.0, scale_pos_weight],
            od_type="Iter",
            od_wait=100,
            thread_count=-1,
            verbose=100,
        )

        # NOTE(review): the validation fold is used both as eval_set for
        # use_best_model and for the OOF score, so OOF metrics may be
        # somewhat optimistic — confirm this is acceptable.
        model.fit(
            train_pool,
            eval_set=valid_pool,
            use_best_model=True,
            verbose=100,
        )

        # Probability of the positive class for the held-out rows.
        pred_valid = model.predict_proba(valid_pool)[:, 1]
        oof_pred[valid_idx] = pred_valid
        fold_models.append(model)

        roc = roc_auc_score(y_valid, pred_valid)
        pr = average_precision_score(y_valid, pred_valid)
        fold_metrics.append((roc, pr))

        print(f"  Fold ROC-AUC={roc:.4f}, PR-AUC={pr:.4f}")

    # Metrics over the full OOF vector; the positive rate is printed as the
    # PR-AUC of a random scorer for comparison.
    oof_roc = roc_auc_score(y, oof_pred)
    oof_pr = average_precision_score(y, oof_pred)
    baseline_pr = y.mean()
    print(f"\n=== OOF-метрики для варианта {variant_name} ===")
    print(f"ROC-AUC (OOF): {oof_roc:.4f}")
    print(f"PR-AUC  (OOF): {oof_pr:.4f}")
    print(f"Baseline PR-AUC (random): {baseline_pr:.6f}")

    # Best-F1 operating point on a 401-step threshold grid over the OOF scores.
    thr_tbl = compute_pr_table(y, oof_pred, n_thresholds=401)
    idx_best_f1 = thr_tbl["f1"].idxmax()
    row_best = thr_tbl.loc[idx_best_f1]
    best_thr = float(row_best["threshold"])
    best_prec = float(row_best["precision"])
    best_rec = float(row_best["recall"])
    best_f1 = float(row_best["f1"])
    print("\nЛучший F1 по OOF для варианта", variant_name)
    print(
        f"  threshold={best_thr:.3f}, "
        f"precision={best_prec:.3f}, recall={best_rec:.3f}, f1={best_f1:.3f}"
    )

    # Store everything needed later to compare variants and pick the winner.
    variant_results.append(
        VariantResult(
            name=variant_name,
            n_features=len(cols),
            oof_roc_auc=oof_roc,
            oof_pr_auc=oof_pr,
            best_f1_threshold=best_thr,
            best_f1_precision=best_prec,
            best_f1_recall=best_rec,
            best_f1=best_f1,
        )
    )
    variant_oof_scores[variant_name] = oof_pred
    variant_best_models_per_fold[variant_name] = fold_models


# Rank the variants by OOF PR-AUC (descending); the first row is the winner.
res_df = pd.DataFrame([vars(r) for r in variant_results])
res_df = res_df.sort_values("oof_pr_auc", ascending=False).reset_index(drop=True)
print("\n================ ИТОГОВАЯ СВОДКА ПО GRAPH-вариантам (сортировка по OOF PR-AUC) ================")
print(res_df)

res_df.to_parquet(ABLATION_METRICS_PATH, index=False)
print(f"\nOOF-метрики по вариантам сохранены в {ABLATION_METRICS_PATH}")

best_variant_name = res_df.iloc[0]["name"]
best_oof_pr = res_df.iloc[0]["oof_pr_auc"]
best_oof_roc = res_df.iloc[0]["oof_roc_auc"]
print(f"\nЛучший вариант Graph Brain: {best_variant_name}")
print(f"  OOF PR-AUC: {best_oof_pr:.4f}")
print(f"  OOF ROC-AUC: {best_oof_roc:.4f}")

best_feature_cols = feature_sets[best_variant_name]
best_oof_pred = variant_oof_scores[best_variant_name]


# Strategy thresholds are chosen on the OOF scores of the winning variant.
graph_thresholds = pick_strategy_thresholds(y, best_oof_pred)

print("\n=== Пороги стратегий по OOF для Graph Brain v11 ===")
for name, info in graph_thresholds.items():
    print(
        f"{name.capitalize()}: threshold={info['threshold']:.3f}, "
        f"precision={info['precision']:.3f}, recall={info['recall']:.3f}, f1={info['f1']:.3f}"
    )


# Refit on the full dataset with the winning feature set (same NaN -> 0.0 fill as in CV).
X_best = X_full[best_feature_cols].copy().fillna(0.0)
X_best_values = X_best.values

pos_full = int((y == 1).sum())
neg_full = int((y == 0).sum())
scale_pos_weight_full = neg_full / max(pos_full, 1)

print("\nПолный датасет для Graph Brain:")
print(f"  positives: {pos_full} negatives: {neg_full}")
print(f"  scale_pos_weight (full): {scale_pos_weight_full:.2f}")
print(f"  Число фич лучшего варианта ({best_variant_name}): {len(best_feature_cols)}")

full_pool = Pool(X_best_values, y)

# NOTE(review): fit() below receives no eval_set, so od_type/od_wait presumably
# have nothing to monitor here — confirm against the CatBoost documentation.
final_model = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    iterations= int(1.2 * 300),  # fixed budget of 360 trees; not derived from the CV runs above
    learning_rate=0.03,
    depth=6,
    l2_leaf_reg=5.0,
    random_seed=RANDOM_STATE,
    bootstrap_type="Bayesian",
    bagging_temperature=0.25,
    border_count=128,
    class_weights=[1.0, scale_pos_weight_full],
    od_type="Iter",
    od_wait=100,
    thread_count=-1,
    verbose=100,
)

final_model.fit(full_pool, verbose=100)
print("\nФинальная Graph Brain модель обучена.")


# Persist the model, the feature schema and the thresholds.
final_model.save_model(MODEL_PATH)
print(f"Модель Graph Brain v11 сохранена в {MODEL_PATH}")

schema = {
    "version": "v11",
    "best_variant": best_variant_name,
    "feature_cols": best_feature_cols,
    "n_features": len(best_feature_cols),
    "target_col": target_col,
}
with open(FEATURES_SCHEMA_PATH, "w", encoding="utf-8") as f:
    json.dump(schema, f, ensure_ascii=False, indent=2)
print(f"Схема фич Graph Brain сохранена в {FEATURES_SCHEMA_PATH}")

thresholds_payload = {
    "version": "v11",
    "best_variant": best_variant_name,
    "oof_roc_auc": float(best_oof_roc),
    "oof_pr_auc": float(best_oof_pr),
    "thresholds": graph_thresholds,
}
with open(THRESHOLDS_PATH, "w", encoding="utf-8") as f:
    json.dump(thresholds_payload, f, ensure_ascii=False, indent=2)
print(f"Пороги Graph Brain сохранены в {THRESHOLDS_PATH}")


# Feature importances of the final model, sorted from most to least important.
fi = final_model.get_feature_importance(full_pool, type="FeatureImportance")
fi_df = pd.DataFrame(
    {
        "feature": best_feature_cols,
        "importance": fi,
    }
).sort_values("importance", ascending=False).reset_index(drop=True)

print("\nТОП-20 фич Graph Brain v11:")
print(fi_df.head(20))

fi_df.to_parquet(FI_PATH, index=False)
print(f"Важности фич Graph Brain сохранены в {FI_PATH}")


# Scores of the final model on the very rows it was trained on (in-sample).
full_pred = final_model.predict_proba(full_pool)[:, 1]

# Two new columns: the OOF score and the in-sample score of the final model.
df["risk_graph_oof_v11"] = best_oof_pred
df["risk_graph_score_v11"] = full_pred

# This overwrites the input feature store in place.
df.to_parquet(DATA_PARQUET, index=False)
print(f"\nОбновлённый feature store с Graph Brain v11 сохранён в {DATA_PARQUET}")

print("\nГотово: Graph Brain v11 построен и интегрирован.")
print(" - Лучшая конфигурация фич:", best_variant_name)
print(" - OOF ROC-AUC:", round(best_oof_roc, 4))
print(" - OOF PR-AUC:", round(best_oof_pr, 4))

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hDEVICE: cpu (Graph Brain v11)
Пробуем прочитать Parquet: data/processed/features_offline_v11.parquet
✅ Успешно прочитан как Parquet.
Всего строк: 13113
Колонок: 198
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d', 'sess_burstiness_login_interval', 'sess_fano_login_interval', 'sess_z_login_interval_7d', 'sess_has_login_history', 'sess_last_phone_model'] ...

Распределение target по всему датасету:
target
0    12948
1      165
Name: count, dtype: int64
Доля фрода: 0.0125829329672843

In [None]:
# 03_graph_brain_v11.ipynb

# ============================================
# Graph Brain v11 — graph + velocity + user agg
# ============================================


!pip install catboost


import os
import json
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_curve,
)

from catboost import CatBoostClassifier, Pool

# ---------------------------
# 0. General settings
# ---------------------------

# Input feature store (rewritten at the end of this cell) and output artifacts.
FEATURES_PARQUET = "data/processed/features_offline_v11.parquet"
GRAPH_MODEL_PATH = "models/graph_brain_v11.cbm"
GRAPH_FEATURES_CONFIG = "config/graph_brain_features_v11.json"
GRAPH_THRESHOLDS_CONFIG = "config/graph_brain_thresholds_v11.json"

os.makedirs("models", exist_ok=True)
os.makedirs("config", exist_ok=True)

RANDOM_STATE = 42
N_FOLDS = 5

print("DEVICE: cpu (Graph Brain v11)")

# ---------------------------
# 1. Load the feature store
# ---------------------------

print(f"Пробуем прочитать Parquet: {FEATURES_PARQUET}")
df = pd.read_parquet(FEATURES_PARQUET)
print("✅ Успешно прочитан как Parquet.")

print(f"Всего строк: {len(df)}")
print(f"Колонок: {df.shape[1]}")
print("Первые колонки:", list(df.columns[:25]), "...")

# Target and basic statistics
target_col = "target"
y = df[target_col].values

print("\nРаспределение target по всему датасету:")
print(df[target_col].value_counts())
print("Доля фрода:", df[target_col].mean())

# ---------------------------
# 2. Features for Graph Brain v11
#    (base + graph, without sess and without node2vec/AE/meta)
# ---------------------------

# Numeric columns
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Explicitly excluded columns
# NOTE(review): other id-like columns of the store (e.g. cst_dim_id, docno) are
# not listed here; if they are numeric they end up in base_cols — confirm intended.
id_cols = [
    target_col,
    "row_id",
]

# Meta/brain predictions that must not be fed into Graph Brain
meta_prefixes = (
    "fast_",          # fast_gate_proba / fast_gate_oof and the like
    "risk_",          # risk_ae, risk_meta and other risk_* columns
    "ae_",            # ae_*
    "autoencoder_",   # just in case
    "graph_brain_",   # older Graph Brain versions
    "seq_",           # sequence brain, if present
    "meta_",          # meta model, if present
)

meta_cols = [c for c in numeric_cols if c.startswith(meta_prefixes)]

# node2vec embeddings
emb_cst_cols = [c for c in numeric_cols if c.startswith("emb_cst_")]
emb_dir_cols = [c for c in numeric_cols if c.startswith("emb_dir_")]

# Session features
sess_cols = [c for c in numeric_cols if c.startswith("sess_")]

# Everything that may be a candidate for Graph Brain
candidate_cols = [
    c for c in numeric_cols
    if c not in id_cols
    and c not in meta_cols
]

# "Simple" (key) graph features
graph_simple_list = [
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "one_to_many_flag",
    "many_to_one_flag",
]

graph_simple_cols = [c for c in candidate_cols if c in graph_simple_list]

# The final Graph Brain feature set contains no emb_* or sess_* columns.
# base_cols = everything else, except emb, sess and the explicit graph_simple columns
base_cols = [
    c for c in candidate_cols
    if c not in emb_cst_cols
    and c not in emb_dir_cols
    and c not in sess_cols
    and c not in graph_simple_cols
]

# Sorted so that the column order of the model input is deterministic.
graph_brain_feature_cols = sorted(base_cols + graph_simple_cols)

print("\nРазмеры групп фич:")
print(f"  base_cols: {len(base_cols)}")
print(f"  graph_simple_cols: {len(graph_simple_cols)}")
print(f"  emb_cst_cols (исключены): {len(emb_cst_cols)}")
print(f"  emb_dir_cols (исключены): {len(emb_dir_cols)}")
print(f"  sess_cols (исключены): {len(sess_cols)}")

print("\nФинальные фичи для Graph Brain v11 (пример):")
print(graph_brain_feature_cols[:40], "...")
print(f"Всего фич для Graph Brain v11: {len(graph_brain_feature_cols)}")

# Save the feature schema (used and excluded groups) next to the model config.
graph_features_cfg = {
    "version": "v11",
    "description": "Graph Brain v11 — base + graph_simple, без sess и node2vec/meta",
    "graph_brain_feature_cols": graph_brain_feature_cols,
    "groups": {
        "base_cols": base_cols,
        "graph_simple_cols": graph_simple_cols,
        "excluded": {
            "emb_cst_cols": emb_cst_cols,
            "emb_dir_cols": emb_dir_cols,
            "sess_cols": sess_cols,
            "meta_cols": meta_cols,
            "id_cols": id_cols,
        },
    },
}

with open(GRAPH_FEATURES_CONFIG, "w", encoding="utf-8") as f:
    json.dump(graph_features_cfg, f, ensure_ascii=False, indent=2)

print(f"\nСхема фич Graph Brain сохранена в {GRAPH_FEATURES_CONFIG}")

# Model input as a plain NumPy matrix; missing values are not filled here.
X = df[graph_brain_feature_cols].values
print("\nРазмер X:", X.shape)
print("Классы y:", np.bincount(y))

# ---------------------------
# 3. Вспомогательные функции
# ---------------------------

def compute_oof_metrics(y_true, y_pred, prefix=""):
    """
    Print and return ROC-AUC and PR-AUC (average precision) of the scores.

    Every printed line starts with ``prefix``. The mean of ``y_true`` is
    printed as the PR-AUC of a random scorer for comparison.

    Returns a ``(roc_auc, pr_auc)`` tuple.
    """
    roc_auc = roc_auc_score(y_true, y_pred)
    avg_precision = average_precision_score(y_true, y_pred)

    print(f"{prefix}ROC-AUC: {roc_auc:.4f}")
    print(f"{prefix}PR-AUC : {avg_precision:.4f}")
    print(f"{prefix}Baseline PR-AUC (random): {y_true.mean():.4f}")

    return roc_auc, avg_precision

def find_best_f1_threshold(y_true, y_pred):
    """
    Scan the precision-recall curve and return the point with the highest F1.

    Prints the chosen point and returns ``(threshold, precision, recall, f1)``.
    On ties the earliest point on the curve wins.
    """
    precisions, recalls, cutoffs = precision_recall_curve(y_true, y_pred)
    # Pad the thresholds with 1.0 so they line up with the precision/recall arrays.
    cutoffs = np.append(cutoffs, 1.0)

    top_f1 = -1.0
    top_point = (None, None, None)
    for idx, cutoff in enumerate(cutoffs):
        p_val = precisions[idx]
        r_val = recalls[idx]
        denom = p_val + r_val
        if denom == 0:
            # F1 is undefined at this point; skip it.
            continue
        f1_val = 2 * p_val * r_val / denom
        if f1_val > top_f1:
            top_f1 = f1_val
            top_point = (cutoff, p_val, r_val)

    best_t, best_p, best_r = top_point
    print(f"\nЛучший F1 по OOF:")
    print(f"  threshold={best_t:.3f}, precision={best_p:.3f}, recall={best_r:.3f}, f1={top_f1:.3f}")
    return best_t, best_p, best_r, top_f1

def pick_strategy_thresholds(y_true, y_pred):
    """
    Derive three operating thresholds from the precision-recall curve.

      - aggressive: best F1 among curve points with recall >= 0.95
      - balanced  : global best F1 (delegated to find_best_f1_threshold)
      - friendly  : highest recall among points with precision >= 0.99

    Prints the chosen points and returns them as
    ``{strategy: {"threshold", "precision", "recall", "f1"}}`` with plain floats.
    """
    prec, rec, thr = precision_recall_curve(y_true, y_pred)

    # Balanced point (this call also prints the best-F1 line).
    bal_t, bal_p, bal_r, bal_f1 = find_best_f1_threshold(y_true, y_pred)

    # Curve points as (precision, recall, threshold); thresholds are padded
    # with 1.0 so all three arrays have the same length.
    curve = list(zip(prec, rec, np.append(thr, 1.0)))

    # --- Aggressive: maximise F1 subject to a recall floor ---
    min_recall = 0.95
    aggr_f1 = -1
    aggr_t, aggr_p, aggr_r = 0.0, 0.0, 0.0
    for p_val, r_val, cut in curve:
        if r_val < min_recall or p_val + r_val == 0:
            continue
        f1_val = 2 * p_val * r_val / (p_val + r_val)
        if f1_val > aggr_f1:
            aggr_f1 = f1_val
            aggr_t, aggr_p, aggr_r = cut, p_val, r_val

    # No usable point above the recall floor: fall back to the first curve
    # point, reported with threshold 0.0.
    if aggr_f1 < 0:
        aggr_t, aggr_p, aggr_r = 0.0, prec[0], rec[0]
        aggr_f1 = 2 * aggr_p * aggr_r / (aggr_p + aggr_r + 1e-9)

    # --- Friendly: maximise recall subject to a precision floor ---
    min_precision = 0.99
    fr_t, fr_p, fr_r = 1.0, 1.0, 0.0
    for p_val, r_val, cut in curve:
        if p_val >= min_precision and r_val > fr_r:
            fr_t, fr_p, fr_r = cut, p_val, r_val

    # Precision floor never reached with non-zero recall: use the last curve
    # point, reported with threshold 1.0.
    if fr_r == 0.0:
        fr_t, fr_p, fr_r = 1.0, prec[-1], rec[-1]
    fr_f1 = 2 * fr_p * fr_r / (fr_p + fr_r + 1e-9)

    def _entry(cut, p_val, r_val, f1_val):
        # Plain floats so the result can be written with json.dump.
        return {
            "threshold": float(cut),
            "precision": float(p_val),
            "recall": float(r_val),
            "f1": float(f1_val),
        }

    thresholds = {
        "aggressive": _entry(aggr_t, aggr_p, aggr_r, aggr_f1),
        "balanced": _entry(bal_t, bal_p, bal_r, bal_f1),
        "friendly": _entry(fr_t, fr_p, fr_r, fr_f1),
    }

    print("\n=== Пороги стратегий по OOF для Graph Brain v11 ===")
    for strategy, stats in thresholds.items():
        print(
            f"{strategy.capitalize()}: threshold={stats['threshold']:.3f}, "
            f"precision={stats['precision']:.3f}, "
            f"recall={stats['recall']:.3f}, "
            f"f1={stats['f1']:.3f}"
        )

    return thresholds

# ---------------------------
# 4. OOF training of Graph Brain v11
# ---------------------------

print("\n================ OOF-обучение Graph Brain v11 ================")

skf = StratifiedKFold(
    n_splits=N_FOLDS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Each row gets one out-of-fold prediction; best iterations are collected to
# size the final model later.
oof_pred = np.zeros(len(df))
fold_metrics = []
best_iterations = []

for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    y_trn = y[trn_idx]
    y_val = y[val_idx]

    # Positive-class weight = negatives / positives on the training part
    # (the sum counts positives, assuming 0/1 labels); max(..., 1) avoids
    # division by zero.
    pos = y_trn.sum()
    neg = len(y_trn) - pos
    scale_pos_weight = neg / max(pos, 1)

    print(f"\n=== Fold {fold}/{N_FOLDS} (Graph Brain v11) ===")
    print(f"  fold pos={int(pos)}, neg={int(neg)}, scale_pos_weight={scale_pos_weight:.2f}")

    X_trn = X[trn_idx]
    X_val = X[val_idx]

    train_pool = Pool(X_trn, y_trn, feature_names=graph_brain_feature_cols)
    valid_pool = Pool(X_val, y_val, feature_names=graph_brain_feature_cols)

    # The seed is shifted by the fold number so each fold model is seeded differently.
    params = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "custom_metric": ["AUC", "PRAUC"],
        "learning_rate": 0.03,
        "depth": 6,
        "l2_leaf_reg": 5.0,
        "iterations": 2000,
        "random_seed": RANDOM_STATE + fold,
        "logging_level": "Verbose",
        "od_type": "Iter",
        "od_wait": 100,
        "scale_pos_weight": scale_pos_weight,
        "task_type": "CPU",
        "thread_count": -1,
    }

    # NOTE(review): the validation fold is used both as eval_set for
    # use_best_model and for the OOF score, so OOF metrics may be somewhat
    # optimistic — confirm this is acceptable.
    model = CatBoostClassifier(**params)
    model.fit(
        train_pool,
        eval_set=valid_pool,
        verbose=100,
        use_best_model=True,
    )

    # A missing or non-positive best iteration is replaced by the full budget.
    # NOTE(review): a best iteration of 0 is also replaced by 2000 — confirm
    # that this is the intended handling.
    best_iter = model.get_best_iteration()
    if best_iter is None or best_iter <= 0:
        best_iter = params["iterations"]
    best_iterations.append(best_iter)

    # Probability of the positive class for the held-out rows.
    val_pred = model.predict_proba(valid_pool)[:, 1]
    oof_pred[val_idx] = val_pred

    roc_f = roc_auc_score(y_val, val_pred)
    pr_f = average_precision_score(y_val, val_pred)
    fold_metrics.append((fold, roc_f, pr_f))

    print(f"  Fold ROC-AUC={roc_f:.4f}, PR-AUC={pr_f:.4f}")

# OOF metrics
print("\n=== CV по фолдам для Graph Brain v11 ===")
for fold, roc_f, pr_f in fold_metrics:
    print(f"  Fold {fold}: ROC-AUC={roc_f:.4f}, PR-AUC={pr_f:.4f}")

roc_oof, pr_oof = compute_oof_metrics(y, oof_pred, prefix="OOF ")

# Strategy thresholds, chosen on the OOF scores
thresholds = pick_strategy_thresholds(y, oof_pred)

# ---------------------------
# 5. Final model on the full dataset
# ---------------------------

print("\nПолный датасет для Graph Brain:")
print(f"  positives: {int(y.sum())} negatives: {int(len(y) - y.sum())}")
scale_pos_weight_full = (len(y) - y.sum()) / max(y.sum(), 1)
print(f"  scale_pos_weight (full): {scale_pos_weight_full:.2f}")

# Tree budget for the final fit: mean best iteration over the folds plus a
# margin of 50, but never fewer than 200.
avg_best_iter = int(np.mean(best_iterations))
print(f"Среднее число деревьев по фолдам: {avg_best_iter}")

final_iterations = max(avg_best_iter + 50, 200)
print(f"\n=== Обучение финальной Graph Brain модели на всём датасете ===")
print(f"  iterations={final_iterations}")

train_pool_full = Pool(X, y, feature_names=graph_brain_feature_cols)

# Same hyper-parameters as in CV, minus the overfitting detector (there is no
# eval set for the final fit).
final_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "custom_metric": ["AUC", "PRAUC"],
    "learning_rate": 0.03,
    "depth": 6,
    "l2_leaf_reg": 5.0,
    "iterations": final_iterations,
    "random_seed": RANDOM_STATE,
    "logging_level": "Verbose",
    "scale_pos_weight": scale_pos_weight_full,
    "task_type": "CPU",
    "thread_count": -1,
}

final_model = CatBoostClassifier(**final_params)
final_model.fit(train_pool_full, verbose=100)

final_model.save_model(GRAPH_MODEL_PATH)
print(f"\nМодель Graph Brain v11 сохранена в {GRAPH_MODEL_PATH}")

# ---------------------------
# 6. Feature importances
# ---------------------------

importances = final_model.get_feature_importance(type="FeatureImportance")
fi = pd.DataFrame({
    "feature": graph_brain_feature_cols,
    "importance": importances,
}).sort_values("importance", ascending=False)

fi_path = "data/processed/graph_brain_feature_importance_v11.parquet"
os.makedirs(os.path.dirname(fi_path), exist_ok=True)
fi.to_parquet(fi_path, index=False)
print(f"Важности фич Graph Brain сохранены в {fi_path}")

print("\nТОП-20 фич Graph Brain v11:")
print(fi.head(20))

# ---------------------------
# 7. Save thresholds and update the feature store
# ---------------------------

with open(GRAPH_THRESHOLDS_CONFIG, "w", encoding="utf-8") as f:
    json.dump(
        {
            "version": "v11",
            "thresholds": thresholds,
            "oof_metrics": {
                "roc_auc": float(roc_oof),
                "pr_auc": float(pr_oof),
                "baseline_pr": float(y.mean()),
            },
        },
        f,
        ensure_ascii=False,
        indent=2,
    )

print(f"\nПороги Graph Brain сохранены в {GRAPH_THRESHOLDS_CONFIG}")

# Recompute the final model's predictions over the whole feature store
# (the file is read again from disk rather than reusing df).
df_graph = pd.read_parquet(FEATURES_PARQUET)

# Drop old graph_brain_* columns, if any
drop_cols = [c for c in df_graph.columns if c.startswith("graph_brain_")]
if drop_cols:
    print("\nУдаляем старые колонки Graph Brain:", drop_cols)
    df_graph = df_graph.drop(columns=drop_cols)

X_full_graph = df_graph[graph_brain_feature_cols].values
full_pool = Pool(X_full_graph, feature_names=graph_brain_feature_cols)

# oof_pred is assigned by position — assumes the re-read file has the same row
# order as df; TODO confirm. The proba column is scored on the rows the final
# model was trained on (in-sample).
df_graph["graph_brain_oof_v11"] = oof_pred
df_graph["graph_brain_proba_v11"] = final_model.predict_proba(full_pool)[:, 1]

# This overwrites the input feature store in place.
df_graph.to_parquet(FEATURES_PARQUET, index=False)
print(f"\n✅ Обновлённый feature store с Graph Brain v11 сохранён в {FEATURES_PARQUET}")

print("\nГотово: Graph Brain v11 (base+graph, без sess/node2vec) обучен и интегрирован.")
print(f"OOF ROC-AUC: {roc_oof:.4f}, OOF PR-AUC: {pr_oof:.4f}")

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
DEVICE: cpu (Graph Brain v11)
Пробуем прочитать Parquet: data/processed/features_offline_v11.parquet
✅ Успешно прочитан как Parquet.
Всего строк: 13113
Колонок: 199
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d', 'sess_burstine