In [None]:
!pip install catboost


import os
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold

# Silence every warning for the rest of the session to keep the notebook output short.
# NOTE(review): this also hides deprecation/numerical warnings from pandas, sklearn
# and catboost — confirm that is intended.
warnings.filterwarnings("ignore")

# Base seed; reused for the CV splitter and the CatBoost models.
SEED = 42
# Seeded legacy NumPy generator.
# NOTE(review): RNG is not referenced anywhere else in the visible code.
RNG = np.random.RandomState(SEED)

# Locations are relative to the current working directory.
DATA_DIR = Path("data/processed")
CONFIG_DIR = Path("config")
MODELS_DIR = Path("models")

# Only the output directories are created; DATA_DIR is not created here and the
# feature store is later read from it.
CONFIG_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

FEATURE_STORE_PATH = DATA_DIR / "features_offline_v11.parquet"

# Informational banner only: the string is hard-coded, no device detection happens here.
print("DEVICE: cpu (Session Brain v11)")


# Load the whole offline feature store into memory.
print(f"Читаем feature store: {FEATURE_STORE_PATH}")
df = pd.read_parquet(FEATURE_STORE_PATH)

# Quick shape overview; only the first 25 column names are printed.
print(f"Всего строк: {len(df)}")
print(f"Колонок: {df.shape[1]}")
print("Первые колонки:", list(df.columns[:25]), "...")

# Fail fast when the label column is missing, before any feature selection or training.
target_col = "target"
if target_col not in df.columns:
    raise ValueError(f"Колонка target не найдена в feature store ({target_col})")

# Class balance: the mean of the target is reported as the fraud share.
# presumably the target is a 0/1 indicator — verify against the feature-store producer.
print("\nРаспределение target:")
print(df[target_col].value_counts())
fraud_share = df[target_col].mean()
print("Доля фрода:", fraud_share)


# NOTE(review): all_cols is not used anywhere else in the visible code.
all_cols = df.columns.tolist()

def existing(cols):
    """Return the names from ``cols`` that are columns of the module-level ``df``.

    The relative order of ``cols`` is preserved; names absent from
    ``df.columns`` are silently dropped.
    """
    present = []
    for name in cols:
        if name in df.columns:
            present.append(name)
    return present

# Candidate feature names, organised in groups. Every group is passed through
# existing(), so names missing from the feature store are dropped silently
# instead of raising.

# Numeric session/login profile features (all prefixed with "sess_").
sess_profile_cols = [
    "sess_monthly_os_changes",
    "sess_monthly_phone_model_changes",
    "sess_logins_7d",
    "sess_logins_30d",
    "sess_login_freq_7d",
    "sess_login_freq_30d",
    "sess_freq_change_7d_vs_mean",
    "sess_logins_7d_30d_ratio",
    "sess_avg_login_interval_30d",
    "sess_std_login_interval_30d",
    "sess_var_login_interval_30d",
    "sess_ewm_login_interval_7d",
    "sess_burstiness_login_interval",
    "sess_fano_login_interval",
    "sess_z_login_interval_7d",
    "sess_has_login_history",
]
# Columns that are later handed to CatBoost as categorical features.
sess_cat_cols = [
    "sess_last_phone_model",
    "sess_last_os",
]

sess_profile_cols = existing(sess_profile_cols)
sess_cat_cols = existing(sess_cat_cols)

# Calendar/time-of-day features.
time_cols = existing([
    "hour",
    "dayofweek",
    "is_weekend",
    "hour_sin",
    "hour_cos",
    "dow_sin",
    "dow_cos",
])

# Per-user transaction counters over several windows (1m .. 90d by name).
short_window_cols = existing([
    "user_tx_1m",
    "user_tx_10m",
    "user_tx_60m",
    "user_sum_60m",
    "user_tx_count_7d",
    "user_tx_count_30d",
    "user_tx_count_90d",
])

# Local graph features.
# NOTE(review): "cst_fraud_share" / "dir_fraud_share" look target-derived — confirm
# they are computed without access to the label of the row being scored.
graph_local_cols = existing([
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "dir_tx_60m",
    "dir_unique_senders_60m",
    "user_new_dirs_60m",
    "many_to_one_flag",
    "one_to_many_flag",
])

# Anomaly scores; presumably produced by upstream autoencoder/sequence models
# (inferred from the "ae_"/"seq_" names only — verify).
anomaly_cols = existing([
    "ae_log_recon_error_v11",
    "ae_z_recon_error_v11",
    "ae_percentile_v11",
    "seq_log_recon_error_v11",
    "seq_z_recon_error_v11",
    "seq_hist_len_v11",
])

# Concatenate the groups; dict.fromkeys removes duplicates while keeping first-seen order.
session_feature_cols = list(dict.fromkeys(
    sess_profile_cols
    + sess_cat_cols
    + time_cols
    + short_window_cols
    + graph_local_cols
    + anomaly_cols
))

# Report how many columns of each group survived the existence filter.
print("\nРазмеры групп фич для Session Brain:")
print("  sess_profile_cols:", len(sess_profile_cols))
print("  sess_cat_cols    :", len(sess_cat_cols))
print("  time_cols        :", len(time_cols))
print("  short_window_cols:", len(short_window_cols))
print("  graph_local_cols :", len(graph_local_cols))
print("  anomaly_cols     :", len(anomaly_cols))

print("\nФинальные фичи Session Brain v11:")
print(f"Всего session-фич: {len(session_feature_cols)}")
print(session_feature_cols)

# Persist the ordered feature list as a JSON array of column names.
session_features_config_path = CONFIG_DIR / "session_features_v11.json"
with open(session_features_config_path, "w", encoding="utf-8") as f:
    json.dump(session_feature_cols, f, ensure_ascii=False, indent=2)
print(f"\nСхема фич Session Brain сохранена в {session_features_config_path}")


# Design matrix (DataFrame, column order = session_feature_cols) and integer label vector.
X = df[session_feature_cols]
y = df[target_col].astype(int).values

print("\nРазмер X_session:", X.shape)
# np.bincount requires non-negative integer labels.
print("Классы y:", np.bincount(y))

# Positional indices of the categorical columns inside X, as passed to catboost.Pool.
# The membership filter is a no-op: sess_cat_cols is one of the lists that
# session_feature_cols is built from.
# NOTE(review): CatBoost expects categorical values to be integers or strings;
# confirm these columns contain no NaN.
cat_feature_names = [c for c in sess_cat_cols if c in session_feature_cols]
cat_features_indices = [session_feature_cols.index(c) for c in cat_feature_names]

print("\nКатегориальные фичи Session Brain:", cat_feature_names)
print("Индексы категориальных фичей:", cat_features_indices)


def build_threshold_table(y_true, scores, n_points=500):
    """Build a precision/recall/F1 table over score-quantile thresholds.

    Candidate thresholds are the unique values of ``n_points`` evenly spaced
    quantiles (levels 0.0 .. 1.0) of ``scores``. For each threshold an object
    is predicted positive when ``score >= threshold``.

    Args:
        y_true: Array-like of labels, cast to ``int``. Only the value 1 counts
            as positive and only the value 0 as negative.
        scores: Array-like of model scores, cast to ``float``. NaN and
            +/-inf are replaced by 0.0 before thresholds are computed.
        n_points: Number of quantile levels to evaluate.

    Returns:
        ``pandas.DataFrame`` with columns ``threshold``, ``precision``,
        ``recall`` and ``f1``; one row per unique threshold, in ascending
        threshold order. Metrics whose denominator is zero are reported as
        0.0. The table is empty when ``scores`` is empty.

    Raises:
        ValueError: If ``y_true`` and ``scores`` have different shapes.
    """
    columns = ["threshold", "precision", "recall", "f1"]

    y_true = np.asarray(y_true).astype(int)
    scores = np.asarray(scores).astype(float)

    if y_true.shape != scores.shape:
        raise ValueError(
            f"y_true and scores must have the same shape, "
            f"got {y_true.shape} and {scores.shape}"
        )
    if scores.size == 0:
        # np.quantile is undefined on empty input; return an empty, well-formed table.
        return pd.DataFrame(columns=columns)

    # Non-finite scores are treated as the neutral score 0.0.
    scores = np.nan_to_num(scores, nan=0.0, posinf=0.0, neginf=0.0)

    qs = np.linspace(0.0, 1.0, n_points)
    # np.unique also sorts, so thresholds are ascending.
    thresholds = np.unique(np.quantile(scores, qs))

    # Label masks do not depend on the threshold: compute them once.
    is_pos = y_true == 1
    is_neg = y_true == 0
    pos = int(is_pos.sum())

    rows = []
    for thr in thresholds:
        flagged = scores >= thr
        tp = int((is_pos & flagged).sum())
        fp = int((is_neg & flagged).sum())
        fn = pos - tp  # positives that were not flagged

        prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        rows.append((thr, prec, rec, f1))

    return pd.DataFrame(rows, columns=columns)


def choose_strategic_thresholds(
    thr_table: pd.DataFrame,
    min_recall: float = 0.9,
    min_precision: float = 0.98,
):
    """Pick three operating points from a threshold/metric table.

    Strategies:
      - ``Aggressive``: among rows with ``recall >= min_recall``, the row with
        the highest F1. Falls back to the lowest threshold when no row
        qualifies.
      - ``Balanced``: the row with the global maximum F1.
      - ``Friendly``: among rows with ``precision >= min_precision``, the row
        with the highest threshold. Falls back to the highest threshold in
        the table when no row qualifies.

    Args:
        thr_table: DataFrame with numeric columns ``threshold``,
            ``precision``, ``recall`` and ``f1``. Row order does not matter;
            the table is sorted by threshold internally.
        min_recall: Recall floor for the Aggressive strategy.
        min_precision: Precision floor for the Friendly strategy.

    Returns:
        Dict mapping ``"Aggressive"``, ``"Balanced"`` and ``"Friendly"`` to a
        dict of plain floats with keys ``threshold``, ``precision``,
        ``recall`` and ``f1`` (JSON-serialisable).

    Raises:
        ValueError: If ``thr_table`` has no rows.
    """
    if thr_table.empty:
        raise ValueError("thr_table is empty: cannot choose thresholds")

    # Sort ascending so that position 0 / -1 are the lowest / highest thresholds.
    t = thr_table.sort_values("threshold").reset_index(drop=True)

    row_bal = t.loc[t["f1"].idxmax()]

    cand_agg = t[t["recall"] >= min_recall]
    if not cand_agg.empty:
        row_agg = t.loc[cand_agg["f1"].idxmax()]
    else:
        row_agg = t.iloc[0]

    cand_friendly = t[t["precision"] >= min_precision]
    if not cand_friendly.empty:
        row_fr = t.loc[cand_friendly["threshold"].idxmax()]
    else:
        row_fr = t.iloc[-1]

    def row_to_dict(row):
        # Cast NumPy scalars to built-in floats so the result can be dumped to JSON.
        return {
            "threshold": float(row["threshold"]),
            "precision": float(row["precision"]),
            "recall": float(row["recall"]),
            "f1": float(row["f1"]),
        }

    return {
        "Aggressive": row_to_dict(row_agg),
        "Balanced": row_to_dict(row_bal),
        "Friendly": row_to_dict(row_fr),
    }


print("\n================ OOF-обучение Session Brain v11 (CatBoost) ================")
n_samples = len(df)
pos_total = int((y == 1).sum())
neg_total = int((y == 0).sum())
# Negative/positive ratio on the full data set; max(..., 1) avoids division by zero
# when there are no positives.
scale_pos_weight_full = neg_total / max(pos_total, 1)
print(f"Всего объектов: {n_samples}, pos={pos_total}, neg={neg_total}, "
      f"scale_pos_weight≈{scale_pos_weight_full:.2f}")

# Shuffled, stratified K-fold with a fixed seed.
# NOTE(review): rows are split at random, not by time or by customer — confirm this
# is acceptable for the windowed/aggregated features used here.
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

# Accumulators filled by the fold loop: out-of-fold scores (one slot per row),
# per-fold (ROC-AUC, PR-AUC) pairs and per-fold iteration counts.
oof_pred = np.zeros(n_samples, dtype=float)
fold_metrics = []
best_iterations = []

# Out-of-fold training: fit one CatBoost model per fold, score the held-out part
# and remember how many trees each fold ended up with.
for fold, (trn_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    # X is a DataFrame (positional .iloc), y is a NumPy array.
    X_trn = X.iloc[trn_idx]
    y_trn = y[trn_idx]
    X_val = X.iloc[val_idx]
    y_val = y[val_idx]

    # Class weight is recomputed on the training part of the fold only;
    # max(..., 1) avoids division by zero when a fold has no positives.
    pos_trn = int((y_trn == 1).sum())
    neg_trn = int((y_trn == 0).sum())
    scale_pos_weight = neg_trn / max(pos_trn, 1)

    print(f"\n=== Fold {fold}/{N_FOLDS} (Session Brain v11) ===")
    print(f"  fold pos={pos_trn}, neg={neg_trn}, scale_pos_weight={scale_pos_weight:.2f}")

    train_pool = Pool(X_trn, y_trn, cat_features=cat_features_indices)
    valid_pool = Pool(X_val, y_val, cat_features=cat_features_indices)

    # NOTE(review): the validation fold drives both early stopping / best-model
    # selection and the OOF score, so the OOF metrics are mildly optimistic.
    params = dict(
        loss_function="Logloss",
        eval_metric="AUC",
        learning_rate=0.05,
        depth=6,
        l2_leaf_reg=3.0,
        random_strength=1.5,
        border_count=128,
        scale_pos_weight=scale_pos_weight,
        iterations=1200,
        random_seed=SEED + fold,  # a different seed per fold
        bagging_temperature=0.5,
        od_type="Iter",
        od_wait=200,
        use_best_model=True,
        task_type="CPU",
        verbose=100,
    )

    model_fold = CatBoostClassifier(**params)
    model_fold.fit(train_pool, eval_set=valid_pool, verbose=100)

    # get_best_iteration() is a zero-based iteration index (and None, never -1,
    # when there is no validation set), so using it as a tree count is off by one.
    # With use_best_model=True the fitted model is already truncated at the best
    # iteration, hence tree_count_ is the number of trees to reproduce.
    best_iter = model_fold.tree_count_
    best_iterations.append(best_iter)

    # Probability of the positive class for the held-out rows.
    proba_val = model_fold.predict_proba(valid_pool)[:, 1]
    oof_pred[val_idx] = proba_val

    roc = roc_auc_score(y_val, proba_val)
    pr = average_precision_score(y_val, proba_val)

    print(f"  Fold ROC-AUC={roc:.4f}, PR-AUC={pr:.4f}")
    print(f"  Best iterations={best_iter}")

    fold_metrics.append((roc, pr))

# Metrics over the concatenated out-of-fold predictions (every row scored exactly once).
oof_roc = roc_auc_score(y, oof_pred)
oof_pr = average_precision_score(y, oof_pred)
# The positive-class share is reported as the PR-AUC of a random scorer.
baseline_pr = y.mean()

print("\n=== CV по фолдам для Session Brain v11 ===")
for i, (roc, pr) in enumerate(fold_metrics, start=1):
    print(f"  Fold {i}: ROC-AUC={roc:.4f}, PR-AUC={pr:.4f}")
print(f"OOF ROC-AUC: {oof_roc:.4f}")
print(f"OOF PR-AUC : {oof_pr:.4f}")
print(f"OOF Baseline PR-AUC (random): {baseline_pr:.6f}")


# Threshold/metric table computed on the out-of-fold scores.
thr_table = build_threshold_table(y, oof_pred, n_points=500)
print("\nПример метрик по порогам (первые 10 строк):")
print(thr_table.head(10))

# Three named operating points (Aggressive / Balanced / Friendly).
strategy_thresholds = choose_strategic_thresholds(thr_table)
print("\n=== Пороги стратегий по OOF для Session Brain v11 ===")
for name, info in strategy_thresholds.items():
    print(f"{name}: threshold={info['threshold']:.3f}, "
          f"precision={info['precision']:.3f}, "
          f"recall={info['recall']:.3f}, f1={info['f1']:.3f}")


# Iteration budget for the final model: mean of the per-fold values, rounded up.
# The 400 fallback only applies when no fold was recorded; note that the print
# below would then show nan, because np.mean of an empty list is nan.
avg_best_iter = int(np.ceil(np.mean(best_iterations))) if best_iterations else 400
print(f"\nСреднее число деревьев по фолдам: {np.mean(best_iterations):.1f}")
print(f"Финальное число итераций для полного обучения: {avg_best_iter}")

# Pool over the complete data set; no rows are held out for the final model.
train_pool_full = Pool(X, y, cat_features=cat_features_indices)

# Same hyper-parameters as in the CV folds, except: no overfitting detector /
# use_best_model (there is no eval set), a fixed iteration count taken from CV,
# the full-data class weight and the base seed.
final_params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3.0,
    random_strength=1.5,
    border_count=128,
    scale_pos_weight=scale_pos_weight_full,
    iterations=avg_best_iter,
    random_seed=SEED,
    bagging_temperature=0.5,
    task_type="CPU",
    verbose=100,
)

print("\n=== Обучение финальной Session Brain v11 модели на всём датасете ===")
session_model = CatBoostClassifier(**final_params)
session_model.fit(train_pool_full, verbose=100)


# Save the final model under MODELS_DIR (the .cbm extension suggests CatBoost's
# own binary format, the save_model default when no format is given — TODO confirm).
session_model_path = MODELS_DIR / "session_brain_v11.cbm"
session_model.save_model(session_model_path)
print(f"Session Brain модель сохранена в {session_model_path}")

# Save the three strategy thresholds (derived from OOF scores) as JSON.
session_thresholds_path = CONFIG_DIR / "session_thresholds_v11.json"
with open(session_thresholds_path, "w", encoding="utf-8") as f:
    json.dump(strategy_thresholds, f, ensure_ascii=False, indent=2)
print(f"Пороги Session Brain v11 сохранены в {session_thresholds_path}")

# Feature importances of the final model, sorted from most to least important.
importances = session_model.get_feature_importance(train_pool_full, type="FeatureImportance")
fi_df = pd.DataFrame({"feature": session_feature_cols, "importance": importances})
fi_df = fi_df.sort_values("importance", ascending=False).reset_index(drop=True)

fi_path = DATA_DIR / "session_feature_importance_v11.parquet"
fi_df.to_parquet(fi_path, index=False)
print(f"\nВажности фич Session Brain сохранены в {fi_path}")

print("\nТОП-20 фич Session Brain v11:")
print(fi_df.head(20))


# Out-of-fold score: each row was scored by a model that did not train on it.
df["risk_sess_oof_v11"] = oof_pred

# Score from the final model on the very rows it was trained on (in-sample).
# NOTE(review): any downstream model fitted on this column sees leaked labels;
# prefer risk_sess_oof_v11 for stacking — confirm with the consumers of the store.
proba_full = session_model.predict_proba(train_pool_full)[:, 1]
df["risk_sess_v11"] = proba_full

# Overwrites the input feature store in place with the two new columns added.
# NOTE(review): the write is not atomic; an interrupted run can corrupt the only copy.
df.to_parquet(FEATURE_STORE_PATH, index=False)
print(f"\nОбновлённый feature store с Session Brain v11 сохранён в {FEATURE_STORE_PATH}")

print("\nГотово: Session Brain v11 (session-focused CatBoost) построен и интегрирован.")
print(f" - OOF ROC-AUC: {oof_roc:.4f}")
print(f" - OOF PR-AUC : {oof_pr:.4f}")
print(f" - Кол-во session-фич: {len(session_feature_cols)}")
print(" - OOF колонка: risk_sess_oof_v11")
print(" - Продовый скор: risk_sess_v11")


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
DEVICE: cpu (Session Brain v11)
Читаем feature store: data/processed/features_offline_v11.parquet
Всего строк: 13113
Колонок: 208
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d', 'sess_burstiness_login_interval', 'sess_fano_login