In [None]:
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Paths. Everything is resolved relative to the current working directory.
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path(".")
DATA_ROOT = PROJECT_ROOT / "data"
PROC_PATH = DATA_ROOT / "processed"
MODELS_PATH = PROJECT_ROOT / "models"
CONFIG_PATH = PROJECT_ROOT / "config"

# Input: offline feature store (read here, and later rewritten by this script
# with the meta-model OOF column added).
FEATURES_OFFLINE_PATH = PROC_PATH / "features_offline_v11.parquet"
# Outputs written further down in this file: fitted model, feature list,
# strategy thresholds and coefficient-based feature importances.
META_MODEL_PATH = MODELS_PATH / "risk_meta_vprod.pkl"
META_FEATURES_PATH = CONFIG_PATH / "meta_features_vprod.json"
META_THRESHOLDS_PATH = CONFIG_PATH / "meta_thresholds_vprod.json"
META_IMPORTANCE_PATH = PROC_PATH / "meta_feature_importance_vprod.parquet"

# Make sure the output directories exist (PROC_PATH is not created here; the
# read below fails if it does not exist).
MODELS_PATH.mkdir(parents=True, exist_ok=True)
CONFIG_PATH.mkdir(parents=True, exist_ok=True)

# Informational banner only; nothing in this script selects a device.
print("DEVICE: cpu (Meta Brain vProd — 5 OOF голов)")

df = pd.read_parquet(FEATURES_OFFLINE_PATH)

print(f"Читаем feature store: {FEATURES_OFFLINE_PATH}")
print(f"Всего строк: {df.shape[0]}")
print(f"Колонок: {df.shape[1]}")
print("Первые колонки:", df.columns[:25].tolist(), "...")

# Normalise the label column to int in place.
# NOTE(review): the rest of the script treats it as binary 0/1 — confirm.
target_col = "target"
df[target_col] = df[target_col].astype(int)

print("\nРаспределение target:")
print(df[target_col].value_counts())
print("Доля фрода:", df[target_col].mean())

# Out-of-fold scores of the five base models ("brains"). These are mandatory
# inputs of the meta-model.
brain_oof_feats = [
    "risk_fast_oof_v11",     # Fast Gate (OOF)
    "risk_ae_oof_v11",       # AE-head (OOF)
    "graph_brain_oof_v11",   # Graph Brain (OOF)
    "risk_seq_oof_v11",      # Sequence Brain (OOF)
    "risk_sess_oof_v11",     # Session Brain (OOF)
]

# Fail fast if any mandatory OOF column is absent from the feature store.
missing_brain = [c for c in brain_oof_feats if c not in df.columns]
if missing_brain:
    raise ValueError(f"В feature store отсутствуют OOF-фичи мозгов: {missing_brain}")

print("\nBrain OOF фичи для Meta_vProd:", brain_oof_feats, "\n")

# Optional anomaly features: unlike the OOF scores above, columns missing from
# the feature store are silently dropped from the list.
anomaly_feats = [
    "ae_log_recon_error_v11",
    "seq_log_recon_error_v11",
    "seq_hist_len_v11",
]
anomaly_feats = [c for c in anomaly_feats if c in df.columns]
print("Anomaly-фичи (AE/Seq):", anomaly_feats, "\n")

# Optional numeric context features; same "keep only what exists" policy.
context_numeric_candidates = [
    "log_amount",
    "z_amount_30d",
    "cst_fraud_share",
    "dir_fraud_share",
    "degree_cst",
    "degree_dir",
    "user_tx_1m",
    "user_tx_10m",
    "user_tx_60m",
    "user_sum_60m",
    "user_tx_count_7d",
    "user_tx_count_30d",
    "user_tx_count_90d",
    "sess_logins_7d",
    "sess_logins_30d",
    "sess_logins_7d_30d_ratio",
    "sess_login_freq_7d",
    "sess_login_freq_30d",
    "hour",
    "dayofweek",
    "is_weekend",
]
context_numeric_feats = [c for c in context_numeric_candidates if c in df.columns]

print("Контекстные numeric-фичи:", context_numeric_feats, "\n")

# Final ordered feature list: OOF scores, then anomaly, then context features.
meta_features = brain_oof_feats + anomaly_feats + context_numeric_feats
meta_features = list(dict.fromkeys(meta_features))  # drop duplicates just in case (first occurrence wins, order kept)

print(f"Всего meta-фич для Meta_vProd: {len(meta_features)}")
print(meta_features, "\n")

# Design matrix (copied so later changes to df do not affect it) and labels.
X_meta = df[meta_features].copy()
y = df[target_col].values

# Report only: NaNs are not handled here; the model pipeline defined later in
# this file starts with a median SimpleImputer.
n_rows_with_nan = X_meta.isna().any(axis=1).sum()
print(
    f"Строк с хотя бы одним NaN в meta-фичах: {n_rows_with_nan} "
    "(будут заимпутированы медианой в пайплайне)."
)

print("\nТипы X_meta.dtypes:")
print(X_meta.dtypes.value_counts(), "\n")

def compute_threshold_table(y_true, y_score, num_thresholds: int = 300) -> pd.DataFrame:
    """Tabulate precision, recall and F1 over a quantile-based threshold grid.

    The candidate thresholds are the unique values of ``num_thresholds``
    evenly spaced quantiles (0.0 .. 1.0) of ``y_score``. A sample is flagged
    as positive when its score is greater than or equal to the threshold.

    Args:
        y_true: Array-like of labels; ``1`` is counted as positive and ``0``
            as negative.
        y_score: Array-like of scores aligned with ``y_true``.
        num_thresholds: Number of quantile points used to build the grid.

    Returns:
        DataFrame with columns ``threshold``, ``precision``, ``recall`` and
        ``f1``, sorted by ascending threshold with a fresh integer index.
        A metric whose denominator is zero is reported as ``0.0``.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_score)

    grid = np.unique(np.quantile(scores, np.linspace(0.0, 1.0, num_thresholds)))

    # Class masks do not depend on the threshold, so build them once.
    is_pos = labels == 1
    is_neg = labels == 0
    n_pos = is_pos.sum()

    def _metrics_at(cut):
        """Return (threshold, precision, recall, f1) for a single cut-off."""
        flagged = scores >= cut
        hits = (is_pos & flagged).sum()
        false_alarms = (is_neg & flagged).sum()

        n_flagged = hits + false_alarms
        prec = hits / n_flagged if n_flagged != 0 else 0.0
        rec = hits / n_pos if n_pos != 0 else 0.0

        denom = prec + rec
        f_score = 2 * prec * rec / denom if denom != 0 else 0.0
        return cut, prec, rec, f_score

    result = pd.DataFrame(
        [_metrics_at(cut) for cut in grid],
        columns=["threshold", "precision", "recall", "f1"],
    )
    return result.sort_values("threshold").reset_index(drop=True)


def choose_thresholds_by_table(
    table: pd.DataFrame,
    aggressive_recall_target: float = 0.90,
    friendly_precision_target: float = 0.95,
):
    """Pick three operating points from a threshold/metrics table.

    Args:
        table: DataFrame with at least the columns ``threshold``,
            ``precision``, ``recall`` and ``f1`` (one row per threshold).
        aggressive_recall_target: Minimum recall required of the
            "aggressive" operating point.
        friendly_precision_target: Minimum precision required of the
            "friendly" operating point.

    Returns:
        Tuple ``(row_aggr, row_bal, row_friend)`` of rows (``pd.Series``)
        taken from ``table``:

        * aggressive - best F1 among rows with recall >= the recall target
          (ties broken by higher recall); if no row reaches the target, the
          row with the highest recall.
        * balanced - the row with the highest F1 overall.
        * friendly - best F1 among rows with precision >= the precision
          target (ties broken by lower threshold); if no row reaches the
          target, the row with the highest precision.

    Raises:
        ValueError: If ``table`` has no rows.
    """
    if table.empty:
        raise ValueError("threshold table is empty; cannot choose thresholds")

    row_bal = table.loc[table["f1"].idxmax()]

    df_aggr = table[table["recall"] >= aggressive_recall_target]
    if df_aggr.empty:
        idx_aggr = table["recall"].idxmax()
    else:
        # Optimise F1 *subject to* the recall constraint. Sorting by recall
        # first would always select a recall == 1.0 row (the lowest thresholds
        # flag every positive), which makes the target meaningless.
        df_aggr2 = df_aggr.sort_values(["f1", "recall"], ascending=[False, False])
        idx_aggr = df_aggr2.index[0]
    row_aggr = table.loc[idx_aggr]

    df_friend = table[table["precision"] >= friendly_precision_target]
    if df_friend.empty:
        idx_fr = table["precision"].idxmax()
    else:
        df_friend2 = df_friend.sort_values(["f1", "threshold"], ascending=[False, True])
        idx_fr = df_friend2.index[0]
    row_friend = table.loc[idx_fr]

    return row_aggr, row_bal, row_friend

N_SPLITS = 5
RANDOM_STATE = 42


def _build_meta_pipeline() -> Pipeline:
    """Return a fresh, unfitted pipeline: median impute -> standardize -> logistic regression.

    Single source of truth for the meta-model definition, so the OOF folds and
    the final production model cannot drift apart.
    """
    return Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
            (
                "clf",
                LogisticRegression(
                    solver="lbfgs",
                    max_iter=2000,
                    class_weight="balanced",
                ),
            ),
        ]
    )


skf = StratifiedKFold(
    n_splits=N_SPLITS,
    shuffle=True,
    random_state=RANDOM_STATE,
)

pos = (y == 1).sum()
neg = (y == 0).sum()
pos_frac = pos / (pos + neg)

print(
    "\n================ OOF-обучение Meta Brain vProd (LogisticRegression) ================\n"
    f"Всего объектов: {len(y)}, pos={pos}, neg={neg}, pos_frac={pos_frac:.6f}"
)

# Unfitted template kept under its original module-level name.
pipeline = _build_meta_pipeline()

# Out-of-fold predictions: every row is scored by a model that never saw it.
oof_pred = np.zeros(len(y), dtype=float)
cv_metrics = []

for fold, (train_idx, valid_idx) in enumerate(skf.split(X_meta, y), start=1):
    X_trn, X_val = X_meta.iloc[train_idx], X_meta.iloc[valid_idx]
    y_trn, y_val = y[train_idx], y[valid_idx]

    # A new estimator per fold: no fitted state is shared between folds or
    # with the template above.
    pipeline_fold = _build_meta_pipeline()
    pipeline_fold.fit(X_trn, y_trn)

    proba_val = pipeline_fold.predict_proba(X_val)[:, 1]
    oof_pred[valid_idx] = proba_val

    fold_auc = roc_auc_score(y_val, proba_val)
    fold_prauc = average_precision_score(y_val, proba_val)
    cv_metrics.append((fold_auc, fold_prauc))

    print(
        f"  Fold {fold}/{N_SPLITS} ROC-AUC={fold_auc:.4f}, "
        f"PR-AUC={fold_prauc:.4f}"
    )

oof_auc = roc_auc_score(y, oof_pred)
oof_prauc = average_precision_score(y, oof_pred)
# Positive-class share, printed below as the PR-AUC of a random scorer.
baseline_prauc = y.mean()

print("\n=== CV по фолдам для Meta_vProd ===")
for i, (auc_f, pr_f) in enumerate(cv_metrics, start=1):
    print(f"  Fold {i}: ROC-AUC={auc_f:.4f}, PR-AUC={pr_f:.4f}")

print(f"OOF ROC-AUC: {oof_auc:.4f}")
print(f"OOF PR-AUC : {oof_prauc:.4f}")
print(f"OOF Baseline PR-AUC (random): {baseline_prauc:.6f}")

print("\nСчитаем таблицу метрик по порогам для Meta_vProd (по OOF)...")
thr_table = compute_threshold_table(y, oof_pred, num_thresholds=300)

print("\nПример метрик по порогам (первые 10 строк):")
print(thr_table.head(10))

row_aggr, row_bal, row_friend = choose_thresholds_by_table(
    thr_table,
    aggressive_recall_target=0.90,
    friendly_precision_target=0.95,
)

print("\n=== Пороги стратегий по OOF для Meta Brain vProd ===")
for strategy_label, strategy_row in (
    ("Aggressive", row_aggr),
    ("Balanced", row_bal),
    ("Friendly", row_friend),
):
    print(
        f"{strategy_label}: threshold={strategy_row['threshold']:.3f}, "
        f"precision={strategy_row['precision']:.3f}, recall={strategy_row['recall']:.3f}, "
        f"f1={strategy_row['f1']:.3f}"
    )

# Plain Python floats so the payload is JSON-serialisable.
thresholds_payload = {
    strategy_key: {
        metric: float(row[metric])
        for metric in ("threshold", "precision", "recall", "f1")
    }
    for strategy_key, row in (
        ("aggressive", row_aggr),
        ("balanced", row_bal),
        ("friendly", row_friend),
    )
}

with open(META_THRESHOLDS_PATH, "w", encoding="utf-8") as f:
    json.dump(thresholds_payload, f, ensure_ascii=False, indent=2)

print(f"\nПороги стратегий Meta Brain vProd сохранены в {META_THRESHOLDS_PATH}")

df["risk_meta_oof_vprod"] = oof_pred.astype("float32")
# The feature store is this script's own input: write to a sibling temp file
# and swap it in, so a failed write cannot leave a truncated store behind.
tmp_store_path = FEATURES_OFFLINE_PATH.with_name(FEATURES_OFFLINE_PATH.name + ".tmp")
df.to_parquet(tmp_store_path, index=False)
tmp_store_path.replace(FEATURES_OFFLINE_PATH)

print(
    f"\nОбновлённый feature store с Meta Brain vProd сохранён в {FEATURES_OFFLINE_PATH}\n"
    "Добавлена колонка: risk_meta_oof_vprod"
)

print("\n=== Обучение финальной Meta Brain vProd модели на всём датасете ===")

# Production model: same definition as the fold models, fitted on all rows.
final_pipeline = _build_meta_pipeline()

final_pipeline.fit(X_meta, y)
joblib.dump(final_pipeline, META_MODEL_PATH)

print(f"Meta-модель vProd сохранена в {META_MODEL_PATH}")

# Persist the exact feature order the model was fitted with.
with open(META_FEATURES_PATH, "w", encoding="utf-8") as f:
    json.dump(meta_features, f, ensure_ascii=False, indent=2)

print(f"Список фич meta-модели vProd сохранён в {META_FEATURES_PATH}")


# Importance = absolute logistic-regression coefficient. The coefficients
# apply to the standardized features produced by the "scaler" step.
clf = final_pipeline.named_steps["clf"]
coefs = clf.coef_[0]
importance = np.abs(coefs)

importance_df = pd.DataFrame(
    {
        "feature": meta_features,
        "importance": importance,
        "coef": coefs,
    }
).sort_values("importance", ascending=False)

importance_df.to_parquet(META_IMPORTANCE_PATH, index=False)
print(f"Важности фич Meta Brain vProd сохранены в {META_IMPORTANCE_PATH}")

print(
    "\nГотово: Meta Brain vProd (risk_meta_vprod) построен и интегрирован.\n"
    f" - OOF ROC-AUC: {oof_auc:.4f}\n"
    f" - OOF PR-AUC : {oof_prauc:.4f}\n"
    f" - Кол-во meta-фич: {len(meta_features)}\n"
    " - OOF колонка: risk_meta_oof_vprod\n"
    " - Продовый скор: risk_meta_score_vprod (через сохранённую модель)"
)


DEVICE: cpu (Meta Brain vProd — 5 OOF голов)
Читаем feature store: data/processed/features_offline_v11.parquet
Всего строк: 13113
Колонок: 210
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d', 'sess_burstiness_login_interval', 'sess_fano_login_interval', 'sess_z_login_interval_7d', 'sess_has_login_history', 'sess_last_phone_model'] ...

Распределение target:
target
0    12948
1      165
Name: count, dtype: int64
Доля фрода: 0.012582932967284374

Brain OOF фичи для Meta_vProd: ['risk_fast_oof_v11', 'risk_ae_oof_v11', 'graph_brain_oof_v11', 'risk_seq_oof_v11', 'risk_sess_oof_v11'] 

Anomaly-фичи (AE/Seq):