In [None]:
import os
import json
from pathlib import Path
from copy import deepcopy

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    precision_recall_fscore_support
)
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import joblib


# ---------------------------------------------------------------------------
# Reproducibility, device selection, project paths and feature-store loading.
# ---------------------------------------------------------------------------

# One seed for NumPy and for PyTorch (CPU generator and every CUDA device).
RANDOM_STATE = 42
for _seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(RANDOM_STATE)

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"DEVICE: {DEVICE}")

# Directory layout, all relative to the current working directory.
ROOT = Path(".")
DATA_ROOT = ROOT.joinpath("data")
PROC_PATH = DATA_ROOT.joinpath("processed")
MODELS_PATH = ROOT.joinpath("models")
CONFIG_PATH = ROOT.joinpath("config")

# Create any missing directory so later writes cannot fail on a missing parent.
for p in (DATA_ROOT, PROC_PATH, MODELS_PATH, CONFIG_PATH):
    p.mkdir(parents=True, exist_ok=True)

FEATURES_PARQUET = PROC_PATH.joinpath("features_offline_v11.parquet")

# Load the feature store and print a short summary of its contents.
print(f"Пробуем прочитать Parquet: {FEATURES_PARQUET}")
df = pd.read_parquet(FEATURES_PARQUET)
print("Успешно прочитан как Parquet.")

n_rows, n_cols = df.shape
print("Всего строк:", n_rows)
print("Колонок:", n_cols)
print("Первые колонки:", df.columns[:25].tolist(), "...")

# Class balance: the mean of `target` is reused later as the random-baseline PR-AUC.
print("\nРаспределение target по всему датасету:")
target_counts = df["target"].value_counts()
print(target_counts)
fraud_share = df["target"].mean()
print("Доля фрода:", fraud_share)


# ---------------------------------------------------------------------------
# Feature groups and the final list of autoencoder inputs.
# ---------------------------------------------------------------------------
all_cols = df.columns.tolist()

# Identifier / bookkeeping columns that are never used as model inputs.
id_cols = [
    "transdatetime",
    "transdate",
    "cst_dim_id",
    "docno",
    "direction",
    "row_id"
]

# Column groups recognised by their name prefix.
sess_cols = [c for c in all_cols if c.startswith("sess_")]
emb_cst_cols = [c for c in all_cols if c.startswith("emb_cst_")]
emb_dir_cols = [c for c in all_cols if c.startswith("emb_dir_")]

graph_cols = [
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "one_to_many_flag",
    "many_to_one_flag",
]

# The graph features are hard-coded, so fail early with a readable message
# (same style as the ae_meta_features check) instead of a bare KeyError later.
missing_graph = [c for c in graph_cols if c not in df.columns]
if missing_graph:
    raise ValueError(f"Отсутствуют фичи для graph_cols: {missing_graph}")

# Everything that is not an id, a session/embedding/graph feature or the
# target itself is treated as a "base" feature.
excluded_from_base = (
    set(id_cols)
    | set(sess_cols)
    | set(emb_cst_cols)
    | set(emb_dir_cols)
    | set(graph_cols)
    | {"target"}
)

base_cols = [
    c for c in all_cols
    if c not in excluded_from_base
]

print("\nРазмеры групп фич:")
print("  base_cols:", len(base_cols))
print("  graph_cols:", len(graph_cols))
print("  sess_cols:", len(sess_cols))
print("  emb_cst_cols:", len(emb_cst_cols))
print("  emb_dir_cols:", len(emb_dir_cols))


# Session columns stored as Python objects (e.g. strings) cannot feed the AE.
sess_numeric_cols = [
    c for c in sess_cols
    if df[c].dtype != "object"
]

# Node2vec embeddings (emb_cst_*, emb_dir_*) are deliberately left out.
ae_feature_cols = base_cols + graph_cols + sess_numeric_cols

# Columns written by this script (or earlier versions of it). They must not be
# fed back into the autoencoder when the script is re-run on its own output.
ae_derived_cols = {
    "ae_recon_error_v11",
    "ae_log_recon_error_v11",
    "ae_z_recon_error_v11",
    "ae_percentile_v11",
    "risk_ae_oof_v11",
    "risk_ae_v11",
    "anomaly_score_emb",
    "ae_percentile",
    "risk_ae"
}
ae_feature_cols = [c for c in ae_feature_cols if c not in ae_derived_cols]


def _is_numeric_feature(dtype) -> bool:
    """Return True for integer/float/complex dtypes, False for everything else.

    Unlike ``np.issubdtype(dtype, np.number)`` this does not raise on pandas
    extension dtypes (category, string, nullable integers, ...), which a
    parquet load may produce, and it does not treat timedelta64 as a number.
    Boolean columns are rejected, matching the NumPy-based check for bool.
    """
    return bool(
        pd.api.types.is_numeric_dtype(dtype)
        and not pd.api.types.is_bool_dtype(dtype)
    )


ae_feature_cols = [
    c for c in ae_feature_cols
    if _is_numeric_feature(df[c].dtype)
]

# Sorted so the persisted schema has a stable, reproducible column order.
ae_feature_cols = sorted(ae_feature_cols)

print("\nРазмерность варианта F_full_without_node2vec (без node2vec):")
print("  base + graph + session =", len(base_cols), "+", len(graph_cols), "+", len(sess_numeric_cols),
      "=", len(base_cols) + len(graph_cols) + len(sess_numeric_cols))

print("\nФинальные фичи для Autoencoder (только числовые):", len(ae_feature_cols))
print(ae_feature_cols)

# Persist the AE feature schema (ordered list of column names) as JSON.
ae_features_config_path = CONFIG_PATH / "autoencoder_features_v11.json"
with ae_features_config_path.open("w", encoding="utf-8") as f:
    json.dump(ae_feature_cols, f, ensure_ascii=False, indent=2)
print(f"\nСхема фич Autoencoder сохранена в {ae_features_config_path}")


# ---------------------------------------------------------------------------
# Build the autoencoder input matrix, split the normal rows and fit the scaler.
# ---------------------------------------------------------------------------
df_ae = df[ae_feature_cols].copy()

# Infinite values are treated as missing.
df_ae = df_ae.replace([np.inf, -np.inf], np.nan)

# Median imputation. The median of a column that is entirely NaN is itself
# NaN, which would leave NaNs in the matrix and make the AE loss NaN; such
# columns are filled with 0.0 instead.
medians = df_ae.median(axis=0).fillna(0.0)
df_ae = df_ae.fillna(medians)

X_all = df_ae.values.astype(np.float32)
y_all = df["target"].values.astype(int)

# The autoencoder is trained on rows with target == 0 only.
mask_clean = (y_all == 0)
X_clean = X_all[mask_clean]

print("\nРазмер X_all:", X_all.shape)
print("Размер X_clean (target=0):", X_clean.shape)

# 90/10 split of the normal rows; the 10% part drives early stopping.
X_train, X_val = train_test_split(
    X_clean,
    test_size=0.1,
    random_state=RANDOM_STATE,
    shuffle=True,
)

print("Train normal shape:", X_train.shape)
print("Val normal shape:", X_val.shape)

# The scaler is fitted on the training part only and then applied everywhere.
scaler_ae = StandardScaler()
scaler_ae.fit(X_train)

X_train_scaled = scaler_ae.transform(X_train)
X_val_scaled = scaler_ae.transform(X_val)
X_all_scaled = scaler_ae.transform(X_all)

scaler_path = MODELS_PATH / "tx_scaler_v11.pkl"
joblib.dump(scaler_ae, scaler_path)
print(f"\nСкейлер AE сохранён в {scaler_path}")


class TabDataset(Dataset):
    """Map-style dataset over the rows of a 2-D NumPy feature matrix.

    The matrix is converted to a float32 tensor once, at construction time;
    indexing returns a single row of that tensor.
    """

    def __init__(self, X: np.ndarray):
        features = torch.from_numpy(X)
        self.X = features.to(torch.float32)

    def __len__(self):
        # Number of rows (first dimension of the stored tensor).
        return self.X.size(0)

    def __getitem__(self, idx):
        row = self.X[idx]
        return row

# ---------------------------------------------------------------------------
# Mini-batch loaders for autoencoder training and validation.
# ---------------------------------------------------------------------------
batch_size = 256

train_ds = TabDataset(X_train_scaled)
val_ds = TabDataset(X_val_scaled)

# BatchNorm1d cannot compute batch statistics from a single row in training
# mode and raises. If the training set size leaves exactly one row for the
# final batch, drop that batch; in every other case all rows are kept.
drop_singleton_batch = len(train_ds) > batch_size and len(train_ds) % batch_size == 1

train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_singleton_batch,
)
# Validation runs in eval mode (running statistics), so any batch size is fine.
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, drop_last=False)


input_dim = X_train_scaled.shape[1]
print("\nInput dim AE:", input_dim)

class AutoEncoder(nn.Module):
    """Fully connected autoencoder: input_dim -> 64 -> 32 -> 16 -> 32 -> 64 -> input_dim.

    The encoder uses BatchNorm, ReLU and Dropout(0.1) after its first two
    linear layers and ReLU after the bottleneck layer; the decoder uses
    BatchNorm and ReLU after its first two linear layers and ends with a plain
    linear projection back to ``input_dim``.
    """

    def __init__(self, input_dim: int):
        super().__init__()

        # The position of each layer inside the Sequential defines its
        # state_dict key (encoder.0.weight, ...), so the order must stay fixed.
        encoder_layers = [
            nn.Linear(input_dim, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(32, 16),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(16, 32),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.Linear(32, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, input_dim),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` to the 16-dimensional bottleneck and reconstruct it."""
        return self.decoder(self.encoder(x))

# ---------------------------------------------------------------------------
# Autoencoder training with early stopping on the validation loss.
# ---------------------------------------------------------------------------
model_ae = AutoEncoder(input_dim=input_dim).to(DEVICE)
criterion = nn.MSELoss(reduction="mean")
optimizer = torch.optim.Adam(model_ae.parameters(), lr=1e-3, weight_decay=1e-5)

max_epochs = 300
patience = 25  # epochs without validation improvement before stopping

best_val_loss = float("inf")
best_state_dict = None
best_epoch = -1
no_improve_epochs = 0

print("\n================= Обучение Autoencoder =================")

for epoch in range(1, max_epochs + 1):
    # ---- training pass ----
    model_ae.train()
    train_loss_sum = 0.0
    train_samples = 0

    for xb in train_loader:
        xb = xb.to(DEVICE)
        optimizer.zero_grad()
        recon = model_ae(xb)
        loss = criterion(recon, xb)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch loss is a true per-sample mean even when the last batch is short.
        n_batch = xb.size(0)
        train_loss_sum += loss.item() * n_batch
        train_samples += n_batch

    train_loss = train_loss_sum / max(1, train_samples)

    # ---- validation pass (no gradients, BatchNorm/Dropout in eval mode) ----
    model_ae.eval()
    val_loss_sum = 0.0
    val_samples = 0
    with torch.no_grad():
        for xb in val_loader:
            xb = xb.to(DEVICE)
            recon = model_ae(xb)
            loss = criterion(recon, xb)
            n_batch = xb.size(0)
            val_loss_sum += loss.item() * n_batch
            val_samples += n_batch

    val_loss = val_loss_sum / max(1, val_samples)

    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch:03d} | train_loss={train_loss:.6f} | val_loss={val_loss:.6f}")

    # Keep a copy of the best weights; an improvement must exceed 1e-6.
    if val_loss < best_val_loss - 1e-6:
        best_val_loss = val_loss
        best_state_dict = deepcopy(model_ae.state_dict())
        best_epoch = epoch
        no_improve_epochs = 0
    else:
        no_improve_epochs += 1
        if no_improve_epochs >= patience:
            print(f"Early stopping на эпохе {epoch}, best_epoch={best_epoch}, best_val_loss={best_val_loss:.6f}")
            break

# Restore the best checkpoint. best_state_dict stays None only if the
# validation loss never improved on +inf (e.g. it was NaN in every epoch).
if best_state_dict is not None:
    model_ae.load_state_dict(best_state_dict)
    print(f"Загружено лучшее состояние модели (epoch={best_epoch})")
else:
    print("best_state_dict is None — проверь лоссы (но после NaN-фикса этого быть не должно).")

# ---------------------------------------------------------------------------
# Score every row with the trained autoencoder and derive anomaly features.
# ---------------------------------------------------------------------------
model_ae.eval()
with torch.no_grad():
    X_all_t = torch.from_numpy(X_all_scaled).to(torch.float32).to(DEVICE)
    recon_all_t = model_ae(X_all_t)
    # Squared reconstruction error per cell, then the mean over features per row.
    sq_err_t = torch.sub(X_all_t, recon_all_t).pow(2)
    mse_all_t = sq_err_t.mean(dim=1)

    mse_all = mse_all_t.cpu().numpy()
    sq_err = sq_err_t.cpu().numpy()

# Raw and log-transformed reconstruction error.
df["ae_recon_error_v11"] = mse_all
df["ae_log_recon_error_v11"] = np.log1p(mse_all)

# Z-score of the log error, standardised with statistics of the normal rows only.
is_normal_row = df["target"] == 0
log_err_clean = df.loc[is_normal_row, "ae_log_recon_error_v11"].values
mu_log = np.nanmean(log_err_clean)
sigma_log = np.nanstd(log_err_clean, ddof=1)
if not (np.isfinite(sigma_log) and sigma_log != 0):
    # Degenerate spread: fall back to unit scale.
    sigma_log = 1.0

centered_log_err = df["ae_log_recon_error_v11"] - mu_log
df["ae_z_recon_error_v11"] = centered_log_err / (sigma_log + 1e-6)

# Percentile rank of the log error over all rows.
df["ae_percentile_v11"] = df["ae_log_recon_error_v11"].rank(pct=True)

print("\nAE-анализ (reconstruction error) по всему датасету:")
print("  mu_log (normal):", mu_log)
print("  sigma_log (normal):", sigma_log)

# Mean squared error per input feature, largest first.
feature_mse = sq_err.mean(axis=0)  # (n_features,)
ae_importance_df = pd.DataFrame(
    {"feature": ae_feature_cols, "mse": feature_mse}
).sort_values("mse", ascending=False)

ae_importance_path = PROC_PATH / "autoencoder_feature_importance_v11.parquet"
ae_importance_df.to_parquet(ae_importance_path, index=False)
print(f"\nPer-feature AE MSE сохранён в {ae_importance_path}")
print(ae_importance_df.head(20))


# Ranking quality of the log reconstruction error as a stand-alone fraud score.
y_true = df["target"].values.astype(int)
score_ae = df["ae_log_recon_error_v11"].values

roc_ae = roc_auc_score(y_true, score_ae)
pr_ae = average_precision_score(y_true, score_ae)
print("\n=== Метрики AE (по лог-ошибке) ===")
print("ROC-AUC (AE):", roc_ae)
print("PR-AUC  (AE):", pr_ae)
print("Baseline PR-AUC (random):", fraud_share)

# Upper quantiles of the log error among normal rows (stored later as thresholds).
normal_log_err = df.loc[df["target"] == 0, "ae_log_recon_error_v11"]
q99 = np.quantile(normal_log_err, 0.99)
q999 = np.quantile(normal_log_err, 0.999)

print("\nКвантили лог-ошибки по нормальным:")
print("  99% quantile:", q99)
print("  99.9% quantile:", q999)


# Persist the trained weights (state_dict only).
ae_model_path = MODELS_PATH / "tx_autoencoder_v11.pt"
torch.save(model_ae.state_dict(), ae_model_path)
print(f"\nAE-модель сохранена в {ae_model_path}")


# ---------------------------------------------------------------------------
# Input matrix for the risk_ae meta-model (AE scores plus selected features).
# ---------------------------------------------------------------------------
ae_meta_features = [
    "ae_log_recon_error_v11",
    "ae_z_recon_error_v11",
    "amount",
    "log_amount",
    "z_amount_30d",
    "user_tx_1m",
    "user_tx_10m",
    "user_tx_60m",
    "user_sum_60m",
    "degree_cst",
    "degree_dir",
    "cst_fraud_share",
    "dir_fraud_share",
    "one_to_many_flag",
    "many_to_one_flag",
    "sess_logins_7d",
    "sess_logins_30d",
    "sess_logins_7d_30d_ratio",
    "sess_login_freq_7d",
    "sess_login_freq_30d",
    "sess_burstiness_login_interval",
    "sess_z_login_interval_7d",
    "hour",
    "dayofweek",
    "is_weekend",
]

# Fail early if the feature store lacks any of the required columns.
missing_meta = [c for c in ae_meta_features if c not in df.columns]
if missing_meta:
    raise ValueError(f"Отсутствуют фичи для ae_meta_features: {missing_meta}")

df_meta = df[ae_meta_features].copy()
# Infinite values are treated as missing.
df_meta = df_meta.replace([np.inf, -np.inf], np.nan)
# Median imputation. A column that is entirely NaN has a NaN median, which
# would leave NaNs in X_meta and make LogisticRegression.fit fail; such
# columns are filled with 0.0 instead.
meta_medians = df_meta.median(axis=0).fillna(0.0)
df_meta = df_meta.fillna(meta_medians)

X_meta = df_meta.values.astype(np.float32)
y_meta = df["target"].values.astype(int)

print("\nРазмер X_meta:", X_meta.shape)
print("ae_meta_features:", ae_meta_features)

# Persist the ordered list of meta-model inputs as JSON.
ae_meta_features_path = CONFIG_PATH / "autoencoder_meta_features_v11.json"
with ae_meta_features_path.open("w", encoding="utf-8") as f:
    json.dump(ae_meta_features, f, ensure_ascii=False, indent=2)
print(f"Список фич мета-модели сохранён в {ae_meta_features_path}")


# ---------------------------------------------------------------------------
# Out-of-fold predictions of the risk_ae meta-model (5-fold stratified CV).
# ---------------------------------------------------------------------------
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
oof_pred = np.zeros(df.shape[0], dtype=np.float32)
fold_metrics = []

print("\n================ OOF-обучение risk_ae (LogisticRegression) ================")

fold = 0
for tr_idx, val_idx in skf.split(X_meta, y_meta):
    fold += 1

    X_tr = X_meta[tr_idx]
    y_tr = y_meta[tr_idx]
    X_val = X_meta[val_idx]
    y_val = y_meta[val_idx]

    # A fresh scaler + classifier per fold, so nothing is fitted on held-out rows.
    logreg_step = LogisticRegression(
        class_weight="balanced",
        C=0.5,
        max_iter=2000,
        solver="lbfgs",
        n_jobs=-1,
    )
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", logreg_step),
    ])

    print(f"\n=== Fold {fold}/5 ===")
    # Class counts of the training part of this fold.
    print(f"  fold pos={y_tr.sum()}, neg={len(y_tr) - y_tr.sum()}")

    pipe.fit(X_tr, y_tr)
    val_proba = pipe.predict_proba(X_val)[:, 1]
    oof_pred[val_idx] = val_proba

    fold_roc = roc_auc_score(y_val, val_proba)
    fold_pr = average_precision_score(y_val, val_proba)
    fold_metrics.append((fold_roc, fold_pr))

    print(f"  Fold ROC-AUC={fold_roc:.4f}, PR-AUC={fold_pr:.4f}")

# Metrics over the concatenated out-of-fold predictions.
roc_oof = roc_auc_score(y_meta, oof_pred)
pr_oof = average_precision_score(y_meta, oof_pred)

print("\n=== CV по фолдам для risk_ae ===")
for i, (r, p) in enumerate(fold_metrics, start=1):
    print(f"  Fold {i}: ROC-AUC={r:.4f}, PR-AUC={p:.4f}")
print(f"\nROC-AUC (OOF): {roc_oof:.4f}")
print(f"PR-AUC  (OOF): {pr_oof:.4f}")
print(f"Baseline PR-AUC (random): {fraud_share:.6f}")

df["risk_ae_oof_v11"] = oof_pred


def compute_best_thresholds(score, y_true):
    """Pick three operating thresholds for a binary score.

    200 evenly spaced candidate thresholds between ``score.min()`` and
    ``score.max()`` are evaluated; a row is flagged when ``score >= threshold``.

    Strategies (ties are resolved in favour of the lowest threshold):
      * ``balanced``   - maximum F1.
      * ``aggressive`` - among candidates with recall >= 0.95, maximum of
        ``f1 + 0.1 * precision``; if none qualifies, the candidate with the
        highest recall.
      * ``friendly``   - maximum of ``precision + 0.1 * recall``.

    Args:
        score: NumPy array of scores, one per sample.
        y_true: Binary ground-truth labels aligned with ``score``.

    Returns:
        Dict keyed by strategy name; each value is a dict with float
        ``threshold``, ``precision``, ``recall`` and ``f1``.
    """

    def _as_report(entry):
        # entry is a (threshold, precision, recall, f1) tuple.
        thr, prec, rec, f1 = entry
        return {
            "threshold": float(thr),
            "precision": float(prec),
            "recall": float(rec),
            "f1": float(f1),
        }

    # Evaluate every candidate once and keep (thr, precision, recall, f1).
    stats = []
    for thr in np.linspace(score.min(), score.max(), 200):
        flagged = (score >= thr).astype(int)
        prec, rec, f1, _ = precision_recall_fscore_support(
            y_true, flagged, average="binary", zero_division=0
        )
        stats.append((thr, prec, rec, f1))

    # max() returns the first maximal element, i.e. the lowest such threshold.
    balanced = max(stats, key=lambda s: s[3])

    high_recall = [s for s in stats if s[2] >= 0.95]
    if high_recall:
        aggressive = max(high_recall, key=lambda s: s[3] + 0.1 * s[1])
    else:
        aggressive = max(stats, key=lambda s: s[2])

    # Friendly
    friendly = max(stats, key=lambda s: s[1] + 0.1 * s[2])

    return {
        "balanced": _as_report(balanced),
        "aggressive": _as_report(aggressive),
        "friendly": _as_report(friendly),
    }

# ---------------------------------------------------------------------------
# Thresholds, final meta-model and persistence of all artefacts.
# ---------------------------------------------------------------------------
# Operating thresholds are chosen on the out-of-fold predictions.
thr_info = compute_best_thresholds(oof_pred, y_meta)

print("\n=== Пороги стратегий по OOF для risk_ae ===")
for name, stat in thr_info.items():
    print(
        f"{name.capitalize()}: "
        f"threshold={stat['threshold']:.3f}, "
        f"precision={stat['precision']:.3f}, "
        f"recall={stat['recall']:.3f}, "
        f"f1={stat['f1']:.3f}"
    )


# Final meta-model, fitted on every row with the same settings as the CV folds.
final_meta_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(
        class_weight="balanced",
        C=0.5,
        max_iter=2000,
        solver="lbfgs",
        n_jobs=-1,
    )),
])
final_meta_pipe.fit(X_meta, y_meta)
# NOTE: these are in-sample predictions (the model has seen these rows);
# risk_ae_oof_v11 holds the out-of-fold counterpart.
risk_full = final_meta_pipe.predict_proba(X_meta)[:, 1]
df["risk_ae_v11"] = risk_full

meta_model_path = MODELS_PATH / "tx_autoencoder_meta_v11.pkl"
joblib.dump(final_meta_pipe, meta_model_path)
print(f"\nМета-модель risk_ae сохранена в {meta_model_path}")


# Everything needed to interpret the scores, stored as one JSON document.
thresholds_config = {
    "anomaly_score": {
        "log_error_q99": float(q99),
        "log_error_q999": float(q999),
        "mu_log_normal": float(mu_log),
        "sigma_log_normal": float(sigma_log),
    },
    "risk_ae": thr_info,
    "oof_metrics": {
        "roc_auc": float(roc_oof),
        "pr_auc": float(pr_oof),
        "fraud_share": float(fraud_share),
    },
    "ae_metrics": {
        "roc_auc": float(roc_ae),
        "pr_auc": float(pr_ae),
    },
}

thr_config_path = CONFIG_PATH / "autoencoder_thresholds_v11.json"
with thr_config_path.open("w", encoding="utf-8") as f:
    json.dump(thresholds_config, f, ensure_ascii=False, indent=2)

print(f"\nПороги AE/risk_ae сохранены в {thr_config_path}")


# The feature store is both the input and the output of this script. Write the
# new version to a sibling temp file and rename it over the original, so an
# interrupted or failed write cannot leave a truncated feature store behind.
tmp_features_path = FEATURES_PARQUET.with_name(FEATURES_PARQUET.name + ".tmp")
try:
    df.to_parquet(tmp_features_path, index=False)
    tmp_features_path.replace(FEATURES_PARQUET)
finally:
    # No-op after a successful rename; removes the partial file otherwise.
    tmp_features_path.unlink(missing_ok=True)
print(f"\n✅ Обновлённый feature store с AE и risk_ae сохранён в {FEATURES_PARQUET}")

print("\nГотово: Autoencoder v11 + risk_ae_v11 построены и сохранены.")


DEVICE: cuda
Пробуем прочитать Parquet: data/processed/features_offline_v11.parquet
✅ Успешно прочитан как Parquet.
Всего строк: 13113
Колонок: 193
Первые колонки: ['transdatetime', 'cst_dim_id', 'transdate', 'amount', 'docno', 'direction', 'target', 'row_id', 'sess_monthly_os_changes', 'sess_monthly_phone_model_changes', 'sess_logins_7d', 'sess_logins_30d', 'sess_login_freq_7d', 'sess_login_freq_30d', 'sess_freq_change_7d_vs_mean', 'sess_logins_7d_30d_ratio', 'sess_avg_login_interval_30d', 'sess_std_login_interval_30d', 'sess_var_login_interval_30d', 'sess_ewm_login_interval_7d', 'sess_burstiness_login_interval', 'sess_fano_login_interval', 'sess_z_login_interval_7d', 'sess_has_login_history', 'sess_last_phone_model'] ...

Распределение target по всему датасету:
target
0    12948
1      165
Name: count, dtype: int64
Доля фрода: 0.012582932967284374

Размеры групп фич:
  base_cols: 34
  graph_cols: 6
  sess_cols: 18
  emb_cst_cols: 64
  emb_dir_cols: 64

Размерность варианта F_full_wit