## 01. Library import & paths

**Purpose**: Import libs, set reproducibility, define project paths, and prepare artifacts/reports folders.

In [25]:
import os
import sys
import json
import time
import gc
import warnings
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

# Reproducibility
SEED = 42
np.random.seed(SEED)

# Clean logs
warnings.filterwarnings("ignore", category=FutureWarning, module="pyarrow")

# Pandas display (debug convenience)
pd.set_option("display.max_columns", 200)
pd.set_option("display.width", 120)

# Version snapshot (for run metadata)
def lib_versions() -> Dict[str, str]:
    return {
        "pandas": pd.__version__,
        "numpy": np.__version__,
        "pyarrow": pa.__version__,
    }

print(f"pandas: {pd.__version__} | numpy: {np.__version__} | pyarrow: {pa.__version__}")

# Project structure (support running from notebooks/ or project root)
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
SRC_DIR      = (PROJECT_ROOT / "src").resolve()
DATA_DIR     = (PROJECT_ROOT / "data").resolve()
ARTIFACTS_DIR= (PROJECT_ROOT / "artifacts").resolve()
REPORTS_DIR  = (PROJECT_ROOT / "reports").resolve()
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Ensure src is importable (for features_extended.py)
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Artifacts (v1 & v3)
FINAL_DS_V1_PATH = ARTIFACTS_DIR / "final_dataset.parquet"   # produced by 03
FINAL_DS_V2_PATH = ARTIFACTS_DIR / "final_dataset_v2.parquet" # produced by 03.1
FINAL_DS_V3_PATH = ARTIFACTS_DIR / "final_dataset_v3.parquet"   # this notebook output
META_V3_JSON     = REPORTS_DIR   / "final_v3_meta.json"         # run metadata
FEATURE_LIST_V3  = REPORTS_DIR   / "final_v3_feature_list.json" # feature inventory

print("ENV OK")

pandas: 2.2.2 | numpy: 1.26.4 | pyarrow: 21.0.0
ENV OK


## 02. Load base final v1

**Purpose**: Load the existing final dataset (v1) as a baseline input, apply light downcasting, and set run-specific report paths.

In [26]:
from importlib import import_module

# Align report filenames with your existing convention in /reports
META_V2_JSON    = REPORTS_DIR / "03_1_final_dataset_v2_report.json"   # will be created later
FEATURE_LIST_V2 = REPORTS_DIR / "final_v2_feature_list.json"          # will be created later

# Optional safe reader from src.utils (falls back to pandas if not available)
def read_parquet_safe(path: Path) -> pd.DataFrame:
    try:
        m = import_module("utils")
        if hasattr(m, "read_parquet_safe"):
            return m.read_parquet_safe(path)
    except Exception:
        pass
    return pd.read_parquet(path)

t0 = time.time()
df_base = read_parquet_safe(FINAL_DS_V1_PATH)
print(f"Loaded v1 dataset: {df_base.shape} in {time.time()-t0:.1f}s")

# Light numeric downcast to save RAM
num_cols = df_base.select_dtypes(include=["int64","float64","int32","float32"]).columns.tolist()
for c in num_cols:
    if pd.api.types.is_float_dtype(df_base[c]):
        df_base[c] = pd.to_numeric(df_base[c], downcast="float")
    else:
        df_base[c] = pd.to_numeric(df_base[c], downcast="integer")

TARGET   = "target"
ID_COLS  = ["id", "rn"]

print("Downcasted. Columns:", df_base.shape[1], "| Target:", TARGET, "| IDs:", ID_COLS)
gc.collect()

Loaded v1 dataset: (3000000, 45) in 0.4s
Downcasted. Columns: 45 | Target: target | IDs: ['id', 'rn']


2016

## 03. Init FeatureGeneratorExtended (config)

**Purpose**: Configure and initialize the extended feature generator; prepare payment matrix and basic settings.

In [27]:
import importlib
import utils.features as f
import utils.features_extended as fe
importlib.reload(f)
importlib.reload(fe)

from utils.features_extended import FeatureConfigExtended, FeatureGeneratorExtended

In [28]:
cfg = FeatureConfigExtended(verbose=True)
fg  = FeatureGeneratorExtended(cfg)
print(type(cfg).__name__, type(fg).__name__)

FeatureConfigExtended FeatureGeneratorExtended


In [29]:
from utils.features_extended import FeatureConfigExtended, FeatureGeneratorExtended

# Detect payment columns (enc_paym_0 is the most recent by project convention)
PAYM_COLS = sorted([c for c in df_base.columns if c.startswith("enc_paym_")],
                   key=lambda s: int(s.split("_")[-1]))  # enc_paym_0, enc_paym_1, ...

paym_df = df_base[PAYM_COLS].copy() if len(PAYM_COLS) else pd.DataFrame(index=df_base.index)

# Configure which groups to enable; tweak windows/caps if needed
cfg_ext = FeatureConfigExtended(
    use_payment_seq=True,
    use_ratios=True,
    use_bucket_severity=True,
    use_interactions=True,
    windows=[3, 6, 12, 24],
    cap_outliers=True,
    cap_bounds={"_default": (1, 99)},  # per-column overrides can be added later
    eps=1e-4,
    # Payment code mapping (adjust if your scheme differs)
    paym_ok_values=(0, 1),
    paym_late_values=(2, 3, 4, 5, 6, 7, 8, 9),
    verbose=True,
)

fg_ext = FeatureGeneratorExtended(cfg_ext)

print(f"PAYM_COLS: {len(PAYM_COLS)} | df_base: {df_base.shape} | paym_df: {paym_df.shape}")

PAYM_COLS: 25 | df_base: (3000000, 45) | paym_df: (3000000, 25)


## 04. Generate extended features

**Purpose**: Build extended features using FeatureGeneratorExtended and assemble v2 dataset (id, rn, features, target).

In [30]:
t_start = time.time()

# Keep meta/target
ID_COLS = ["id", "rn"]
TARGET  = "target"

# Separate payment matrix (already prepared as paym_df) and pass the full base frame to the generator
X_in  = df_base.copy()
X_ext = fg_ext.transform(X_in, paym_df)  # adds extended features on a copy

# Ensure target and ids are present and first in order
cols_order = ID_COLS + [TARGET] + [c for c in X_ext.columns if c not in ID_COLS + [TARGET]]
df_v2 = X_ext[cols_order].copy()

print(f"Extended features built: +{df_v2.shape[1] - df_base.shape[1]} columns "
      f"(total {df_v2.shape[1]}) in {time.time()-t_start:.1f}s")

# Memory relief
del X_in, X_ext
gc.collect()

[just] paym_ok_share_3: Share of OK payments within last 3 periods.
[just] paym_late_share_3: Share of late payments within last 3 periods.
[just] paym_ok_share_6: Share of OK payments within last 6 periods.
[just] paym_late_share_6: Share of late payments within last 6 periods.
[just] paym_ok_share_12: Share of OK payments within last 12 periods.
[just] paym_late_share_12: Share of late payments within last 12 periods.
[just] paym_ok_share_24: Share of OK payments within last 24 periods.
[just] paym_late_share_24: Share of late payments within last 24 periods.
[just] paym_longest_ok_streak_24: Longest consecutive OK streak within ~24 periods.
[just] paym_longest_late_streak_24: Longest consecutive LATE streak within ~24 periods.
[just] paym_last_late_recency: Recency of the last late event (0=now, 1=prev, NaN=never).
[just] paym_last_ok_recency: Recency of the last OK event (0=now, 1=prev, NaN=never).
[just] paym_ok_trend_6: Slope of OK share over last 6 periods (trend of discipline).

100

## 05. Init FeatureGeneratorExtendedV2 (config)

**Purpose**: Configure and initialize the extended feature generator v2; prepare payment matrix and basic settings.

In [31]:
import importlib
import utils.features as f
import utils.features_extended_v2 as fe

importlib.reload(f)
importlib.reload(fe)

from utils.features_extended_v2 import (
    FeatureConfigExtendedV2,
    FeatureGeneratorExtendedV2,
    sanitize_dtypes
)

In [32]:
cfg = FeatureConfigExtendedV2(verbose=True)
fg  = FeatureGeneratorExtendedV2(cfg)
print(type(cfg).__name__, type(fg).__name__)

FeatureConfigExtendedV2 FeatureGeneratorExtendedV2


In [58]:
# Sanitize dtypes once (pyarrow/bool → numpy)
df_v2 = sanitize_dtypes(df_v2)

# Detect payment columns (enc_paym_0 is the most recent by project convention)
PAYM_COLS = sorted(
    [c for c in df_v2.columns if c.startswith("enc_paym_")],
    key=lambda s: int(s.split("_")[-1]) if s.split("_")[-1].isdigit() else 10**9
)  # enc_paym_0, enc_paym_1, ...

paym_df = df_v2[PAYM_COLS].copy() if len(PAYM_COLS) else pd.DataFrame(index=df_v2.index)

# Configure which groups to enable; tweak windows/caps if needed
cfg_ext = FeatureConfigExtendedV2(
    # базовые
    use_payment_seq=True,
    use_payment_transitions=True,
    use_ratios=True,
    use_outstanding_ratios=True,
    use_bucket_severity=True,
    use_interactions=True,
    use_zero_aggregates=True,
    use_logs=True,
    use_time_decay=True,

    windows=[3, 6, 12, 24],
    cap_outliers=True,
    cap_bounds={"_default": (1, 99)},
    eps=1e-4,

    # МНОВЕДЁМ НОВОЕ:
    use_momentum=True,
    use_cross_ratios=True,
    use_behavioral_flags=True,
    use_age_exposure=True,
    use_interaction_grid=True,

    # multi-decay (добавит несколько версий decay-скора)
    time_decay=0.88,               # legacy-скаляр (оставляем)
    time_decays=[0.88, 0.95, 0.70],# новые множественные

    # momentum окна
    momentum_windows=[6, 12, 24],

    # хотим дискретизацию util? ставим >1 (0 — выкл)
    risk_band_bins=10,              # например, 10 чтобы включить квантильные бины

    # карты статусов (оставь как есть)
    paym_ok_values=(0, 1),
    paym_late_values=(2, 3, 4, 5, 6, 7, 8, 9),

    verbose=True,
)

fg_ext = FeatureGeneratorExtendedV2(cfg_ext)

print(f"PAYM_COLS: {len(PAYM_COLS)}  |  df_v2: {df_v2.shape}  |  paym_df: {paym_df.shape}")

PAYM_COLS: 25  |  df_v2: (3000000, 63)  |  paym_df: (3000000, 25)


In [34]:
df_v2.columns

Index(['id', 'rn', 'target', 'enc_paym_0', 'enc_paym_1', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12', 'enc_paym_13',
       'enc_paym_14', 'enc_paym_15', 'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_2',
       'enc_paym_20', 'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24', 'enc_paym_3', 'enc_paym_4',
       'enc_paym_5', 'enc_paym_6', 'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'pre_fterm', 'pre_loans3060',
       'pre_loans5', 'pre_loans530', 'pre_loans6090', 'pre_loans90', 'pre_loans_credit_limit',
       'pre_loans_max_overdue_sum', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_pterm',
       'pre_since_confirmed', 'pre_since_opened', 'pre_till_fclose', 'pre_till_pclose', 'paym_last_status',
       'paym_last_clean_streak', 'paym_ok_share_3', 'paym_late_share_3', 'paym_ok_share_6', 'paym_late_share_6',
       'paym_ok_share_12', 'paym_late_share_12', 'paym_ok_share_24', 'paym_late_share_24', 'paym_longest_ok_streak_24',
       'paym_longe

In [35]:
paym_df.columns

Index(['enc_paym_0', 'enc_paym_1', 'enc_paym_2', 'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12', 'enc_paym_13', 'enc_paym_14',
       'enc_paym_15', 'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20', 'enc_paym_21',
       'enc_paym_22', 'enc_paym_23', 'enc_paym_24'],
      dtype='object')

In [36]:
df_base.columns

Index(['id', 'enc_paym_0', 'enc_paym_1', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12', 'enc_paym_13', 'enc_paym_14',
       'enc_paym_15', 'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_2', 'enc_paym_20',
       'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24', 'enc_paym_3', 'enc_paym_4', 'enc_paym_5',
       'enc_paym_6', 'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'pre_fterm', 'pre_loans3060', 'pre_loans5',
       'pre_loans530', 'pre_loans6090', 'pre_loans90', 'pre_loans_credit_limit', 'pre_loans_max_overdue_sum',
       'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_pterm', 'pre_since_confirmed', 'pre_since_opened',
       'pre_till_fclose', 'pre_till_pclose', 'rn', 'paym_last_status', 'paym_last_clean_streak', 'target'],
      dtype='object')

## 06. Generate extended features V2

**Purpose**: Apply the new `FeatureGeneratorExtendedV2` to the enriched dataset (`df_v2`), build second-level features (ratios, transitions, logs, time-decay, interactions), and assemble the final dataset version `v3`.

In [59]:
# Build extended features (V2) using upgraded generator with proxy utilities
t_start = time.time()

# --- IDs & target ---
ID_COLS = ["id", "rn"]
TARGET  = "target"

# --- Transform df_v2 with FeatureGeneratorExtendedV2 ---
X_in  = df_v2.copy()
X_ext = fg_ext.transform(X_in, paym_df)   # adds new V2 features

# --- Ensure proper column order ---
cols_order = ID_COLS + [TARGET] + [c for c in X_ext.columns if c not in ID_COLS + [TARGET]]
df_v3 = X_ext[cols_order].copy()

print(f"V2 → V3: +{df_v3.shape[1] - df_v2.shape[1]} columns "
      f"(total {df_v3.shape[1]}) in {time.time() - t_start:.1f}s")

# --- Memory cleanup ---
del X_in, X_ext
gc.collect()

[just] paym_ok_share_3: Share of OK payments within last 3 periods.
[just] paym_late_share_3: Share of late payments within last 3 periods.
[just] paym_ok_share_6: Share of OK payments within last 6 periods.
[just] paym_late_share_6: Share of late payments within last 6 periods.
[just] paym_ok_share_12: Share of OK payments within last 12 periods.
[just] paym_late_share_12: Share of late payments within last 12 periods.
[just] paym_ok_share_24: Share of OK payments within last 24 periods.
[just] paym_late_share_24: Share of late payments within last 24 periods.
[just] paym_longest_ok_streak_24: Longest consecutive OK streak within ~24 periods.
[just] paym_longest_late_streak_24: Longest consecutive LATE streak within ~24 periods.
[just] paym_last_late_recency: Recency of the last late event (0=now, 1=prev, NaN=never).
[just] paym_last_ok_recency: Recency of the last OK event (0=now, 1=prev, NaN=never).
[just] paym_ok_trend_6: Slope of OK share over last 6 periods.
[just] paym_transitio

118

In [49]:
df_v3.columns

Index(['id', 'rn', 'target', 'enc_paym_0', 'enc_paym_1', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12', 'enc_paym_13',
       'enc_paym_14', 'enc_paym_15', 'enc_paym_16', 'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_2',
       'enc_paym_20', 'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24', 'enc_paym_3', 'enc_paym_4',
       'enc_paym_5', 'enc_paym_6', 'enc_paym_7', 'enc_paym_8', 'enc_paym_9', 'pre_fterm', 'pre_loans3060',
       'pre_loans5', 'pre_loans530', 'pre_loans6090', 'pre_loans90', 'pre_loans_credit_limit',
       'pre_loans_max_overdue_sum', 'pre_loans_outstanding', 'pre_loans_total_overdue', 'pre_pterm',
       'pre_since_confirmed', 'pre_since_opened', 'pre_till_fclose', 'pre_till_pclose', 'paym_last_status',
       'paym_last_clean_streak', 'paym_ok_share_3', 'paym_late_share_3', 'paym_ok_share_6', 'paym_late_share_6',
       'paym_ok_share_12', 'paym_late_share_12', 'paym_ok_share_24', 'paym_late_share_24', 'paym_longest_ok_streak_24',
       'paym_longe

## 07. Quick post-check

**Purpose**: Perform light validation of the v3 dataset — shape, NaN rate, duplicates, class balance, and feature overview.

In [60]:
def quick_postcheck(df: pd.DataFrame, target_col: str = "target") -> Dict[str, Any]:
    """Compute quick validation summary for the dataset."""
    report = {}
    report["shape"] = df.shape
    report["columns"] = len(df.columns)
    report["nans_total"] = int(df.isna().sum().sum())
    report["nan_rate_pct"] = float(df.isna().mean().mean() * 100)

    # Duplicates by id+rn
    if all(col in df.columns for col in ["id", "rn"]):
        dup_count = df.duplicated(subset=["id", "rn"]).sum()
    else:
        dup_count = df.duplicated().sum()
    report["duplicates"] = int(dup_count)

    # Target distribution
    if target_col in df.columns:
        vc = df[target_col].value_counts(dropna=False, normalize=True)
        report["target_dist"] = {int(k): float(v) for k, v in vc.items()}

    # Null-rate per column (top 10)
    nulls = df.isna().mean().sort_values(ascending=False).head(10).to_dict()
    report["top10_null_cols"] = {k: round(v * 100, 2) for k, v in nulls.items()}

    return report

qc_report = quick_postcheck(df_v3, target_col=TARGET)

print(f"Dataset v3 shape: {qc_report['shape']}")
print(f"Total NaNs: {qc_report['nans_total']} ({qc_report['nan_rate_pct']:.2f}%)")
print(f"Duplicates by id+rn: {qc_report['duplicates']}")
print("Target distribution:", qc_report.get("target_dist", {}))
print("Top 10 NaN-rate columns (%):", qc_report["top10_null_cols"])

# Save quick summary to report JSON
quick_report_path = REPORTS_DIR / "03_2_final_dataset_v3_quickcheck.json"
with open(quick_report_path, "w", encoding="utf-8") as f:
    json.dump(qc_report, f, ensure_ascii=False, indent=2)

print("Saved:", quick_report_path)

Dataset v3 shape: (3000000, 100)
Total NaNs: 434943 (0.14%)
Duplicates by id+rn: 0
Target distribution: {0: 0.9645193333333333, 1: 0.03548066666666667}
Top 10 NaN-rate columns (%): {'paym_last_ok_recency': 7.76, 'recency_x_total_overdue': 3.37, 'paym_last_late_recency': 3.37, 'id': 0.0, 'paym_transitions_3': 0.0, 'paym_transitions_24': 0.0, 'paym_any_late_12': 0.0, 'paym_late_std_12': 0.0, 'paym_transitions_12': 0.0, 'paym_any_late_6': 0.0}
Saved: D:\final_v2\credit-risk-management\reports\03_2_final_dataset_v3_quickcheck.json


## 08. Extended checks

**Purpose**: Run extended validation — correlation snapshot, identical columns scan, and basic schema metadata (safe & sampled).

In [61]:
from hashlib import md5

# Config for safe computation on large frames
EXT_SAMPLE_ROWS = 200_000   # sample for correlation to avoid OOM
TOP_K_CORR_PAIRS = 30       # how many top correlated pairs to keep
CORR_THRESHOLD = 0.98       # flag pairs with |corr| >= threshold

def sample_df(df: pd.DataFrame, n: int) -> pd.DataFrame:
    if len(df) <= n:
        return df
    return df.sample(n=n, random_state=SEED).reset_index(drop=True)

def top_correlated_pairs(df: pd.DataFrame, k: int, thr: float) -> List[Dict[str, Any]]:
    num_cols = df.select_dtypes(include=["number"]).columns.tolist()
    if not num_cols:
        return []
    df_s = sample_df(df[num_cols], EXT_SAMPLE_ROWS)
    corr = df_s.corr(numeric_only=True).abs()
    # extract upper triangle without diagonal
    pairs = []
    cols = corr.columns.tolist()
    for i in range(len(cols)):
        for j in range(i+1, len(cols)):
            c = corr.iat[i, j]
            if not np.isnan(c):
                pairs.append((cols[i], cols[j], float(c)))
    pairs.sort(key=lambda x: x[2], reverse=True)
    out = []
    for a, b, v in pairs[:k]:
        out.append({"col_a": a, "col_b": b, "abs_corr": v, "flag_high": v >= thr})
    return out

def find_identical_columns(df: pd.DataFrame, ignore_cols: List[str]) -> List[List[str]]:
    """Group columns that are byte-identical (after casting NaN to a sentinel)."""
    cols = [c for c in df.columns if c not in ignore_cols]
    sig2cols: Dict[str, List[str]] = {}
    for c in cols:
        s = df[c]
        # normalize to bytes signature
        vals = s.fillna(np.nan).to_numpy()
        try:
            data = vals.tobytes()
        except Exception:
            # fallback: convert to string (slower but safe)
            data = "|".join(map(str, s.fillna("NaN").tolist())).encode("utf-8")
        key = md5(data).hexdigest()
        sig2cols.setdefault(key, []).append(c)
    # keep only groups with 2+ members
    return [v for v in sig2cols.values() if len(v) >= 2]

ext_report: Dict[str, Any] = {}

# Basic schema meta
ext_report["n_rows"] = int(len(df_v3))
ext_report["n_cols"] = int(df_v3.shape[1])
ext_report["numeric_cols"] = int(df_v3.select_dtypes(include=["number"]).shape[1])
ext_report["object_cols"]  = int(df_v3.select_dtypes(include=["object"]).shape[1])

# Correlation snapshot (safe sample)
t0 = time.time()
ext_report["top_corr_pairs"] = top_correlated_pairs(df_v3.drop(columns=["id","rn","target"], errors="ignore"),
                                                    k=TOP_K_CORR_PAIRS, thr=CORR_THRESHOLD)
ext_report["corr_snapshot_time_sec"] = round(time.time() - t0, 2)

# Identical columns (byte-level)
t0 = time.time()
ident_groups = find_identical_columns(df_v3.drop(columns=["id","rn","target"], errors="ignore"),
                                      ignore_cols=[])
ext_report["identical_column_groups"] = ident_groups[:20]  # limit in report
ext_report["identical_scan_time_sec"] = round(time.time() - t0, 2)

# Save extended report
ext_report_path = REPORTS_DIR / "03_2_final_dataset_v3_extcheck.json"
with open(ext_report_path, "w", encoding="utf-8") as f:
    json.dump(ext_report, f, ensure_ascii=False, indent=2)

print(f"Extended checks saved → {ext_report_path}")
print(f"Top {len(ext_report['top_corr_pairs'])} corr pairs (abs):",
      [(p['col_a'], p['col_b'], round(p['abs_corr'],4)) for p in ext_report['top_corr_pairs'][:5]])
print(f"Identical groups found: {len(ident_groups)} (showing up to 20 in report)")

gc.collect()

Extended checks saved → D:\final_v2\credit-risk-management\reports\03_2_final_dataset_v3_extcheck.json
Top 30 corr pairs (abs): [('paym_ok_share_3', 'paym_late_share_3', 1.0), ('paym_ok_share_24', 'paym_late_share_24', 1.0), ('paym_ok_share_delta_6_12', 'paym_late_momentum_12', 1.0), ('outstanding_to_limit', 'credit_headroom', 1.0), ('enc_paym_0', 'paym_last_status', 1.0)]
Identical groups found: 3 (showing up to 20 in report)


33

## 09. Save dataset v3 (+ meta & feature inventory)

**Purpose**: Apply sentinel imputation, prune redundant features, and persist v3 dataset with metadata & feature list.

In [62]:
# --- Sentinel imputation for recency features (v3) ---
RECENCY_SENTINEL = 25
for col in ["paym_last_late_recency", "paym_last_ok_recency"]:
    if col in df_v3.columns:
        df_v3[col] = df_v3[col].fillna(RECENCY_SENTINEL).astype("float32")

# --- Feature pruning (v3) ---
drop_cols = []

# 1) Антикоррелирующие shares: сносим OK-shares полностью
drop_cols += [c for c in df_v3.columns if c.startswith("paym_ok_share_")]

# 2) Оставляем компактный поднабор поздних share/стрик/трендов/recency (как в 03.1)
keep_late_shares = {"paym_late_share_6", "paym_late_share_24"}
keep_streaks     = {"paym_longest_late_streak_24"}
keep_trend       = {"paym_ok_trend_6"}
keep_recency     = {"paym_last_late_recency"}

# 3) Удаляем все paym_* в этих группах, кроме keep_set
keep_set = keep_late_shares | keep_streaks | keep_trend | keep_recency
drop_cols += [
    c for c in df_v3.columns
    if (
        c.startswith("paym_late_share_")
        or c.startswith("paym_longest_")
        or c.startswith("paym_ok_trend_")
        or c.startswith("paym_last_")
    ) and c not in keep_set
]

# 4) Дубликаты/идентичные колонки по отчёту ext-check → удаляем util_x_overdue
if "util_x_overdue" in df_v3.columns:
    drop_cols.append("util_x_overdue")   # identical to overdue_to_limit (see ext-check)

# 5) Сырые enc_paym_*: оставляем первые три
paym_raw_cols = sorted([c for c in df_v3.columns if c.startswith("enc_paym_")],
                       key=lambda s: int(s.split("_")[-1]))
drop_cols += paym_raw_cols[3:]  # keep enc_paym_0..2

# 6) Применяем с учётом новых групп (НЕ трогаем transitions/std/any_late/decay/deltas/ratios/logs/util_x_outstanding)
before_cols = df_v3.shape[1]
df_v3.drop(columns=list(set(drop_cols)), inplace=True, errors="ignore")
after_cols = df_v3.shape[1]
print(f"Pruned {before_cols - after_cols} redundant features; {after_cols} remain.")

Pruned 35 redundant features; 65 remain.


In [63]:
# --- Save dataset v3 ---
t0 = time.time()
df_v3.to_parquet(FINAL_DS_V3_PATH, index=False)
save_time = round(time.time() - t0, 2)
print(f"Saved {FINAL_DS_V3_PATH.name} | {df_v3.shape} in {save_time}s")

# --- Save feature inventory (v3) ---
feature_list = [c for c in df_v3.columns if c not in ["id", "rn", "target"]]
with open(FEATURE_LIST_V3, "w", encoding="utf-8") as f:
    json.dump(sorted(feature_list), f, ensure_ascii=False, indent=2)
print(f"Saved feature list: {FEATURE_LIST_V3}")

# --- Save metadata report (v3) ---
meta = {
    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
    "seed": SEED,
    "n_rows": int(len(df_v3)),
    "n_features": len(feature_list),
    "save_time_sec": save_time,
    "removed_features": sorted(list(set(drop_cols))),
    "recency_sentinel": RECENCY_SENTINEL,
    "lib_versions": lib_versions(),
}
with open(META_V3_JSON, "w", encoding="utf-8") as f:
    json.dump(meta, f, ensure_ascii=False, indent=2)
print(f"Saved meta report: {META_V3_JSON}")

Saved final_dataset_v3.parquet | (3000000, 65) in 4.95s
Saved feature list: D:\final_v2\credit-risk-management\reports\final_v3_feature_list.json
Saved meta report: D:\final_v2\credit-risk-management\reports\final_v3_meta.json


## 10. Changelog: v1 vs v2 feature diff

**Purpose**: compare feature inventory between v2 and v3 datasets and generate a lightweight changelog report (counts, lists).

In [64]:
def load_v2_columns(path: Path) -> List[str]:
    df = pd.read_parquet(path, columns=None)
    return df.columns.tolist()

# Load v2 columns (light operation)
v2_cols = load_v2_columns(FINAL_DS_V2_PATH)
v3_cols = df_v3.columns.tolist()

# Keep only feature columns (exclude id/rn/target)
def feat_only(cols: List[str]) -> List[str]:
    return [c for c in cols if c not in ("id", "rn", "target")]

v2_feats = set(feat_only(v2_cols))
v3_feats = set(feat_only(v3_cols))

added    = sorted(v3_feats - v2_feats)
removed  = sorted(v2_feats - v3_feats)
common   = sorted(v2_feats & v3_feats)

changelog_v3 = {
    "v2_feature_count": len(v2_feats),
    "v3_feature_count": len(v3_feats),
    "added_count": len(added),
    "removed_count": len(removed),
    "common_count": len(common),
    "added": added[:200],
    "removed": removed[:200],
}

change_path_v3 = REPORTS_DIR / "final_v3_changelog.json"
with open(change_path_v3, "w", encoding="utf-8") as f:
    json.dump(changelog_v3, f, ensure_ascii=False, indent=2)

print(f"Saved changelog → {change_path_v3}")
print(f"v2→v3: +{len(added)} / -{len(removed)} | common={len(common)}")

Saved changelog → D:\final_v2\credit-risk-management\reports\final_v3_changelog.json
v2→v3: +34 / -0 | common=28


## 11. Sanity importance (light RandomForest)

**Purpose**: Train a lightweight RandomForest on a sampled subset to estimate relative feature importances (quick signal check).

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

t0 = time.time()

SAMPLE_SIZE = 200_000 if len(df_v3) > 200_000 else len(df_v3)
df_sample = df_v3.sample(SAMPLE_SIZE, random_state=SEED)

X = df_sample.drop(columns=["id", "rn", "target"], errors="ignore")
y = df_sample["target"].astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SEED
)

rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=8,
    n_jobs=-1,
    class_weight="balanced",
    random_state=SEED,
)
rf.fit(X_train, y_train)
pred = rf.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, pred)

imp = (
    pd.Series(rf.feature_importances_, index=X.columns)
      .sort_values(ascending=False)
      .head(20)
      .round(4)
)

# save
imp_path = REPORTS_DIR / "final_v3_feature_importance_rf_sanity.json"
imp.to_json(imp_path, indent=2)

print(f"[Sanity RF] AUC={auc:.4f} | Features: {X.shape[1]} | Sample: {SAMPLE_SIZE} | Saved top-20 → {imp_path.name}")
print(imp)
gc.collect()

[Sanity RF] AUC=0.6449 | Features: 62 | Sample: 200000 | Saved top-20 → final_v3_feature_importance_rf_sanity.json
enc_paym_1                          0.0835
enc_paym_2                          0.0521
enc_paym_0                          0.0501
paym_late_time_decay_07             0.0403
paym_last_late_recency              0.0383
pre_loans_max_overdue_sum           0.0327
paym_late_time_decay                0.0327
maxover_to_limit                    0.0306
paym_transitions_24                 0.0299
outstanding_to_limit                0.0275
util_log1p                          0.0266
util_x_outstanding                  0.0264
pre_since_opened                    0.0257
age_since_opened_minus_confirmed    0.0256
paym_any_late_3                     0.0256
credit_headroom                     0.0255
paym_late_time_decay_088            0.0253
late6_x_util                        0.0253
pre_pterm                           0.0250
maxover_to_outstanding              0.0250
dtype: float64


87

In [66]:
# Purpose: Run remove-one-group ablation on df_v3 to see which new groups matter for AUC (same sample/split across runs).

import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# --- fixed sample/split for comparability ---
SAMPLE_SIZE = 200_000 if len(df_v3) > 200_000 else len(df_v3)
df_s = df_v3.sample(SAMPLE_SIZE, random_state=SEED)
X_full = df_s.drop(columns=["id","rn","target"], errors="ignore")
y_full = df_s["target"].astype(int)
X_tr, X_va, y_tr, y_va = train_test_split(X_full, y_full, test_size=0.25, stratify=y_full, random_state=SEED)

def rf_auc(X_train, X_valid, y_train, y_valid):
    rf = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1, class_weight="balanced", random_state=SEED)
    rf.fit(X_train, y_train)
    return roc_auc_score(y_valid, rf.predict_proba(X_valid)[:,1])

# --- define groups by patterns (v3 add-ons) ---
groups = {
    "transitions":       [c for c in X_full.columns if re.match(r"^paym_transitions_\d+$", c)],
    "late_std":          [c for c in X_full.columns if re.match(r"^paym_late_std_\d+$", c)],
    "any_late":          [c for c in X_full.columns if re.match(r"^paym_any_late_\d+$", c)],
    "time_decay":        [c for c in X_full.columns if c == "paym_late_time_decay"],
    "ok_share_deltas":   [c for c in X_full.columns if re.match(r"^paym_ok_share_delta_\d+_\d+$", c)],
    "logs":              [c for c in X_full.columns if c.endswith("_log1p")],
    "ratios_extra":      [c for c in X_full.columns if c in {"overdue_ratio"}],  # keep simple
    "interactions":      [c for c in X_full.columns if re.match(r"^util_x_(overdue|outstanding)$", c)],
}

# baseline
auc_base = rf_auc(X_tr, X_va, y_tr, y_va)
print(f"Baseline RF AUC: {auc_base:.4f}  | Features: {X_full.shape[1]}  | Sample: {SAMPLE_SIZE}")

# remove-one
rows = []
for name, cols in groups.items():
    if not cols: 
        rows.append((name, 0, None))
        continue
    keep = [c for c in X_full.columns if c not in cols]
    Xtr_k, Xva_k = X_tr[keep], X_va[keep]
    auc_k = rf_auc(Xtr_k, Xva_k, y_tr, y_va)
    rows.append((name, len(cols), round(auc_k - auc_base, 5)))

ablation = pd.DataFrame(rows, columns=["group","n_cols","dAUC_vs_base"]).sort_values("dAUC_vs_base")
print(ablation)

Baseline RF AUC: 0.6360  | Features: 62  | Sample: 200000
             group  n_cols  dAUC_vs_base
7     interactions       1      -0.00017
5             logs       3      -0.00016
2         any_late       4       0.00123
0      transitions       4       0.00129
1         late_std       4       0.00151
6     ratios_extra       1       0.00197
3       time_decay       1       0.00209
4  ok_share_deltas       0           NaN


## 12. Sanity importance (LightGBM)

In [67]:
# Purpose: Quick LightGBM sanity on the same sample/split; report AUC and top importances.

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# reuse the same df_s, X_tr, X_va, y_tr, y_va from ablation cell
lgbm = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=64,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight="balanced",
    random_state=SEED,
    n_jobs=-1,
    eval_metric="auc",
    verbose=-1
)
lgbm.fit(X_tr, y_tr, eval_set=[(X_va, y_va)])
auc_lgb = roc_auc_score(y_va, lgbm.predict_proba(X_va)[:,1])

imp_lgb = (
    pd.Series(lgbm.feature_importances_, index=X_tr.columns)
      .sort_values(ascending=False).head(30)
)

imp_path = REPORTS_DIR / "final_v3_lgbm_sanity_importance_top30.json"
imp_lgb.to_json(imp_path, indent=2)

print(f"[LGBM sanity] AUC={auc_lgb:.4f} | Features: {X_tr.shape[1]} | Sample: {len(X_tr)+len(X_va)} | Saved → {imp_path.name}")
print(imp_lgb.head(15).round(1))

[LGBM sanity] AUC=0.6452 | Features: 62 | Sample: 200000 | Saved → final_v3_lgbm_sanity_importance_top30.json
age_since_opened_minus_confirmed    1837
outstanding_to_limit                1318
pre_pterm                           1248
pre_fterm                           1240
pre_since_confirmed                 1191
late6_x_util                        1129
pre_till_pclose                     1079
pre_till_fclose                      938
maxover_to_limit                     907
pre_since_opened                     853
pre_loans_credit_limit               847
maxover_to_outstanding               463
paym_late_std_24                     429
paym_late_momentum_24                365
paym_last_late_recency               362
dtype: int32
