## 01. Library import & paths

**Purpose**: Import libraries, fix the random seed for reproducibility, define project paths, and create the artifacts/reports folders.

In [1]:
import os
import sys
import json
import time
import gc
import warnings
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

# Seed NumPy's global RNG; SEED is also passed explicitly as random_state later on
SEED = 42
np.random.seed(SEED)

# Keep cell output readable: hide FutureWarnings raised from pyarrow modules
warnings.filterwarnings(action="ignore", category=FutureWarning, module="pyarrow")

# Wider pandas display for interactive inspection of the frames
for _opt_name, _opt_value in (("display.max_columns", 200), ("display.width", 120)):
    pd.set_option(_opt_name, _opt_value)

# Version snapshot (for run metadata)
def lib_versions() -> Dict[str, str]:
    """Return the ``__version__`` string of pandas, numpy and pyarrow, keyed by package name."""
    tracked_modules = (("pandas", pd), ("numpy", np), ("pyarrow", pa))
    return {label: module.__version__ for label, module in tracked_modules}

# Version banner, rendered from the same helper that feeds the run metadata
print(" | ".join(f"{name}: {version}" for name, version in lib_versions().items()))

# Project structure (support running from notebooks/ or project root)
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
SRC_DIR      = (PROJECT_ROOT / "src").resolve()
DATA_DIR     = (PROJECT_ROOT / "data").resolve()
ARTIFACTS_DIR= (PROJECT_ROOT / "artifacts").resolve()
REPORTS_DIR  = (PROJECT_ROOT / "reports").resolve()

# Create the output folders up front so later cells can write into them directly
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Ensure src is importable (for features_extended.py)
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Artifacts (v1 & v2)
FINAL_DS_V1_PATH = ARTIFACTS_DIR / "final_dataset.parquet"        # produced by 03
FINAL_DS_V2_PATH = ARTIFACTS_DIR / "final_dataset_v2.parquet"     # this notebook output
# Same filename that section 02 assigns, so the name is consistent wherever it is read
META_V2_JSON     = REPORTS_DIR  / "03_1_final_dataset_v2_report.json"  # run metadata
FEATURE_LIST_V2  = REPORTS_DIR  / "final_v2_feature_list.json"    # feature inventory

print("ENV OK")

pandas: 2.2.2 | numpy: 1.26.4 | pyarrow: 21.0.0
ENV OK


## 02. Load base final v1

**Purpose**: Load the existing final dataset (v1) as a baseline input, apply light downcasting, and set run-specific report paths.

In [2]:
from importlib import import_module

# Bind the report paths to the filenames used under /reports.
# Both files are only written later, in the save section.
META_V2_JSON    = REPORTS_DIR.joinpath("03_1_final_dataset_v2_report.json")
FEATURE_LIST_V2 = REPORTS_DIR.joinpath("final_v2_feature_list.json")

# Optional safe reader from src.utils (falls back to pandas if not available)
def read_parquet_safe(path: Path) -> pd.DataFrame:
    """Read a parquet file, preferring the project's own reader when it exists.

    Looks for a callable ``read_parquet_safe`` on the importable ``utils``
    module and uses it. If the module is absent or has no such attribute, the
    function falls back to ``pd.read_parquet`` silently. If the module fails to
    import for another reason, or the project reader raises, the fallback is
    still used but a ``RuntimeWarning`` is emitted, so the failure is visible
    instead of being swallowed.

    Parameters
    ----------
    path : Path
        Location of the parquet file.

    Returns
    -------
    pd.DataFrame
        Whatever the selected reader returns for ``path``.

    Raises
    ------
    Exception
        Anything raised by ``pd.read_parquet`` on the fallback path.
    """
    try:
        m = import_module("utils")
    except ModuleNotFoundError:
        # The helper package is optional; its absence is the expected fallback case.
        m = None
    except Exception as exc:
        warnings.warn(
            f"Importing 'utils' failed ({exc!r}); falling back to pandas.read_parquet",
            RuntimeWarning,
            stacklevel=2,
        )
        m = None

    reader = getattr(m, "read_parquet_safe", None)
    if callable(reader):
        try:
            return reader(path)
        except Exception as exc:
            # Best-effort by design: keep going with pandas, but say why.
            warnings.warn(
                f"utils.read_parquet_safe failed ({exc!r}); falling back to pandas.read_parquet",
                RuntimeWarning,
                stacklevel=2,
            )
    return pd.read_parquet(path)

t0 = time.time()
df_base = read_parquet_safe(FINAL_DS_V1_PATH)
print(f"Loaded v1 dataset: {df_base.shape} in {time.time()-t0:.1f}s")

# Shrink 32/64-bit numeric columns to the smallest dtype that still holds their values
num_cols = df_base.select_dtypes(include=["int64","float64","int32","float32"]).columns.tolist()
for col_name in num_cols:
    column = df_base[col_name]
    downcast_kind = "float" if pd.api.types.is_float_dtype(column) else "integer"
    df_base[col_name] = pd.to_numeric(column, downcast=downcast_kind)

# Column roles used by the rest of the notebook
TARGET   = "target"
ID_COLS  = ["id", "rn"]

print("Downcasted. Columns:", df_base.shape[1], "| Target:", TARGET, "| IDs:", ID_COLS)
gc.collect()

Loaded v1 dataset: (3000000, 45) in 0.4s
Downcasted. Columns: 45 | Target: target | IDs: ['id', 'rn']


0

## 03. Init FeatureGeneratorExtended (config)

**Purpose**: Configure and initialize the extended feature generator; prepare payment matrix and basic settings.

In [3]:
# Import the project feature modules and reload them, so the module objects are
# re-executed from source even if they were already imported in this kernel.
import importlib
import utils.features as f
import utils.features_extended as fe
importlib.reload(f)
importlib.reload(fe)

# Import the names only after the reload, so they are taken from the reloaded module.
from utils.features_extended import FeatureConfigExtended, FeatureGeneratorExtended

In [4]:
# Smoke test: both classes can be constructed with default settings
cfg = FeatureConfigExtended(verbose=True)
fg = FeatureGeneratorExtended(cfg)
print(*(type(obj).__name__ for obj in (cfg, fg)))

FeatureConfigExtended FeatureGeneratorExtended


In [5]:
from utils.features_extended import FeatureConfigExtended, FeatureGeneratorExtended

# Payment-history columns, ordered by their numeric suffix: enc_paym_0, enc_paym_1, ...
# (enc_paym_0 is the most recent by project convention)
paym_candidates = [c for c in df_base.columns if c.startswith("enc_paym_")]
PAYM_COLS = sorted(paym_candidates, key=lambda name: int(name.rsplit("_", 1)[1]))

# Payment matrix handed to the generator; an index-only frame when no payment columns exist
if len(PAYM_COLS):
    paym_df = df_base[PAYM_COLS].copy()
else:
    paym_df = pd.DataFrame(index=df_base.index)

# Feature groups to enable plus their tuning knobs; tweak windows/caps if needed
ext_settings = dict(
    use_payment_seq=True,
    use_ratios=True,
    use_bucket_severity=True,
    use_interactions=True,
    windows=[3, 6, 12, 24],
    cap_outliers=True,
    cap_bounds={"_default": (1, 99)},  # per-column overrides can be added later
    eps=1e-4,
    # Payment code mapping (adjust if your scheme differs)
    paym_ok_values=(0, 1),
    paym_late_values=(2, 3, 4, 5, 6, 7, 8, 9),
    verbose=True,
)
cfg_ext = FeatureConfigExtended(**ext_settings)

fg_ext = FeatureGeneratorExtended(cfg_ext)

print(f"PAYM_COLS: {len(PAYM_COLS)} | df_base: {df_base.shape} | paym_df: {paym_df.shape}")

PAYM_COLS: 25 | df_base: (3000000, 45) | paym_df: (3000000, 25)


## 04. Generate extended features

**Purpose**: Build extended features using FeatureGeneratorExtended and assemble the v2 dataset (columns ordered as id, rn, target, then features).

In [6]:
t_start = time.time()

# Column roles (ids + target lead the v2 column order)
ID_COLS = ["id", "rn"]
TARGET  = "target"

# Feed a copy of the base frame plus the payment matrix to the generator.
# NOTE(review): transform presumably returns a frame with the extended columns added — confirm in utils.features_extended
X_in  = df_base.copy()
X_ext = fg_ext.transform(X_in, paym_df)

# ids first, then target, then every remaining column in generator order
lead_cols = ID_COLS + [TARGET]
cols_order = lead_cols + [c for c in X_ext.columns if c not in lead_cols]
df_v2 = X_ext[cols_order].copy()

n_added = df_v2.shape[1] - df_base.shape[1]
print(f"Extended features built: +{n_added} columns "
      f"(total {df_v2.shape[1]}) in {time.time()-t_start:.1f}s")

# Release the intermediate frames
del X_in, X_ext
gc.collect()

[just] paym_ok_share_3: Share of OK payments within last 3 periods.
[just] paym_late_share_3: Share of late payments within last 3 periods.
[just] paym_ok_share_6: Share of OK payments within last 6 periods.
[just] paym_late_share_6: Share of late payments within last 6 periods.
[just] paym_ok_share_12: Share of OK payments within last 12 periods.
[just] paym_late_share_12: Share of late payments within last 12 periods.
[just] paym_ok_share_24: Share of OK payments within last 24 periods.
[just] paym_late_share_24: Share of late payments within last 24 periods.
[just] paym_longest_ok_streak_24: Longest consecutive OK streak within ~24 periods.
[just] paym_longest_late_streak_24: Longest consecutive LATE streak within ~24 periods.
[just] paym_last_late_recency: Recency of the last late event (0=now, 1=prev, NaN=never).
[just] paym_last_ok_recency: Recency of the last OK event (0=now, 1=prev, NaN=never).
[just] paym_ok_trend_6: Slope of OK share over last 6 periods (trend of discipline).

0

## 05. Quick post-check

**Purpose**: Perform light validation of the v2 dataset — shape, NaN rate, duplicates by id+rn, class balance, and the top-10 columns by NaN rate.

In [7]:
def quick_postcheck(df: pd.DataFrame, target_col: str = "target") -> Dict[str, Any]:
    """Compute a quick validation summary for a dataset.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise.
    target_col : str, default "target"
        Name of the label column; the class distribution is reported only when
        this column is present.

    Returns
    -------
    Dict[str, Any]
        ``shape`` (tuple), ``columns`` (int), ``nans_total`` (int),
        ``nan_rate_pct`` (mean of the per-column NaN rates, in percent),
        ``duplicates`` (rows repeating an earlier ``id``+``rn`` pair, or an
        earlier full row when those columns are absent), ``target_dist``
        (share per target value, only when ``target_col`` exists; a missing
        target is reported under the key ``"NaN"``) and ``top10_null_cols``
        (the ten highest per-column NaN rates, in percent, rounded to 2 places).
    """
    # One pass over the frame; every NaN statistic below derives from these counts.
    null_counts = df.isna().sum()
    null_rate = null_counts / len(df) if len(df) else df.isna().mean()

    report: Dict[str, Any] = {}
    report["shape"] = df.shape
    report["columns"] = len(df.columns)
    report["nans_total"] = int(null_counts.sum())
    report["nan_rate_pct"] = float(null_rate.mean() * 100)

    # Duplicates by id+rn
    if all(col in df.columns for col in ["id", "rn"]):
        dup_count = df.duplicated(subset=["id", "rn"]).sum()
    else:
        dup_count = df.duplicated().sum()
    report["duplicates"] = int(dup_count)

    # Target distribution. dropna=False keeps missing labels in the counts, so
    # the key can be NaN; int(NaN) would raise, hence the explicit "NaN" key.
    if target_col in df.columns:
        vc = df[target_col].value_counts(dropna=False, normalize=True)
        report["target_dist"] = {
            ("NaN" if pd.isna(k) else int(k)): float(v) for k, v in vc.items()
        }

    # Null-rate per column (top 10)
    nulls = null_rate.sort_values(ascending=False).head(10).to_dict()
    report["top10_null_cols"] = {k: round(v * 100, 2) for k, v in nulls.items()}

    return report

qc_report = quick_postcheck(df_v2, target_col=TARGET)

# Console summary of the quick check
qc_shape = qc_report["shape"]
qc_nans, qc_nan_pct = qc_report["nans_total"], qc_report["nan_rate_pct"]
print(f"Dataset v2 shape: {qc_shape}")
print(f"Total NaNs: {qc_nans} ({qc_nan_pct:.2f}%)")
print(f"Duplicates by id+rn: {qc_report['duplicates']}")
print("Target distribution:", qc_report.get("target_dist", {}))
print("Top 10 NaN-rate columns (%):", qc_report["top10_null_cols"])

# Persist the same summary next to the other reports
quick_report_path = REPORTS_DIR / "03_1_final_dataset_v2_quickcheck.json"
with quick_report_path.open("w", encoding="utf-8") as fh:
    json.dump(qc_report, fh, ensure_ascii=False, indent=2)

print("Saved:", quick_report_path)

Dataset v2 shape: (3000000, 63)
Total NaNs: 333872 (0.18%)
Duplicates by id+rn: 0
Target distribution: {0: 0.9645193333333333, 1: 0.03548066666666667}
Top 10 NaN-rate columns (%): {'paym_last_ok_recency': 7.76, 'paym_last_late_recency': 3.37, 'id': 0.0, 'paym_late_share_3': 0.0, 'pre_loans_credit_limit': 0.0, 'pre_loans_max_overdue_sum': 0.0, 'pre_loans_outstanding': 0.0, 'pre_loans_total_overdue': 0.0, 'pre_pterm': 0.0, 'pre_since_confirmed': 0.0}
Saved: D:\final_v2\credit-risk-management\reports\03_1_final_dataset_v2_quickcheck.json


## 06. Extended checks

**Purpose**: Run extended validation — correlation snapshot, identical columns scan, and basic schema metadata (safe & sampled).

In [8]:
from hashlib import md5

# Tunables for the extended checks defined below
EXT_SAMPLE_ROWS = 200_000   # row cap for the correlation matrix; larger frames are randomly sampled down to this
TOP_K_CORR_PAIRS = 30       # number of highest-|corr| column pairs kept in the report
CORR_THRESHOLD = 0.98       # pairs with |corr| >= this value get flag_high=True

def sample_df(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Return ``df`` itself when it has at most ``n`` rows, else a seeded random sample of ``n`` rows.

    The sampled frame gets a fresh 0..n-1 index; the untouched frame is
    returned as the same object (no copy).
    """
    fits_already = len(df) <= n
    if fits_already:
        return df
    picked = df.sample(n=n, random_state=SEED)
    return picked.reset_index(drop=True)

def top_correlated_pairs(df: pd.DataFrame, k: int, thr: float) -> List[Dict[str, Any]]:
    """Return the ``k`` column pairs with the highest absolute correlation.

    Only numeric columns are considered. The correlation matrix is computed on
    at most ``EXT_SAMPLE_ROWS`` rows (via ``sample_df``). Pairs whose
    correlation is NaN are skipped. Each result is a dict with ``col_a``,
    ``col_b``, ``abs_corr`` and ``flag_high`` (``abs_corr >= thr``), ordered
    from strongest to weakest.
    """
    num_cols = df.select_dtypes(include=["number"]).columns.tolist()
    if not num_cols:
        return []

    abs_corr = sample_df(df[num_cols], EXT_SAMPLE_ROWS).corr(numeric_only=True).abs()
    names = abs_corr.columns.tolist()

    # Strict upper triangle in row-major order, i.e. every unordered pair exactly once
    row_idx, col_idx = np.triu_indices(len(names), k=1)
    pair_values = abs_corr.to_numpy()[row_idx, col_idx]

    candidates = [
        (names[i], names[j], float(value))
        for i, j, value in zip(row_idx, col_idx, pair_values)
        if not np.isnan(value)
    ]
    ranked = sorted(candidates, key=lambda item: item[2], reverse=True)

    return [
        {"col_a": left, "col_b": right, "abs_corr": strength, "flag_high": strength >= thr}
        for left, right, strength in ranked[:k]
    ]

def find_identical_columns(df: pd.DataFrame, ignore_cols: List[str]) -> List[List[str]]:
    """Group columns whose contents are identical.

    Each column is reduced to an MD5 signature and columns sharing a signature
    are grouped:

    * columns whose NumPy array has a non-object dtype are hashed from the raw
      array buffer, so they match only when stored byte-for-byte the same
      (equal values held in different dtypes are not grouped);
    * object-backed columns (strings, mixed values, nullable types that
      convert to object arrays) are hashed from their values rendered as text
      with missing entries written as ``"NaN"``. The raw buffer of an object
      array holds references rather than the values, so hashing it would miss
      columns with equal content.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are compared.
    ignore_cols : List[str]
        Column names excluded from the comparison.

    Returns
    -------
    List[List[str]]
        One list per group of two or more identical columns, in order of first
        appearance.
    """

    def _text_signature(s: pd.Series) -> bytes:
        # where() on an object copy avoids fillna() restrictions of typed columns
        # (e.g. categoricals reject values that are not an existing category).
        as_object = s.astype(object).where(s.notna(), "NaN")
        return "|".join(map(str, as_object.tolist())).encode("utf-8")

    skipped = set(ignore_cols)
    sig2cols: Dict[str, List[str]] = {}
    for c in df.columns:
        if c in skipped:
            continue
        s = df[c]
        vals = s.to_numpy()
        data = None
        if not vals.dtype.hasobject:
            try:
                data = vals.tobytes()
            except Exception:
                # fall through to the slower text-based signature
                data = None
        if data is None:
            data = _text_signature(s)
        key = md5(data).hexdigest()
        sig2cols.setdefault(key, []).append(c)
    # keep only groups with 2+ members
    return [v for v in sig2cols.values() if len(v) >= 2]

# Schema-level counts go in first; the heavier checks are appended below
ext_report: Dict[str, Any] = {
    "n_rows": int(len(df_v2)),
    "n_cols": int(df_v2.shape[1]),
    "numeric_cols": int(df_v2.select_dtypes(include=["number"]).shape[1]),
    "object_cols": int(df_v2.select_dtypes(include=["object"]).shape[1]),
}

# ids and target are left out of both feature-level scans
non_feature_cols = ["id", "rn", "target"]

# Correlation snapshot (computed on a row sample inside top_correlated_pairs)
t0 = time.time()
top_pairs = top_correlated_pairs(
    df_v2.drop(columns=non_feature_cols, errors="ignore"),
    k=TOP_K_CORR_PAIRS,
    thr=CORR_THRESHOLD,
)
ext_report["top_corr_pairs"] = top_pairs
ext_report["corr_snapshot_time_sec"] = round(time.time() - t0, 2)

# Identical-column scan over the full frame
t0 = time.time()
ident_groups = find_identical_columns(
    df_v2.drop(columns=non_feature_cols, errors="ignore"),
    ignore_cols=[],
)
ext_report["identical_column_groups"] = ident_groups[:20]  # report keeps at most 20 groups
ext_report["identical_scan_time_sec"] = round(time.time() - t0, 2)

# Persist the extended report
ext_report_path = REPORTS_DIR / "03_1_final_dataset_v2_extcheck.json"
with ext_report_path.open("w", encoding="utf-8") as fh:
    json.dump(ext_report, fh, ensure_ascii=False, indent=2)

preview_pairs = [(p['col_a'], p['col_b'], round(p['abs_corr'],4)) for p in top_pairs[:5]]
print(f"Extended checks saved → {ext_report_path}")
print(f"Top {len(top_pairs)} corr pairs (abs):", preview_pairs)
print(f"Identical groups found: {len(ident_groups)} (showing up to 20 in report)")

gc.collect()

Extended checks saved → D:\final_v2\credit-risk-management\reports\03_1_final_dataset_v2_extcheck.json
Top 30 corr pairs (abs): [('paym_ok_share_3', 'paym_late_share_3', 1.0), ('paym_ok_share_24', 'paym_late_share_24', 1.0), ('enc_paym_0', 'paym_last_status', 1.0), ('paym_ok_share_12', 'paym_late_share_12', 1.0), ('paym_ok_share_6', 'paym_late_share_6', 1.0)]
Identical groups found: 0 (showing up to 20 in report)


33

## 07. Save dataset v2 (+ meta & feature inventory)

**Purpose**: Apply sentinel imputation, prune redundant features, and persist v2 dataset with metadata & feature list.

In [9]:
# --- Sentinel imputation for recency features ---
# Missing recency means "no such event"; 25 is one past the last index of enc_paym_0..24
RECENCY_SENTINEL = 25
for col in ("paym_last_late_recency", "paym_last_ok_recency"):
    if col not in df_v2.columns:
        continue
    df_v2[col] = df_v2[col].fillna(RECENCY_SENTINEL).astype("float32")

# --- Feature pruning (to reduce perfect correlation) ---
# Compact subset of engineered payment features that survives the pruning
keep_late_shares = {"paym_late_share_6", "paym_late_share_24"}
keep_streaks     = {"paym_longest_late_streak_24"}
keep_trend       = {"paym_ok_trend_6"}
keep_recency     = {"paym_last_late_recency"}
keep_set = set().union(keep_late_shares, keep_streaks, keep_trend, keep_recency)

# Every engineered payment family is a drop candidate unless explicitly kept.
# paym_ok_share_* goes entirely (perfectly anticorrelated with paym_late_share_*).
engineered_prefixes = (
    "paym_ok_share_",
    "paym_late_share_",
    "paym_longest_",
    "paym_ok_trend_",
    "paym_last_",
)
drop_cols = [
    c for c in df_v2.columns
    if c.startswith(engineered_prefixes) and c not in keep_set
]

# paym_last_status duplicates enc_paym_0
if "paym_last_status" in df_v2.columns:
    drop_cols.append("paym_last_status")

# Raw payment codes: keep only enc_paym_0, enc_paym_1, enc_paym_2
paym_raw_cols = sorted(
    (c for c in df_v2.columns if c.startswith("enc_paym_")),
    key=lambda name: int(name.rsplit("_", 1)[1]),
)
drop_cols.extend(paym_raw_cols[3:])

# Apply the drop in place and report how many columns went away
removed_features = sorted(set(drop_cols))
before_cols = df_v2.shape[1]
df_v2.drop(columns=removed_features, inplace=True, errors="ignore")
after_cols = df_v2.shape[1]
print(f"Pruned {before_cols - after_cols} redundant features; {after_cols} remain.")

# --- Save dataset v2 ---
t0 = time.time()
df_v2.to_parquet(FINAL_DS_V2_PATH, index=False)
save_time = round(time.time() - t0, 2)
print(f"Saved {FINAL_DS_V2_PATH.name} | {df_v2.shape} in {save_time}s")

# --- Save feature inventory ---
feature_list = [c for c in df_v2.columns if c not in ["id", "rn", "target"]]
with open(FEATURE_LIST_V2, "w", encoding="utf-8") as fh:
    json.dump(sorted(feature_list), fh, ensure_ascii=False, indent=2)
print(f"Saved feature list: {FEATURE_LIST_V2}")

# --- Save metadata report ---
meta = dict(
    timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
    seed=SEED,
    n_rows=int(len(df_v2)),
    n_features=len(feature_list),
    save_time_sec=save_time,
    removed_features=removed_features,
    recency_sentinel=RECENCY_SENTINEL,
    lib_versions=lib_versions(),
)
with open(META_V2_JSON, "w", encoding="utf-8") as fh:
    json.dump(meta, fh, ensure_ascii=False, indent=2)
print(f"Saved meta report: {META_V2_JSON}")

gc.collect()

Pruned 32 redundant features; 31 remain.
Saved final_dataset_v2.parquet | (3000000, 31) in 2.49s
Saved feature list: D:\final_v2\credit-risk-management\reports\final_v2_feature_list.json
Saved meta report: D:\final_v2\credit-risk-management\reports\03_1_final_dataset_v2_report.json


66

## 08. Changelog: v1 vs v2 feature diff

**Purpose**: Compare feature sets between final_dataset (v1) and final_dataset_v2 (v2), save a concise changelog.

In [10]:
def load_v1_columns(path: Path) -> List[str]:
    """Return the column names of a parquet file without loading its rows.

    Only the file schema is read. The schema is then turned into a zero-row
    table and converted to pandas, so any pandas metadata stored in the file
    is applied to the result.

    Parameters
    ----------
    path : Path
        Location of the parquet file.

    Returns
    -------
    List[str]
        Column names of the zero-row pandas frame built from the file schema.
    """
    schema = pq.read_schema(str(path))
    # NOTE(review): expected to match pd.read_parquet(path).columns, including
    # for files written with a stored index — confirm against such a file.
    return schema.empty_table().to_pandas().columns.tolist()

# Column names of the stored v1 file and of the in-memory v2 frame
v1_cols = load_v1_columns(FINAL_DS_V1_PATH)
v2_cols = list(df_v2.columns)

# Strip the bookkeeping columns (ids and target) from a column list
def feat_only(cols: List[str]) -> List[str]:
    """Return ``cols`` without ``id``, ``rn`` and ``target``, keeping the original order."""
    non_feature = ("id", "rn", "target")
    return list(filter(lambda name: name not in non_feature, cols))

v1_feats = set(feat_only(v1_cols))
v2_feats = set(feat_only(v2_cols))

# Set algebra between the two feature inventories, sorted for stable output
added   = sorted(v2_feats.difference(v1_feats))
removed = sorted(v1_feats.difference(v2_feats))
common  = sorted(v1_feats.intersection(v2_feats))

# Counts are exact; the name lists are capped at 200 entries each
changelog = dict(
    v1_feature_count=len(v1_feats),
    v2_feature_count=len(v2_feats),
    added_count=len(added),
    removed_count=len(removed),
    common_count=len(common),
    added=added[:200],
    removed=removed[:200],
)

change_path = REPORTS_DIR / "final_v2_changelog.json"
with change_path.open("w", encoding="utf-8") as fh:
    json.dump(changelog, fh, ensure_ascii=False, indent=2)

print(f"Saved changelog → {change_path}")
print(f"v1→v2: +{len(added)} / -{len(removed)} | common={len(common)}")

Saved changelog → D:\final_v2\credit-risk-management\reports\final_v2_changelog.json
v1→v2: +10 / -24 | common=18


## 09. Sanity importance (light RandomForest)

**Purpose**: Train a lightweight RandomForest on a sampled subset to estimate relative feature importances (quick signal check).

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

t0 = time.time()

# Plain random row sample (not stratified) to keep training quick;
# class balance is preserved only by the stratified train/valid split below
SAMPLE_SIZE = min(200_000, len(df_v2))
df_sample = df_v2.sample(SAMPLE_SIZE, random_state=SEED)
X = df_sample.drop(columns=["id", "rn", "target"], errors="ignore")
y = df_sample["target"].astype(int)

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=SEED
)

# Small, shallow forest: enough for a relative importance ranking
rf_params = dict(
    n_estimators=50,
    max_depth=8,
    n_jobs=-1,
    class_weight="balanced",
    random_state=SEED,
)
rf_sanity = RandomForestClassifier(**rf_params)
rf_sanity.fit(X_train, y_train)

# Score the positive class on the held-out part
pred = rf_sanity.predict_proba(X_valid)[:, 1]
auc = roc_auc_score(y_valid, pred)

# Twenty highest importances, rounded for the report
all_importances = pd.Series(rf_sanity.feature_importances_, index=X.columns)
importances = all_importances.sort_values(ascending=False).head(20).round(4)

# Save to report
importance_path = REPORTS_DIR / "final_v2_feature_importance_rf_sanity.json"
with importance_path.open("w", encoding="utf-8") as fh:
    json.dump(importances.to_dict(), fh, ensure_ascii=False, indent=2)

print(f"RF sanity AUC={auc:.4f} | Saved top-20 importances → {importance_path}")
print(importances)

gc.collect()

RF sanity AUC=0.6433 | Saved top-20 importances → D:\final_v2\credit-risk-management\reports\final_v2_feature_importance_rf_sanity.json
enc_paym_2                     0.0952
paym_last_late_recency         0.0932
enc_paym_0                     0.0868
enc_paym_1                     0.0750
paym_longest_late_streak_24    0.0584
outstanding_to_limit           0.0561
paym_late_share_24             0.0551
maxover_to_limit               0.0547
pre_since_opened               0.0523
pre_pterm                      0.0440
pre_loans_credit_limit         0.0430
pre_loans_max_overdue_sum      0.0402
pre_till_fclose                0.0392
pre_till_pclose                0.0387
pre_fterm                      0.0354
paym_late_share_6              0.0347
pre_loans_outstanding          0.0298
pre_since_confirmed            0.0293
paym_ok_trend_6                0.0154
bucket_severity_score          0.0125
dtype: float64


120