## 01. Library import & paths

**Purpose**: Import required libraries, define project paths, and prepare folders for artifacts and reports.

In [1]:
import os
import sys
import json
import time
import gc
import warnings
from pathlib import Path
from typing import List, Dict, Any

import numpy as np
import pandas as pd

import pyarrow as pa
import pyarrow.parquet as pq

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve

# Reproducibility: a single seed for NumPy's global RNG, reused as random_state further down
SEED = 42
np.random.seed(SEED)

# Keep logs readable: drop FutureWarning messages raised from pyarrow modules
warnings.filterwarnings(action="ignore", category=FutureWarning, module="pyarrow")

# Pandas display limits (debugging convenience only)
for _option, _value in (("display.max_columns", 200), ("display.width", 120)):
    pd.set_option(_option, _value)

# Versions snapshot
def lib_versions() -> Dict[str, str]:
    """Return the installed versions of pandas, numpy and pyarrow.

    Returns:
        Mapping of package name ("pandas", "numpy", "pyarrow") to its
        ``__version__`` string, in that order.
    """
    tracked = (("pandas", pd), ("numpy", np), ("pyarrow", pa))
    return {label: module.__version__ for label, module in tracked}

print(f"pandas: {pd.__version__} | numpy: {np.__version__} | pyarrow: {pa.__version__}")

# Project structure: when the kernel runs from a folder named "notebooks" the project
# root is its parent; otherwise the current working directory is taken as the root.
PROJECT_ROOT   = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
PROJECT_DIR   = Path(".")  # not read anywhere in this notebook; kept so existing references keep working

# Data & artifacts
DATA_DIR       = (PROJECT_ROOT / "data" / "train-data").resolve()   # all raw parquet shards live here
ARTIFACTS_DIR  = (PROJECT_ROOT / "artifacts").resolve()
REPORTS_DIR    = (PROJECT_ROOT / "reports").resolve()
SRC_DIR        = (PROJECT_ROOT / "src").resolve()

# Prepare output folders up front: later cells open files inside REPORTS_DIR for writing,
# which raises FileNotFoundError on a fresh checkout if the folder is missing.
# DATA_DIR and SRC_DIR are inputs, so they are deliberately not created here.
for _out_dir in (ARTIFACTS_DIR, REPORTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Main artifacts
#FINAL_DS_PATH   = ARTIFACTS_DIR / "final_dataset.parquet"  # from 03
FINAL_DS_V2_PATH = ARTIFACTS_DIR / "final_dataset_v2.parquet"  # from 03.1
BASELINE_REPORT = REPORTS_DIR / "baseline_logreg_metrics_v2.json"

print("ENV OK")

pandas: 2.2.2 | numpy: 1.26.4 | pyarrow: 21.0.0
ENV OK


## 02. Load final dataset

**Purpose**: Load the final dataset created in step 03.1 (`final_dataset_v2.parquet`) and downcast numeric columns to optimize memory usage.

In [2]:
# Read the assembled dataset and report how long the load took
t0 = time.time()
df = pd.read_parquet(FINAL_DS_V2_PATH)
print("Loaded:", df.shape, f"in {time.time()-t0:.1f}s")

# Shrink numeric columns to the smallest dtype pandas can downcast them to (saves RAM)
num_cols = (
    df.select_dtypes(include=["int64", "float64", "int32", "float32"])
    .columns
    .tolist()
)
for col in num_cols:
    # Float columns are downcast as floats; every other selected column as integers
    downcast_kind = "float" if pd.api.types.is_float_dtype(df[col]) else "integer"
    df[col] = pd.to_numeric(df[col], downcast=downcast_kind)

print("Downcasted dtypes.")

Loaded: (3000000, 31) in 0.9s
Downcasted dtypes.


## 03. Train/valid split

**Purpose**: Split the dataset into train and validation sets with stratification to preserve the target distribution.

In [3]:
TARGET = "target"
DROP   = ["id", "rn", TARGET]

# Features are all columns not listed in DROP (the id/rn columns and the label)
feature_cols = [col for col in df.columns if col not in DROP]
X = df[feature_cols]
y = df[TARGET].astype(int)

# 80/20 holdout, stratified on the label, with a fixed seed
split_options = dict(test_size=0.20, random_state=SEED, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)

pos_rate_train = y_train.mean().round(4)
pos_rate_valid = y_valid.mean().round(4)
print("Train:", X_train.shape, "Valid:", X_valid.shape,
      "| Pos rate train/valid:", pos_rate_train, pos_rate_valid)

# The full frame is not referenced again in this notebook, so release it
del df
gc.collect()

Train: (2400000, 28) Valid: (600000, 28) | Pos rate train/valid: 0.0355 0.0355


0

## 04. Metrics helper

**Purpose**: Provide a reusable helper to compute ROC-AUC, AUC-PR, and determine the threshold that maximizes F1-score.

In [4]:
def evaluate_binary(y_true, y_proba, name="model"):
    """Score binary-classification probabilities and locate the F1-optimal threshold.

    Args:
        y_true: Ground-truth binary labels.
        y_proba: Predicted scores/probabilities for the positive class.
        name: Label stored under the ``"name"`` key of the result.

    Returns:
        dict with keys ``name``, ``roc_auc``, ``auc_pr`` (average precision),
        ``best_f1``, ``best_threshold`` and ``n_samples``.
    """
    roc_value = roc_auc_score(y_true, y_proba)
    ap_value = average_precision_score(y_true, y_proba)  # AUC-PR

    # F1 at every point of the precision-recall curve; the epsilon avoids 0/0
    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    f1_curve = (2 * precision * recall) / (precision + recall + 1e-12)
    peak = int(np.nanargmax(f1_curve))

    # Use the threshold at the F1 peak; fall back to 0.5 if the peak index
    # lies beyond the thresholds array.
    if peak < len(thresholds):
        chosen_threshold = float(thresholds[peak])
    else:
        chosen_threshold = 0.5

    summary = {"name": name}
    summary["roc_auc"] = float(roc_value)
    summary["auc_pr"] = float(ap_value)
    summary["best_f1"] = float(np.nanmax(f1_curve))
    summary["best_threshold"] = chosen_threshold
    summary["n_samples"] = int(len(y_true))
    return summary

## 05. Baseline Logistic Regression (v2 tuned)

**Purpose**: Use a stronger, still-fast baseline per checks — elasticnet penalty with SAGA and balanced weights.

In [5]:
# Elastic-net logistic regression; hyper-parameters are kept in one mapping for easy scanning
logreg_params = dict(
    solver="saga",
    penalty="elasticnet",
    l1_ratio=0.5,           # from checks: compact, robust to multicollinearity
    C=1.0,                  # keep moderate regularization
    max_iter=2000,
    class_weight="balanced",
    n_jobs=-1,
    random_state=SEED,
)

logreg = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False)),  # scale only, no centering (original note: safe for sparse-like inputs)
    ("clf", LogisticRegression(**logreg_params)),
])

# Fit on the training split and time it
t0 = time.time()
logreg.fit(X_train, y_train)
train_time = time.time() - t0

# Positive-class probabilities on the validation split, then the shared metrics helper
y_valid_proba = logreg.predict_proba(X_valid)[:, 1]
metrics = evaluate_binary(y_valid, y_valid_proba, name="logreg_elasticnet_v2")
metrics.update(train_time_sec=round(train_time, 2), n_features=len(feature_cols))

logreg_summary = (
    "Baseline (LogReg v2): "
    f"ROC-AUC={metrics['roc_auc']:.4f} | "
    f"AUC-PR={metrics['auc_pr']:.4f} | "
    f"bestF1={metrics['best_f1']:.4f} @ thr={metrics['best_threshold']:.3f}"
)
print(logreg_summary)



Baseline (LogReg v2): ROC-AUC=0.6269 | AUC-PR=0.0592 | bestF1=0.1045 @ thr=0.589


## 06. Persist metrics & predictions

**Purpose**: Save baseline metrics as JSON and validation predictions as CSV for reporting and further analysis.

In [6]:
valid_pred_path = REPORTS_DIR / "baseline_logreg_valid_pred_v2.csv"

# Metrics -> JSON report
with BASELINE_REPORT.open("w", encoding="utf-8") as report_file:
    json.dump(metrics, report_file, ensure_ascii=False, indent=2)

# Validation labels and predicted probabilities -> CSV (no index column)
valid_pred_frame = pd.DataFrame({
    "y_true": y_valid.values.astype(int),
    "y_proba": y_valid_proba.astype(np.float32),
})
valid_pred_frame.to_csv(valid_pred_path, index=False)

for saved_path in (BASELINE_REPORT, valid_pred_path):
    print("Saved:", saved_path)

Saved: D:\final_v2\credit-risk-management\reports\baseline_logreg_metrics_v2.json
Saved: D:\final_v2\credit-risk-management\reports\baseline_logreg_valid_pred_v2.csv


## 07. Baseline Random Forest

**Purpose**: Train a tree-based baseline model (Random Forest) without scaling, evaluate it consistently, and persist metrics and validation predictions.

In [6]:
# Forest hyper-parameters gathered in one mapping so they are easy to scan and tweak
rf_params = dict(
    n_estimators=600,          # a bit larger forest for stability
    max_depth=12,              # curb overfitting; enough to capture nonlinearity
    min_samples_split=5,       # more conservative splits
    min_samples_leaf=2,        # reduce variance on rare positives
    max_features="sqrt",       # strong default for classification
    bootstrap=True,
    class_weight="balanced",   # handle ~3.5% positive class
    n_jobs=-1,
    random_state=SEED,
)

rf = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),  # trees don't need scaling, but handle NaNs
    ("clf", RandomForestClassifier(**rf_params)),
])

# Fit on the training split and time it
fit_started = time.time()
rf.fit(X_train, y_train)
rf_train_time = time.time() - fit_started

# Positive-class probabilities on the validation split, scored with the shared helper
rf_valid_proba = rf.predict_proba(X_valid)[:, 1]
rf_metrics = evaluate_binary(y_valid, rf_valid_proba, name="rf_baseline_v2")
rf_metrics.update(train_time_sec=round(rf_train_time, 2), n_features=len(feature_cols))

rf_summary = (
    "Baseline (RF v2): "
    f"ROC-AUC={rf_metrics['roc_auc']:.4f} | "
    f"AUC-PR={rf_metrics['auc_pr']:.4f} | "
    f"bestF1={rf_metrics['best_f1']:.4f} @ thr={rf_metrics['best_threshold']:.3f}"
)
print(rf_summary)

# Persist RF artifacts (v2 filenames to avoid overwriting v1)
rf_report_path = REPORTS_DIR / "baseline_v2_rf_metrics.json"
rf_valid_pred_path = REPORTS_DIR / "baseline_v2_rf_valid_pred.csv"

with rf_report_path.open("w", encoding="utf-8") as report_file:
    json.dump(rf_metrics, report_file, ensure_ascii=False, indent=2)

rf_pred_frame = pd.DataFrame({
    "y_true": y_valid.values.astype(int),
    "y_proba": rf_valid_proba.astype(np.float32),
})
rf_pred_frame.to_csv(rf_valid_pred_path, index=False)

for saved_path in (rf_report_path, rf_valid_pred_path):
    print("Saved:", saved_path)

Baseline (RF v2): ROC-AUC=0.6639 | AUC-PR=0.0707 | bestF1=0.1269 @ thr=0.609
Saved: D:\final_v2\credit-risk-management\reports\baseline_v2_rf_metrics.json
Saved: D:\final_v2\credit-risk-management\reports\baseline_v2_rf_valid_pred.csv


## Final Summary (Baseline Modeling)

**Goal.**  
Establish baseline performance for credit-risk prediction using simple, transparent models.

**Data & Validation.**  
- Dataset: final assembled dataset from stage 03.x (merged on `id + rn`).  
- Validation: stratified 80/20 holdout (train/validation split, fixed random seed `SEED = 42`).  
- Metrics: ROC-AUC (main project metric) and PR-AUC (sensitive to class imbalance).

### Results

| Model | ROC-AUC (valid) | PR-AUC (valid) | Comment |
|--------|----------------|---------------|----------|
| Logistic Regression | 0.627 | 0.059 | baseline linear model |
| Random Forest | 0.664 | 0.071 | simple tree baseline |

> Both baseline models perform below the target ROC-AUC ≥ 0.75, which defines the motivation for moving to advanced boosting models.

**Interpretation.**  
- Logistic regression provides a transparent linear reference.  
- Random Forest slightly improves quality but remains below the required threshold.  
- Validation procedure is consistent and reproducible; no data leakage observed.  
- The results serve as a realistic lower bound for subsequent experiments.

**Conclusion.**  
- Baseline stage successfully defines the quality benchmark (≈ 0.63–0.66 ROC-AUC on the validation split).  
- Further improvement will come from tuned gradient-boosting models (LightGBM / XGBoost).  
- This notebook fully satisfies the roadmap’s *Stage 4 — Modeling (Baselines)* requirements.  

**Next step → `05_modeling_advanced.ipynb`**  
- Introduce LightGBM / XGBoost with early stopping and hyperparameter tuning.  
- Track both ROC-AUC and PR-AUC.  
- Prepare a consolidated comparison table to continue into the experiment stage.