In [29]:
import sys, os
sys.path.append(os.path.abspath(".."))

In [30]:
# =========================
# From-scratch WEIGHTED Regression Tree (NumPy only)
# + metrics
# + cost-complexity pruning (α) with CV
# =========================
import numpy as np
from dataclasses import dataclass
from typing import Optional, Tuple, List

# -------- Metrics --------
def wmae(y, yhat, w):
    w = np.asarray(w, float)
    return (np.abs(y - yhat) * w).sum() / w.sum()

def wrmse(y, yhat, w):
    w = np.asarray(w, float)
    return np.sqrt(((y - yhat)**2 * w).sum() / w.sum())

def weighted_r2(y, yhat, w):
    w = np.asarray(w, float)
    ybar = (y * w).sum() / w.sum()
    ss_res = ((y - yhat)**2 * w).sum()
    ss_tot = ((y - ybar)**2 * w).sum()
    return 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

def poisson_deviance(y_counts, exposure, yhat_rate):
    y = np.asarray(y_counts, float)
    exp = np.asarray(exposure, float)
    lam = np.clip(yhat_rate, 1e-12, None) * exp  # Poisson mean = rate * exposure
    term = np.where(y > 0, y * np.log(y / lam), 0.0) - (y - lam)
    return 2.0 * term.sum()

# -------- Helpers --------
def wmean(y, w):
    w = np.asarray(w, float)
    sw = w.sum()
    return (y * w).sum() / sw if sw > 0 else 0.0

def leaf_sse(y, w):
    mu = wmean(y, w)
    return ((y - mu) ** 2 * w).sum()

@dataclass
class Node:
    feature: Optional[int] = None
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    value: Optional[float] = None  # prediction at leaf
    # for pruning bookkeeping
    idx: Optional[np.ndarray] = None
    sse: float = 0.0
    leaves: int = 1

    def is_leaf(self) -> bool:
        return self.value is not None

class DecisionTreeRegressorScratch:
    """
    Exposure-weighted regression tree for claim rate.
      - Split criterion: minimize weighted SSE.
      - Leaf value: exposure-weighted mean.
      - Pre-pruning: max_depth, min_leaf_weight (exposure units).
    Uses only NumPy for training/prediction.
    """
    def __init__(self, max_depth: Optional[int] = None, min_leaf_weight: float = 5.0):
        self.max_depth = max_depth
        self.min_leaf_weight = float(min_leaf_weight)
        self.root: Optional[Node] = None

    def fit(self, X: np.ndarray, y: np.ndarray, w: np.ndarray):
        X = np.asarray(X, float)
        y = np.asarray(y, float)
        w = np.asarray(w, float)
        self.root = self._build_tree(X, y, w, np.arange(X.shape[0], dtype=int), depth=0)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        X = np.asarray(X, float)
        return np.array([self._traverse(x, self.root) for x in X], float)

    # ----- internal: build tree -----
    def _build_tree(self, X, y, w, idx, depth) -> Node:
        # stopping rules
        if (self.max_depth is not None and depth >= self.max_depth) or \
           (w[idx].sum() < 2 * self.min_leaf_weight) or \
           np.allclose(y[idx], y[idx][0]):
            return Node(value=wmean(y[idx], w[idx]), idx=idx.copy())

        feat, thr, L, R = self._best_split(X, y, w, idx)
        if feat is None:
            return Node(value=wmean(y[idx], w[idx]), idx=idx.copy())

        left = self._build_tree(X, y, w, L, depth+1)
        right = self._build_tree(X, y, w, R, depth+1)
        return Node(feature=feat, threshold=thr, left=left, right=right, value=None, idx=idx.copy())

    # ----- internal: best split -----
    def _best_split(self, X, y, w, idx) -> Tuple[Optional[int], Optional[float], Optional[np.ndarray], Optional[np.ndarray]]:
        best = (None, None, None, None, np.inf)
        n, d = X.shape
        for j in range(d):
            xj = X[idx, j]
            uniq = np.unique(xj)
            if uniq.size <= 1:
                continue
            # thresholds: midpoints; for one-hot 0/1, just 0.5
            if uniq.size == 2 and uniq.min() == 0.0 and uniq.max() == 1.0:
                candidates = [0.5]
            else:
                u = np.unique(np.sort(xj))
                candidates = (u[:-1] + u[1:]) / 2.0

            for t in candidates:
                Lmask = xj <= t
                if not Lmask.any() or Lmask.all():
                    continue
                L = idx[Lmask]; R = idx[~Lmask]
                # min exposure per child
                if w[L].sum() < self.min_leaf_weight or w[R].sum() < self.min_leaf_weight:
                    continue
                sse = leaf_sse(y[L], w[L]) + leaf_sse(y[R], w[R])
                if sse < best[4]:
                    best = (j, t, L, R, sse)

        return best[0], best[1], best[2], best[3]

    # ----- internal: predict one -----
    def _traverse(self, x, node: Node) -> float:
        while not node.is_leaf():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

# -------- Pruning utilities (weakest-link CCP) --------
def _compute_stats(node: Node, y, w):
    """Bottom-up subtree SSE and leaf counts (uses stored node.idx)."""
    if node.value is not None:
        node.sse = leaf_sse(y[node.idx], w[node.idx])
        node.leaves = 1
        return node.sse, 1
    sL, lL = _compute_stats(node.left,  y, w)
    sR, lR = _compute_stats(node.right, y, w)
    node.sse = sL + sR
    node.leaves = lL + lR
    return node.sse, node.leaves

def _alpha_of(node: Node, y, w) -> float:
    """α_t = (SSE_if_pruned_to_leaf - SSE_subtree) / (leaves_subtree - 1)."""
    if node.value is not None:
        return np.inf
    # SSE if this node becomes a leaf:
    mu = wmean(y[node.idx], w[node.idx])
    sse_leaf = ((y[node.idx] - mu)**2 * w[node.idx]).sum()
    denom = max(node.leaves - 1, 1e-12)
    return (sse_leaf - node.sse) / denom

def _collect_internal(node: Node) -> List[Node]:
    if node is None or node.value is not None:
        return []
    return [node] + _collect_internal(node.left) + _collect_internal(node.right)

def _clone(node: Node) -> Node:
    if node is None:
        return None
    out = Node(node.feature, node.threshold,
               _clone(node.left), _clone(node.right),
               node.value,
               idx=(None if node.idx is None else node.idx.copy()),
               sse=node.sse, leaves=node.leaves)
    return out

def _prune_once(root: Node, y, w) -> Tuple[Node, float]:
    """Prune all internal nodes with minimal α; return (new_root, α*)."""
    _compute_stats(root, y, w)
    internals = _collect_internal(root)
    if not internals:
        return root, np.inf
    alphas = np.array([_alpha_of(n, y, w) for n in internals])
    alpha_star = float(np.min(alphas))
    tol = 1e-12

    def prune_mark(n: Node):
        if n.value is not None:
            return
        a = _alpha_of(n, y, w)
        if abs(a - alpha_star) <= tol:
            # turn into a leaf
            mu = wmean(y[n.idx], w[n.idx])
            n.feature = n.threshold = None
            n.left = n.right = None
            n.value = float(mu)
            n.sse = leaf_sse(y[n.idx], w[n.idx])
            n.leaves = 1
        else:
            prune_mark(n.left); prune_mark(n.right)

    prune_mark(root)
    _compute_stats(root, y, w)
    return root, alpha_star

def pruning_path(root: Node, X: np.ndarray, y: np.ndarray, w: np.ndarray) -> Tuple[List[Node], List[float]]:
    """
    Build CCP path: sequences of trees and α values starting from unpruned root.
    """
    cur = _clone(root)
    # (node.idx already stored during fit)
    _compute_stats(cur, y, w)
    trees = [_clone(cur)]
    alphas = [0.0]
    while True:
        cur, a = _prune_once(cur, y, w)
        trees.append(_clone(cur))
        alphas.append(a)
        if getattr(cur, "leaves", 1) == 1:
            break
    return trees, alphas

def predict_with_root(X: np.ndarray, root: Node) -> np.ndarray:
    X = np.asarray(X, float)
    def _one(x, node):
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value
    return np.array([_one(X[i], root) for i in range(X.shape[0])], float)

def kfold_indices(n_samples: int, n_splits: int, seed=42, shuffle=True):
    rng = np.random.default_rng(seed)
    idx = np.arange(n_samples, dtype=int)
    if shuffle: rng.shuffle(idx)
    folds = np.array_split(idx, n_splits)
    for i in range(n_splits):
        val = folds[i]
        tr  = np.concatenate([folds[j] for j in range(n_splits) if j != i])
        yield tr, val

def cv_select_alpha(X, y, w, base_params, kfold=5, seed=7):
    """
    Select α by CV (WMAE):
      1) Build global α list from a base tree on ALL data.
      2) For each fold, grow base tree on train fold, build its path,
         and for each global α choose the smallest subtree with α' >= α.
    """
    # global α candidates
    base_all = DecisionTreeRegressorScratch(**base_params).fit(X, y, w)
    path_all, alphas_all = pruning_path(base_all.root, X, y, w)

    scores = np.zeros(len(alphas_all))
    for tr, va in kfold_indices(len(y), kfold, seed=seed):
        base = DecisionTreeRegressorScratch(**base_params).fit(X[tr], y[tr], w[tr])
        t_path, a_path = pruning_path(base.root, X[tr], y[tr], w[tr])

        # for each global α, pick first subtree with α' >= α in this fold
        for k, a in enumerate(alphas_all):
            idxs = [j for j, ap in enumerate(a_path) if ap >= a]
            jstar = idxs[0] if idxs else (len(a_path)-1)
            yhat = predict_with_root(X[va], t_path[jstar])
            scores[k] += wmae(y[va], yhat, w[va])

    scores /= kfold
    kbest = int(np.argmin(scores))
    return {"alpha": float(alphas_all[kbest]),
            "cv_wmae": float(scores[kbest]),
            "alphas": [float(a) for a in alphas_all],
            "cv_curve": scores}


In [31]:
# =============================
# Load + preprocess + tiny pre-pruning CV
# =============================
import pandas as pd
from preprocessing.preprocessing_utils import preprocess_for_tree  # your function

# Load
train = pd.read_csv("../data/claims_train.csv")
test  = pd.read_csv("../data/claims_test.csv")

# Preprocess (no scaling; OHE cats; returns rate & exposure)
X_tr, y_tr_rate, w_tr = preprocess_for_tree(train)
X_te, y_te_rate, w_te = preprocess_for_tree(test)

# Reconstruct counts for Poisson deviance
y_tr_cnt = (y_tr_rate * w_tr).to_numpy(float)
y_te_cnt = (y_te_rate * w_te).to_numpy(float)

# To NumPy
X_np = X_tr.values.astype(float)
y_np = y_tr_rate.values.astype(float)
w_np = w_tr.values.astype(float)

# Simple hold-out split for quick validation in the grid
rng = np.random.default_rng(42)
idx = np.arange(len(y_np))
rng.shuffle(idx)
cut = int(0.8 * len(idx))
tr_idx, va_idx = idx[:cut], idx[cut:]

X_tr_np, y_tr_np, w_tr_np = X_np[tr_idx], y_np[tr_idx], w_np[tr_idx]
X_va_np, y_va_np, w_va_np = X_np[va_idx], y_np[va_idx], w_np[va_idx]
y_va_cnt_np = y_tr_cnt[va_idx]  # counts for Poisson deviance

# ---- Tiny grid over pre-pruning caps (selection by WMAE) ----
depth_grid = [9, 11, 13, 15, 17, 19, 21]
leafw_grid = [7.0, 9.0, 11.0, 13.0, 15.0, 20.0, 50.0]

best = None
for d in depth_grid:
    for m in leafw_grid:
        tree = DecisionTreeRegressorScratch(max_depth=d, min_leaf_weight=m).fit(X_tr_np, y_tr_np, w_tr_np)
        yhat_va = tree.predict(X_va_np)
        score = wmae(y_va_np, yhat_va, w_va_np)
        if (best is None) or (score < best["wmae"]):
            best = {"max_depth": d, "min_leaf_weight": m, "wmae": score, "yhat_va": yhat_va}

print("Chosen pre-pruning caps:", {k: best[k] for k in ["max_depth","min_leaf_weight","wmae"]})

# Report validation metrics for the chosen caps
yhat_va = best["yhat_va"]
print("\nValidation metrics (chosen caps):")
print(" WMAE         :", wmae(y_va_np, yhat_va, w_va_np))
print(" WRMSE        :", wrmse(y_va_np, yhat_va, w_va_np))
print(" Weighted R^2 :", weighted_r2(y_va_np, yhat_va, w_va_np))
print(" Poisson Dev. :", poisson_deviance(y_va_cnt_np, w_va_np, yhat_va))


Chosen pre-pruning caps: {'max_depth': 19, 'min_leaf_weight': 20.0, 'wmae': np.float64(0.18151267421363473)}

Validation metrics (chosen caps):
 WMAE         : 0.18151267421363473
 WRMSE        : 0.7420427339046141
 Weighted R^2 : 0.008088772458327065
 Poisson Dev. : 62097.86846094347


  term = np.where(y > 0, y * np.log(y / lam), 0.0) - (y - lam)
  term = np.where(y > 0, y * np.log(y / lam), 0.0) - (y - lam)


In [32]:
# =============================
# Post-pruning via cost-complexity (α) + final eval
# =============================
# Fit base tree on ALL training with chosen caps
base_params = {"max_depth": best["max_depth"], "min_leaf_weight": best["min_leaf_weight"]}
base_full = DecisionTreeRegressorScratch(**base_params).fit(X_np, y_np, w_np)

# Build pruning path on full train to get α candidates (and trees)
path_trees, path_alphas = pruning_path(base_full.root, X_np, y_np, w_np)

# Cross-validate to select α (selection by WMAE)
alpha_sel = cv_select_alpha(X_np, y_np, w_np, base_params=base_params, kfold=5, seed=7)
alpha_star = alpha_sel["alpha"]
print("Chosen α:", alpha_star, "| CV WMAE:", alpha_sel["cv_wmae"])

# Pick the smallest subtree with α' >= α*
idx_candidates = [i for i, a in enumerate(path_alphas) if a >= alpha_star]
kstar = idx_candidates[0] if idx_candidates else (len(path_alphas) - 1)
final_root = path_trees[kstar]

# ---- Test evaluation ----
X_te_np = X_te.values.astype(float)
y_te_np = y_te_rate.values.astype(float)
w_te_np = w_te.values.astype(float)
yhat_te = predict_with_root(X_te_np, final_root)

print("\nTest metrics (final pruned tree):")
print(" WMAE         :", wmae(y_te_np, yhat_te, w_te_np))
print(" WRMSE        :", wrmse(y_te_np, yhat_te, w_te_np))
print(" Weighted R^2 :", weighted_r2(y_te_np, yhat_te, w_te_np))
y_te_cnt_np = (y_te_np * w_te_np).astype(float)
print(" Poisson Dev. :", poisson_deviance(y_te_cnt_np, w_te_np, yhat_te))


MemoryError: Unable to allocate 1.43 MiB for an array with shape (187280,) and data type int64