In [20]:
import sys, os
sys.path.append(os.path.abspath(".."))

In [21]:
# =========================
# From-scratch WEIGHTED Regression Tree
# (only stdlib + NumPy inside the method)
# =========================
import numpy as np
from dataclasses import dataclass
from typing import Optional, Tuple

# -------- primary metric for selection/reporting --------
def wmae(y, yhat, w):
    w = np.asarray(w, float)
    return (np.abs(y - yhat) * w).sum() / w.sum()

# -------- small helpers --------
def wmean(y, w):
    w = np.asarray(w, float)
    sw = w.sum()
    return (y * w).sum() / sw if sw > 0 else 0.0

def leaf_sse(y, w):
    mu = wmean(y, w)
    return ((y - mu) ** 2 * w).sum()

def wrmse(y, yhat, w):
    w = np.asarray(w, float)
    return np.sqrt(((y - yhat)**2 * w).sum() / w.sum())

def weighted_r2(y, yhat, w):
    w = np.asarray(w, float)
    ybar = (y * w).sum() / w.sum()
    ss_res = ((y - yhat)**2 * w).sum()
    ss_tot = ((y - ybar)**2 * w).sum()
    return 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0

def poisson_deviance(y_counts, exposure, yhat_rate):
    """
    y_counts: observed claim counts (can be reconstructed as y_rate * exposure)
    exposure: exposure weights (>=0)
    yhat_rate: predicted claim rate
    """
    y = np.asarray(y_counts, float)
    exp = np.asarray(exposure, float)
    lam = np.clip(yhat_rate, 1e-12, None) * exp  # Poisson mean = rate * exposure
    term = np.where(y > 0, y * np.log(y / lam), 0.0) - (y - lam)
    return 2.0 * term.sum()

@dataclass
class Node:
    feature: Optional[int] = None
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    value: Optional[float] = None  # prediction at leaf

    def is_leaf(self) -> bool:
        return self.value is not None

class DecisionTreeRegressorScratch:
    """
    Simple exposure-weighted regression tree for rates.
    - Splits minimize weighted SSE (sum of leaf SSEs).
    - Leaf prediction = exposure-weighted mean of y in the region.
    - Pre-pruning via max_depth and min_leaf_weight (exposure units).
    """
    def __init__(self, max_depth: Optional[int] = None, min_leaf_weight: float = 5.0):
        self.max_depth = max_depth
        self.min_leaf_weight = float(min_leaf_weight)
        self.root: Optional[Node] = None

    def fit(self, X: np.ndarray, y: np.ndarray, w: np.ndarray):
        X = np.asarray(X, float)
        y = np.asarray(y, float)
        w = np.asarray(w, float)
        self.root = self._build_tree(X, y, w, np.arange(X.shape[0]), depth=0)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        X = np.asarray(X, float)
        return np.array([self._traverse(x, self.root) for x in X], float)

    # ----- internal: build tree -----
    def _build_tree(self, X, y, w, idx, depth) -> Node:
        # stopping rules
        if (self.max_depth is not None and depth >= self.max_depth) or \
           (w[idx].sum() < 2 * self.min_leaf_weight) or \
           np.allclose(y[idx], y[idx][0]):
            return Node(value=wmean(y[idx], w[idx]))

        feat, thr, L, R = self._best_split(X, y, w, idx)
        if feat is None:
            return Node(value=wmean(y[idx], w[idx]))

        left = self._build_tree(X, y, w, L, depth+1)
        right = self._build_tree(X, y, w, R, depth+1)
        return Node(feature=feat, threshold=thr, left=left, right=right)

    # ----- internal: best split -----
    def _best_split(self, X, y, w, idx) -> Tuple[Optional[int], Optional[float], Optional[np.ndarray], Optional[np.ndarray]]:
        best = (None, None, None, None, np.inf)
        n, d = X.shape
        for j in range(d):
            xj = X[idx, j]
            uniq = np.unique(xj)
            if uniq.size <= 1:
                continue
            # thresholds: midpoints; for one-hot 0/1, just 0.5
            if uniq.size == 2 and uniq.min() == 0.0 and uniq.max() == 1.0:
                candidates = [0.5]
            else:
                u = np.unique(np.sort(xj))
                candidates = (u[:-1] + u[1:]) / 2.0

            for t in candidates:
                Lmask = xj <= t
                if not Lmask.any() or Lmask.all():
                    continue
                L = idx[Lmask]; R = idx[~Lmask]
                # exposure-weighted minimum leaf size
                if w[L].sum() < self.min_leaf_weight or w[R].sum() < self.min_leaf_weight:
                    continue
                sse = leaf_sse(y[L], w[L]) + leaf_sse(y[R], w[R])
                if sse < best[4]:
                    best = (j, t, L, R, sse)

        return best[0], best[1], best[2], best[3]

    # ----- internal: predict one -----
    def _traverse(self, x, node: Node) -> float:
        while not node.is_leaf():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value


In [22]:
import pandas as pd
from preprocessing.preprocessing_utils import preprocess_for_tree

train = pd.read_csv("../data/claims_train.csv")
test  = pd.read_csv("../data/claims_test.csv")

X_tr, y_tr_rate, w_tr = preprocess_for_tree(train)
X_te, y_te_rate, w_te = preprocess_for_tree(test)

# Reconstruct counts for Poisson deviance (aligned with X rows)
y_tr_cnt = (y_tr_rate * w_tr).to_numpy(float)
y_te_cnt = (y_te_rate * w_te).to_numpy(float)

# Convert to NumPy
X_np = X_tr.values.astype(float)
y_np = y_tr_rate.values.astype(float)
w_np = w_tr.values.astype(float)

# Simple train/val split
rng = np.random.default_rng(42)
idx = np.arange(len(y_np))
rng.shuffle(idx)
cut = int(0.8 * len(idx))
tr_idx, va_idx = idx[:cut], idx[cut:]

X_tr_np, y_tr_np, w_tr_np = X_np[tr_idx], y_np[tr_idx], w_np[tr_idx]
X_va_np, y_va_np, w_va_np = X_np[va_idx], y_np[va_idx], w_np[va_idx]
y_tr_cnt_np, y_va_cnt_np = y_tr_cnt[tr_idx], y_tr_cnt[va_idx]  # counts for deviance

# =============================
# Tiny CV grid (WMAE for selection)
# =============================
depth_grid = [9, 11, 13, 15, 17, 19, 21]
leafw_grid = [7.0, 9.0, 11.0, 13.0, 15.0, 20.0, 50.0]

best = None
for d in depth_grid:
    for m in leafw_grid:
        tree = DecisionTreeRegressorScratch(max_depth=d, min_leaf_weight=m).fit(X_tr_np, y_tr_np, w_tr_np)
        yhat_va = tree.predict(X_va_np)
        score = wmae(y_va_np, yhat_va, w_va_np)
        if (best is None) or (score < best["wmae"]):
            best = {"max_depth": d, "min_leaf_weight": m, "wmae": score, "model": tree, "yhat_va": yhat_va}

print("Chosen caps:", {k: best[k] for k in ["max_depth","min_leaf_weight","wmae"]})

# ---- Validation metrics for the chosen model ----
yhat_va = best["yhat_va"]
print("\nValidation metrics (chosen caps):")
print(" WMAE           :", wmae(y_va_np, yhat_va, w_va_np))
print(" WRMSE          :", wrmse(y_va_np, yhat_va, w_va_np))
print(" Weighted R^2   :", weighted_r2(y_va_np, yhat_va, w_va_np))
print(" Poisson Dev.   :", poisson_deviance(y_va_cnt_np, w_va_np, yhat_va))

# =============================
# Final fit on ALL training data with chosen caps
# =============================
final_tree = DecisionTreeRegressorScratch(
    max_depth=best["max_depth"],
    min_leaf_weight=best["min_leaf_weight"]
).fit(X_np, y_np, w_np)

# =============================
# Test evaluation (if labels exist)
# =============================
if X_te is not None and y_te_rate is not None:
    X_te_np = X_te.values.astype(float)
    y_te_np = y_te_rate.values.astype(float)
    w_te_np = w_te.values.astype(float)
    yhat_te = final_tree.predict(X_te_np)

    print("\nTest metrics:")
    print(" WMAE           :", wmae(y_te_np, yhat_te, w_te_np))
    print(" WRMSE          :", wrmse(y_te_np, yhat_te, w_te_np))
    print(" Weighted R^2   :", weighted_r2(y_te_np, yhat_te, w_te_np))
    # reconstruct counts for test if not already:
    y_te_cnt_np = (y_te_np * w_te_np).astype(float)
    print(" Poisson Dev.   :", poisson_deviance(y_te_cnt_np, w_te_np, yhat_te))

Chosen caps: {'max_depth': 19, 'min_leaf_weight': 20.0, 'wmae': np.float64(0.18151267421363473)}

Validation metrics (chosen caps):
 WMAE           : 0.18151267421363473
 WRMSE          : 0.7420427339046141
 Weighted R^2   : 0.008088772458327065
 Poisson Dev.   : 62097.86846094347


  term = np.where(y > 0, y * np.log(y / lam), 0.0) - (y - lam)
  term = np.where(y > 0, y * np.log(y / lam), 0.0) - (y - lam)



Test metrics:
 WMAE           : 0.18274868062615726
 WRMSE          : 0.7753352886417393
 Weighted R^2   : 0.011416014677926745
 Poisson Dev.   : 74588.3451497409
