# Logistic Regression & Calibration — Instructor (Solutions + Rationale)

In [None]:
import numpy as np

def check(name: str, cond: bool):
    """Report a named check: print ``OK`` when it holds, raise otherwise.

    Args:
        name: Label included in the printed / raised message.
        cond: Outcome of the check; any truthy value counts as a pass.

    Raises:
        AssertionError: If ``cond`` is falsy.
    """
    if cond:
        print(f'OK: {name}')
        return
    raise AssertionError(f'Failed: {name}')

# Shared random generator with a fixed seed (0). make_probs and the later
# validation/test split both draw from it, so results depend on call order.
rng = np.random.default_rng(0)

In [None]:
def make_probs(n=5000, base_rate=0.05, logit_scale=1.0, miscalibration=1.0):
    """Simulate binary labels together with a (mis)calibrated model score.

    True logits are Gaussian noise scaled by ``logit_scale`` and shifted by
    ``logit(base_rate)``. Labels are Bernoulli draws from the sigmoid of
    those logits. The "model" sees the same logits multiplied by
    ``miscalibration``, so any factor other than 1.0 distorts the
    probabilities while keeping their ordering.

    Args:
        n: Number of samples to draw.
        base_rate: Probability whose logit is used as the offset.
        logit_scale: Standard deviation of the logit noise.
        miscalibration: Multiplier applied to the true logits for the model.

    Returns:
        Tuple ``(y, p_model, p_true)``: integer 0/1 labels, the model's
        probabilities, and the true probabilities.

    Note:
        Draws from the module-level ``rng`` (normal noise first, then the
        uniform draws used for the labels).
    """
    def sigmoid(logits):
        return 1/(1+np.exp(-logits))

    noise = rng.standard_normal(n)
    true_logits = logit_scale * noise + np.log(base_rate/(1-base_rate))
    p_true = sigmoid(true_logits)
    y = (rng.random(n) < p_true).astype(int)
    p_model = sigmoid(miscalibration * true_logits)
    return y, p_model, p_true

# miscalibration=2.0 doubles the model's logits relative to the true ones,
# pushing its probabilities toward 0 and 1 (an overconfident model).
y, p_model, p_true = make_probs(miscalibration=2.0)
# Printed under the label 'base_rate', but this is the realized positive rate of the sample.
print('base_rate', y.mean())

In [None]:
def metrics_at_threshold(y, p, t):
    """Confusion counts and precision / recall / F1 at one decision threshold.

    A sample is predicted positive when ``p >= t``.

    Args:
        y: NumPy array of labels; cast to int, with 1 = positive, 0 = negative.
        p: NumPy array of predicted scores, aligned with ``y``.
        t: Decision threshold.

    Returns:
        Dict with keys ``tp``, ``fp``, ``fn`` (ints) and ``precision``,
        ``recall``, ``f1`` (floats).
    """
    eps = 1e-12  # keeps the ratios finite when a denominator would be zero
    labels = y.astype(int)
    is_pos = labels == 1
    is_neg = labels == 0
    flagged = p >= t
    not_flagged = np.logical_not(flagged)

    tp = int(np.count_nonzero(flagged & is_pos))
    fp = int(np.count_nonzero(flagged & is_neg))
    fn = int(np.count_nonzero(not_flagged & is_pos))

    prec = tp / (tp + fp + eps)
    rec = tp / (tp + fn + eps)
    f1 = 2*prec*rec / (prec + rec + eps)
    return {
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'precision': prec,
        'recall': rec,
        'f1': f1,
    }

# Evaluate the uncalibrated model scores at a 0.5 cut-off.
print(metrics_at_threshold(y, p_model, 0.5))
# Rationale: under class imbalance, precision and recall capture the trade-off better than accuracy.

In [None]:
def pr_curve(y, p):
    """Precision-recall points obtained by sweeping down the ranked scores.

    Samples are ordered by descending score; the k-th point treats the top
    k samples as predicted positive.

    Args:
        y: NumPy array of labels (1 = positive, 0 = negative).
        p: NumPy array of scores, aligned with ``y``.

    Returns:
        Tuple ``(recall, precision)`` with one entry per sample.
    """
    ranked = y[np.argsort(-p)]
    hits = np.cumsum(ranked == 1)
    false_alarms = np.cumsum(ranked == 0)
    precision = hits / (hits + false_alarms + 1e-12)
    # hits[-1] is the total number of positives.
    recall = hits / (hits[-1] + 1e-12)
    return recall, precision

def auc_trapz(x, y):
    """Area under the curve ``y(x)`` by the trapezoidal rule.

    Args:
        x: Sample positions; assumed to be increasing.
        y: Curve values at ``x``.

    Returns:
        The integral as a Python float.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # prefer the new name and fall back only on older NumPy releases.
    integrate = getattr(np, 'trapezoid', None) or np.trapz
    return float(integrate(y, x))

# PR curve of the uncalibrated scores and its trapezoidal area.
# Note: the curve's first point is the top-ranked sample; no point at recall 0 is prepended.
rec, prec = pr_curve(y, p_model)
pr_auc = auc_trapz(rec, prec)
print('pr_auc', pr_auc)
# Rationale: the PR curve is more sensitive to minority-class performance.

In [None]:
def reliability_bins(y, p, n_bins=10):
    """Per-bin statistics for a reliability diagram.

    Scores are grouped into ``n_bins`` equal-width bins over [0, 1].

    Args:
        y: NumPy array of labels; cast to int.
        p: NumPy array of predicted probabilities, aligned with ``y``.
        n_bins: Number of equal-width bins.

    Returns:
        Tuple ``(bin_acc, bin_conf, bin_frac)`` of length-``n_bins`` arrays:
        mean label, mean predicted probability, and fraction of all samples
        in each bin. Empty bins keep zeros in all three arrays.
    """
    labels = y.astype(int)
    boundaries = np.linspace(0, 1, n_bins + 1)
    # Only the interior boundaries are passed, so indices run 0..n_bins-1
    # and a score of exactly 1.0 falls into the last bin.
    bin_index = np.digitize(p, boundaries[1:-1], right=False)

    bin_acc, bin_conf, bin_frac = (np.zeros(n_bins) for _ in range(3))
    for k in range(n_bins):
        in_bin = bin_index == k
        if not in_bin.any():
            continue
        bin_acc[k] = labels[in_bin].mean()
        bin_conf[k] = p[in_bin].mean()
        bin_frac[k] = in_bin.mean()
    return bin_acc, bin_conf, bin_frac

def ece(bin_acc, bin_conf, bin_frac):
    """Expected calibration error from per-bin reliability statistics.

    Args:
        bin_acc: Mean label per bin.
        bin_conf: Mean predicted probability per bin.
        bin_frac: Fraction of samples per bin (the weights).

    Returns:
        Weighted sum of ``|bin_acc - bin_conf|`` as a Python float.
    """
    gap = np.abs(bin_acc - bin_conf)
    weighted_gap = bin_frac * gap
    return float(np.sum(weighted_gap))

# ECE of the uncalibrated scores over all samples, using 10 bins.
ECE = ece(*reliability_bins(y, p_model, 10))
print('ECE', ECE)
# Rationale: ECE measures the calibration gap; ranking can be good even when calibration is poor.

In [None]:
def logit(p, eps=1e-12):
    """Log-odds of ``p``, with ``p`` clipped away from 0 and 1.

    Args:
        p: Probability or array of probabilities.
        eps: Clipping margin; ``p`` is limited to ``[eps, 1 - eps]`` so the
            result stays finite.

    Returns:
        ``log(p / (1 - p))`` evaluated on the clipped values.
    """
    clipped = np.clip(p, eps, 1 - eps)
    odds = clipped / (1 - clipped)
    return np.log(odds)

def nll(y, p, eps=1e-12):
    """Mean binary negative log-likelihood (log loss).

    Args:
        y: Array of 0/1 labels.
        p: Predicted probabilities of the positive class, aligned with ``y``.
        eps: Clipping margin applied to ``p`` so the logarithms stay finite.

    Returns:
        The average negative log-likelihood as a Python float.
    """
    clipped = np.clip(p, eps, 1 - eps)
    log_lik = y*np.log(clipped) + (1 - y)*np.log(1 - clipped)
    return float(-np.mean(log_lik))

# Randomly split the samples into a validation half (used to fit T) and a
# test half (used only for evaluation).
idx = rng.permutation(len(y))
val = idx[: len(y)//2]
test = idx[len(y)//2:]

# Recover the model's logits from its predicted probabilities.
z = logit(p_model)

# Grid-search the temperature T that minimizes validation NLL of sigmoid(z / T).
Ts = np.linspace(0.5, 5.0, 50)
best_T = None
best_loss = float('inf')
for T in Ts:
    pT = 1/(1+np.exp(-(z[val]/T)))
    L = nll(y[val], pT)
    # Strict '<' keeps the first (smallest) T among equal losses.
    if L < best_loss:
        best_loss = L
        best_T = T

# Apply the selected temperature to the test logits and compare ECE before/after on the test half.
p_cal = 1/(1+np.exp(-(z[test]/best_T)))
ECE_before = ece(*reliability_bins(y[test], p_model[test], 10))
ECE_after = ece(*reliability_bins(y[test], p_cal, 10))
print('best_T', best_T)
print('ECE_before', ECE_before, 'ECE_after', ECE_after)
# Rationale: temperature scaling divides the logits by a single positive constant, which
# improves calibration while preserving the ranking of the scores.

In [None]:
def best_threshold_cost(y, p, c_fp=1.0, c_fn=10.0):
    """Find the decision threshold with the lowest total misclassification cost.

    Scans 501 evenly spaced thresholds in [0, 1]; a sample is predicted
    positive when ``p >= t``. The cost of a threshold is
    ``c_fp * (#false positives) + c_fn * (#false negatives)``.

    Args:
        y: Labels (1 = positive, 0 = negative); any array-like.
        p: Predicted scores aligned with ``y``; any array-like.
        c_fp: Cost of one false positive.
        c_fn: Cost of one false negative.

    Returns:
        Tuple ``(best_t, best_cost)`` of Python floats. On ties the lowest
        threshold wins. If no threshold has a cost below infinity, the
        initial ``(0.5, inf)`` is returned.
    """
    # Coerce to arrays: with a plain list, ``y == 0`` collapses to a single
    # False, and every threshold would silently score a cost of zero.
    y = np.asarray(y)
    p = np.asarray(p)
    is_neg = y == 0
    is_pos = y == 1

    best_t = 0.5
    best_cost = float('inf')
    for t in np.linspace(0, 1, 501):
        flagged = p >= t
        fp = np.sum(flagged & is_neg)
        fn = np.sum(~flagged & is_pos)
        cost = c_fp*fp + c_fn*fn
        # Strict '<' keeps the first (lowest) threshold among equal costs.
        if cost < best_cost:
            best_cost = cost
            best_t = float(t)
    return best_t, float(best_cost)

# Cost-optimal threshold for the uncalibrated scores, with a false negative costing 10x a false positive.
# Note: the threshold is chosen and its cost reported on the same full sample (no held-out split).
t_star, cost_star = best_threshold_cost(y, p_model, c_fp=1.0, c_fn=10.0)
print('t*', t_star, 'cost', cost_star)
# Rationale: thresholding is a decision problem; optimize expected cost, not accuracy.