## Bayesian Binning into Quantiles (BBQ) Calibration Method, OVERVIEW

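Bayesian Binning into Quantiles (BBQ) is a post-hoc calibration method for binary classifiers: it builds several equal-frequency (quantile) histogram-binning models with different numbers of bins and combines their calibrated outputs by Bayesian model averaging. This notebook applies it one-vs-rest, with one calibrator per class, to a 10-class CIFAR-10 classifier.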

## 2.1 Load the model

For simplicity, we use the ImageNet-pretrained ResNet18 model from torchvision and replace its final fully connected layer with a new 10-class head.

In [3]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchvision
from torchvision.models import ResNet18_Weights
import torchvision.transforms as T
from tqdm import tqdm

# Pick the compute device in priority order: CUDA GPU, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Start from ImageNet-pretrained weights and swap in a fresh 10-way head.
# The head's input width is read from the layer it replaces rather than
# hard-coded, so this keeps working if the backbone is changed.
net = torchvision.models.resnet18(weights=ResNet18_Weights.IMAGENET1K_V1)
net.fc = nn.Linear(net.fc.in_features, 10)
# A single transfer moves the backbone and the new head together.
net = net.to(device)

Using device: cpu


## 2.2 Load the data

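Here we load the CIFAR10 dataset from torchvision. The official training set is randomly split 80/20 into a training part and a held-out calibration part; the official test set is kept for evaluation.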

In [4]:
# ToTensor is the only transform applied to the images.
transforms = T.Compose([T.ToTensor()])

cifar_root = "~/datasets"

# Official training set, split 80/20 into a training part and a calibration part.
train = torchvision.datasets.CIFAR10(cifar_root, train=True, transform=transforms, download=True)
train, cal = torch.utils.data.random_split(train, [0.8, 0.2])

# Official test set, used only for evaluation.
test = torchvision.datasets.CIFAR10(cifar_root, train=False, transform=transforms, download=True)

# All loaders share one batch size; only the test loader keeps a fixed order.
batch_size = 256
train_loader = DataLoader(train, batch_size, shuffle=True)
cal_loader = DataLoader(cal, batch_size, shuffle=True)
test_loader = DataLoader(test, batch_size, shuffle=False)

## 2.3 Train the model

Next, we fine-tune the model on the training split for a few epochs.

In [None]:
# Fine-tune all parameters with Adam (default settings) and cross-entropy loss.
epochs = 5
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())

for epoch in tqdm(range(epochs)):
    net.train()  # put the model in training mode for this epoch
    running_loss = 0.0  # sum of per-batch losses in this epoch
    for inputs, targets in train_loader:
        # Move the batch to the model's device before the forward pass.
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        loss = criterion(net(inputs), targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch + 1}, Running loss: {running_loss / len(train_loader)}")

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# demo_bbq_calibration.py
import importlib

import numpy as np
import torch

import probly.calibration.bayesian_binning.torch
from probly.calibration.bayesian_binning.torch import BayesianBinningQuantiles
from probly.evaluation.metrics import brier_score, expected_calibration_error

# Reload the calibration module so edits to its source are picked up without
# restarting the kernel. A name bound with `from module import X` keeps pointing
# at the object created before the reload, so rebind the class from the
# reloaded module; otherwise the stale class would still be instantiated below.
importlib.reload(probly.calibration.bayesian_binning.torch)
BayesianBinningQuantiles = probly.calibration.bayesian_binning.torch.BayesianBinningQuantiles

# Use the same device priority as when the model was created (CUDA, then MPS,
# then CPU) so the model is not silently moved off an Apple GPU for inference.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# assume net, cal_loader, test_loader are defined elsewhere
net.to(device)
net.eval()  # inference mode for the calibration and test passes

# --- Step 1: run the calibration split through the network and gather
#             logits and targets batch by batch (no gradients needed)
cal_logits_list = []
cal_targets_list = []
with torch.no_grad():
    for inputs, targets in cal_loader:
        outputs = net(inputs.to(device))        # forward pass on the model's device
        cal_logits_list.append(outputs.cpu())   # keep collected logits on CPU
        cal_targets_list.append(targets)        # targets are used as yielded by the loader

# Stitch the per-batch pieces together along the sample axis.
cal_logits = torch.cat(cal_logits_list)
cal_targets = torch.cat(cal_targets_list)

# --- Step 2: logits -> per-class probabilities (softmax over the class axis)
cal_probs = cal_logits.softmax(dim=1)

# --- Step 3: one slot per class; a slot left as None means
#             "no usable calibrator for this class"
n_classes = cal_probs.size(1)
bbq_calibrators = [None for _ in range(n_classes)]

# --- Step 4: fit each class one-vs-rest on its own probability column
for class_idx in range(n_classes):
    binary_labels = (cal_targets == class_idx).long()   # 1 where the target is this class
    class_probs = cal_probs[:, class_idx]               # this class's probability column

    # Diagnostics for the data handed to the calibrator.
    print(f"Class {class_idx}: probs shape={class_probs.shape}, labels shape={binary_labels.shape}")
    print(f" probs dtype={class_probs.dtype}, labels dtype={binary_labels.dtype}")
    print(f" probs range=[{class_probs.min():.4f}, {class_probs.max():.4f}]")
    print(f" labels sum={int(binary_labels.sum())} out of {len(binary_labels)}")

    # With fewer than 2 positives the class is skipped and its slot stays None.
    if int(binary_labels.sum()) < 2:
        print(f" WARNING: Class {class_idx} has too few positive samples, skipping calibration")
        continue

    calibrator = BayesianBinningQuantiles(max_bins=10)
    try:
        calibrator.fit(class_probs, binary_labels)
    except Exception as e:
        # Best effort: report the failure and leave the slot as None.
        print(f" ERROR fitting class {class_idx}: {e}")
    else:
        # Keep the calibrator only if it reports itself as fitted.
        if getattr(calibrator, "is_fitted", False):
            bbq_calibrators[class_idx] = calibrator

# --- Step 5: run the test split through the network and gather
#             logits and targets batch by batch (no gradients needed)
test_logits_list = []
test_targets_list = []
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = net(inputs.to(device))         # forward pass on the model's device
        test_logits_list.append(outputs.cpu())   # keep collected logits on CPU
        test_targets_list.append(targets)

# Stitch the per-batch pieces together along the sample axis.
test_logits = torch.cat(test_logits_list)
test_targets = torch.cat(test_targets_list)

# --- Step 6: logits -> per-class probabilities (softmax over the class axis)
test_probs = test_logits.softmax(dim=1)

# --- Step 7: calibrate each class column independently
calibrated_probs = torch.zeros_like(test_probs, dtype=torch.float32)

for class_idx, calibrator in enumerate(bbq_calibrators):
    class_probs = test_probs[:, class_idx]   # this class's probability column

    usable = calibrator is not None and getattr(calibrator, "is_fitted", False)
    if usable:
        try:
            # Write the calibrated column; on success move on to the next class.
            calibrated_probs[:, class_idx] = calibrator.predict(class_probs)
            continue
        except Exception as e:
            print(f" WARNING: prediction failed for class {class_idx}: {e}")

    # No usable calibrator, or prediction failed: keep the uncalibrated column.
    calibrated_probs[:, class_idx] = class_probs

# --- Step 8: renormalize rows so each sample's class scores sum to 1
#             (the columns were calibrated independently, so they need not)
row_sums = calibrated_probs.sum(dim=1, keepdim=True)

# Squeeze only the class axis: the mask then always has one flag per sample,
# even for a single-row batch, where a bare .squeeze() would collapse it to 0-d.
zero_rows = row_sums.squeeze(dim=1) == 0
if zero_rows.any():
    # fallback: use original test_probs for zero-sum rows to avoid dividing by zero
    calibrated_probs[zero_rows] = test_probs[zero_rows]
    row_sums = calibrated_probs.sum(dim=1, keepdim=True)

calibrated_probs = calibrated_probs / row_sums

# --- Step 9: evaluate; the metric helpers are called with NumPy arrays
test_targets_np = test_targets.numpy()
calibrated_probs_np = calibrated_probs.numpy()

# Top-1 accuracy: fraction of samples whose highest-probability class is the target.
predicted_np = calibrated_probs_np.argmax(axis=1)
accuracy = np.mean(predicted_np == test_targets_np)

ece = expected_calibration_error(calibrated_probs_np, test_targets_np)
brier = brier_score(calibrated_probs_np, test_targets_np)

print(f"\nAccuracy after BBQ calibration: {accuracy:.4f}")
print(f"ECE after BBQ calibration: {ece:.4f}")
print(f"Brier score after BBQ calibration: {brier:.4f}")


NameError: name 'net' is not defined