In [1]:
import torch, json
from src.config import CFG
from src.data import make_loaders
from src.model import build_model
from src.calibration import fit_temperature
from src.metrics import evaluate, expected_calibration_error

# Shared run configuration; it drives both data loading and the model choice.
cfg = CFG()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The three loaders are unpacked in train / val / test order.
train_loader, val_loader, test_loader = make_loaders(cfg)

# Rebuild the architecture, then restore the best checkpoint (e.g., groupdro).
# NOTE(review): torch.load unpickles the file, so only point this at a
# checkpoint you trust (consider weights_only=True if your PyTorch supports it).
model = build_model(cfg.model_name, num_classes=9)
model = model.to(device)
model.load_state_dict(
    torch.load("models/model_groupdro.pt", map_location=device)
)

# Fit the temperature scaler on the validation loader.
scaler = fit_temperature(model, val_loader, device=device)

@torch.no_grad()
def evaluate_with_temp(model, loader, scaler, device="cpu"):
    """Evaluate ``model`` on ``loader`` after rescaling its logits with ``scaler``.

    Args:
        model: Network called as ``model(x)`` to produce logits; it is put in
            eval mode before the loop.
        loader: Iterable yielding 4-tuples ``(x, y, r, g)``. Only ``x`` and
            ``y`` are used here; the last two items are ignored (presumably
            race and gender group labels -- confirm against ``make_loaders``).
        scaler: Callable applied to the raw logits before the softmax
            (the fitted temperature scaler).
        device: Device the inputs are moved to before the forward pass.

    Returns:
        ``(overall_acc, ece)``: the accuracy reported by
        ``sklearn.metrics.accuracy_score`` and the value returned by
        ``expected_calibration_error`` with ``n_bins=15``, computed from the
        top-class confidences.
    """
    from sklearn.metrics import accuracy_score

    model.eval()
    y_all, p_all, conf_all = [], [], []
    # The two trailing batch items are unpacked only to match the loader's
    # 4-tuple shape; nothing below needs them.
    for x, y, _r, _g in loader:
        logits = scaler(model(x.to(device)))  # apply temperature
        probs = torch.softmax(logits, dim=1)
        # A single reduction yields both the confidence and the predicted class.
        conf, preds = probs.max(dim=1)
        # Labels are only collected, never fed to the model, so they do not
        # need a round trip through `device`.
        y_all.extend(y.tolist())
        p_all.extend(preds.cpu().tolist())
        conf_all.extend(conf.cpu().tolist())

    overall_acc = accuracy_score(y_all, p_all)
    ece = expected_calibration_error(y_all, p_all, conf_all, n_bins=15)
    return overall_acc, ece

# Score the temperature-scaled model on the test loader.
acc, ece = evaluate_with_temp(
    model=model,
    loader=test_loader,
    scaler=scaler,
    device=device,
)
print(
    "Calibrated ACC:", acc,
    "Calibrated ECE:", ece,
)

# Persist the fitted scaler's state so it can be reloaded later without refitting.
torch.save(
    scaler.state_dict(),
    "models/temperature_scaler.pt",
)


Consider using tensor.detach() first. (Triggered internally at /pytorch/torch/csrc/autograd/generated/python_variable_methods.cpp:835.)
  loss = float(closure())


Calibrated ACC: 0.5448238086543729 Calibrated ECE: 0.017760192349985708
