# Notebook 7: Active Learning and Out-of-Distribution Analysis
## Purpose
This notebook demonstrates practical applications of Bayesian uncertainty:
1. **Active Learning** - Use uncertainty to guide data collection
2. **Out-of-Distribution Analysis** - Test how well calibration holds up on OOD data
## Key Applications
- Uncertainty-guided sampling for efficient data labeling
- OOD calibration assessment
- Real-world deployment considerations

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import stan
import httpstan
import nest_asyncio
nest_asyncio.apply()
import matplotlib.pyplot as plt
import os
# Seed the global NumPy and PyTorch generators so that the random draws made
# with np.random.choice later in this notebook are repeatable.
# NOTE(review): the Stan sampling calls further down are not given a seed, so
# the posterior draws themselves may still vary between runs.
np.random.seed(42)
torch.manual_seed(42)
print('Loading data...')
# Logits and labels for the validation and test splits, read from
# ./data/processed.
# NOTE(review): presumably written by an earlier notebook in this series;
# np.load raises if any of the files is missing.
logits_val = np.load('./data/processed/logits_val.npy')
labels_val = np.load('./data/processed/labels_val.npy')
logits_test = np.load('./data/processed/logits_test.npy')
labels_test = np.load('./data/processed/labels_test.npy')
print('✓ Data loaded')

Loading data...
✓ Data loaded


In [None]:
# Banner for Part 1, followed by a two-line description of the experiment.
print('=' * 60, 'PART 1: ACTIVE LEARNING / UNCERTAINTY SAMPLING', '=' * 60, sep='\n')
print(
    '\nUsing uncertainty to select which samples to label',
    'This demonstrates how Bayesian uncertainty can guide data collection\n',
    sep='\n',
)
class TemperatureScaling(nn.Module):
    """Calibration layer that divides logits by one learnable temperature."""

    def __init__(self):
        super().__init__()
        # A single learnable value, initialised to 1 so that the layer starts
        # out returning its input unchanged.
        initial_temperature = torch.ones(1)
        self.temperature = nn.Parameter(initial_temperature)

    def forward(self, logits):
        """Return ``logits`` divided by the current temperature."""
        scaled = logits / self.temperature
        return scaled
def calibrate_temperature_lbfgs(logits, labels, device, max_iter=1000):
    """Fit a single softmax temperature by minimising cross-entropy with L-BFGS.

    Args:
        logits: Array-like of raw class scores, expected as one row per
            sample and one column per class.
        labels: Array-like of integer class indices (0-based), one per row
            of ``logits``.
        device: Torch device (or device string) on which to optimise.
        max_iter: Maximum number of L-BFGS iterations performed inside the
            single ``optimizer.step`` call.

    Returns:
        The fitted temperature as a Python float.
    """
    temperature_model = TemperatureScaling().to(device)

    # Cast explicitly before moving to the device:
    #  * logits are brought to the dtype of the temperature parameter, so
    #    float64 NumPy input does not fail on backends without float64
    #    support (e.g. MPS);
    #  * CrossEntropyLoss needs int64 class indices, so int32/uint8 label
    #    arrays are widened here instead of raising inside the loss.
    param_dtype = temperature_model.temperature.dtype
    logits_torch = torch.as_tensor(logits, dtype=param_dtype).to(device)
    labels_torch = torch.as_tensor(labels, dtype=torch.long).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.LBFGS(
        [temperature_model.temperature], lr=0.01, max_iter=max_iter
    )

    # L-BFGS re-evaluates the objective several times per step, so it needs a
    # closure.  Named `closure` rather than `eval` to avoid shadowing the
    # builtin.
    def closure():
        optimizer.zero_grad()
        loss = criterion(temperature_model(logits_torch), labels_torch)
        loss.backward()
        return loss

    optimizer.step(closure)

    return temperature_model.temperature.item()
# Stan program for Bayesian temperature scaling.
#   data:       N rows of K logits, labels y declared as 1-based integers in
#               [1, K], and the two parameters (prior_alpha, prior_beta) that
#               are passed to the gamma prior.
#   parameters: a single positive scalar, `temperature`.
#   model:      temperature ~ gamma(prior_alpha, prior_beta); each label is
#               drawn from categorical_logit(logits[n] / temperature).
# The text of the string is the program handed to stan.build, so it is left
# exactly as written.
stan_model_code = """
data {
    int<lower=0> N;
    int<lower=2> K;
    matrix[N, K] logits;
    array[N] int<lower=1, upper=K> y;
    real<lower=0> prior_alpha;
    real<lower=0> prior_beta;
}
parameters {
    real<lower=0> temperature;
}
model {
    temperature ~ gamma(prior_alpha, prior_beta);
    
    for (n in 1:N) {
        vector[K] scaled_logits = logits[n]' / temperature;
        y[n] ~ categorical_logit(scaled_logits);
    }
}
"""
# ---------------------------------------------------------------------------
# Active-learning experiment: fit on a small random pool, score the remaining
# samples by predictive uncertainty, add the most uncertain ones, and refit.
# ---------------------------------------------------------------------------
print('Step 1: Start with small validation set (n=100)')
# Initial labelled pool: `initial_size` validation samples drawn at random
# without replacement.  Every other validation index counts as "unlabelled".
initial_size = 100
initial_indices = np.random.choice(len(logits_val), initial_size, replace=False)
labeled_logits = logits_val[initial_indices]
labeled_labels = labels_val[initial_indices]
unlabeled_indices = np.setdiff1d(np.arange(len(logits_val)), initial_indices)
# Device preference for the L-BFGS fit: Apple MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# The L-BFGS point estimate is only used to set the prior: with shape 4 and
# rate 4 / temp_lbfgs the gamma prior has mean alpha / beta = temp_lbfgs.
temp_lbfgs = calibrate_temperature_lbfgs(labeled_logits, labeled_labels, device)
prior_alpha = 4.0
prior_beta = 4.0 / temp_lbfgs
print('Step 2: Fit Bayesian model and get uncertainty')
stan_data = {
    'N': labeled_logits.shape[0],
    'K': labeled_logits.shape[1],
    'logits': labeled_logits.tolist(),
    # +1 because the Stan program declares y with lower=1 (1-based classes).
    'y': (labeled_labels + 1).tolist(),
    'prior_alpha': prior_alpha,
    'prior_beta': prior_beta
}
# 4 chains, each with 500 warm-up iterations and 1000 kept draws.
posterior = stan.build(stan_model_code, data=stan_data)
fit = posterior.sample(num_chains=4, num_samples=1000, num_warmup=500)
# Flatten the temperature draws into one 1-D array and summarise them.
temp_samples = fit['temperature'].flatten()
mean_temp = np.mean(temp_samples)
std_temp = np.std(temp_samples)
print(f'Initial temperature estimate: {mean_temp:.4f} ± {std_temp:.4f}')
print('\nStep 3: Compute prediction uncertainty for remaining unlabeled samples')
# Score at most 1000 candidates.
# NOTE(review): np.setdiff1d returns sorted values, so this slice takes the
# lowest unlabelled indices rather than a random subset — confirm intended.
n_unlabeled = min(1000, len(unlabeled_indices))
unlabeled_subset = unlabeled_indices[:n_unlabeled]
uncertainties = []
for idx in unlabeled_subset:
    logit_sample = logits_val[idx]
    probs_samples = []
    # Use every 10th posterior draw of the temperature.
    # NOTE(review): temp_samples[::10] does not depend on idx and could be
    # computed once before the outer loop.
    for temp in temp_samples[::10]:
        scaled_logits = logit_sample / temp
        probs = F.softmax(torch.tensor(scaled_logits), dim=0).numpy()
        probs_samples.append(probs)
    probs_samples = np.array(probs_samples)
    # Uncertainty score: standard deviation of each class probability across
    # the temperature draws (axis 0), averaged over the classes.
    uncertainty = np.mean(np.std(probs_samples, axis=0))
    uncertainties.append(uncertainty)
uncertainties = np.array(uncertainties)
print(f'Computed uncertainty for {n_unlabeled} unlabeled samples')
print(f'Uncertainty range: [{uncertainties.min():.4f}, {uncertainties.max():.4f}]')
print('\nStep 4: Select top 100 most uncertain samples')
# argsort is ascending, so the last 100 positions hold the largest scores.
# These are positions into unlabeled_subset, mapped back to validation indices.
top_uncertain_idx = np.argsort(uncertainties)[-100:]
selected_indices = unlabeled_subset[top_uncertain_idx]
print('Step 5: Add selected samples and refit')
# Append the selected samples (with their labels) to the initial pool.
new_labeled_logits = np.vstack([labeled_logits, logits_val[selected_indices]])
new_labeled_labels = np.hstack([labeled_labels, labels_val[selected_indices]])
# Same pipeline as Step 2, repeated on the enlarged pool: L-BFGS estimate ->
# gamma prior centred on it -> Stan fit.
temp_lbfgs_new = calibrate_temperature_lbfgs(new_labeled_logits, new_labeled_labels, device)
prior_alpha_new = 4.0
prior_beta_new = 4.0 / temp_lbfgs_new
stan_data_new = {
    'N': new_labeled_logits.shape[0],
    'K': new_labeled_logits.shape[1],
    'logits': new_labeled_logits.tolist(),
    'y': (new_labeled_labels + 1).tolist(),
    'prior_alpha': prior_alpha_new,
    'prior_beta': prior_beta_new
}
posterior_new = stan.build(stan_model_code, data=stan_data_new)
fit_new = posterior_new.sample(num_chains=4, num_samples=1000, num_warmup=500)
temp_samples_new = fit_new['temperature'].flatten()
mean_temp_new = np.mean(temp_samples_new)
std_temp_new = np.std(temp_samples_new)
print(f'\nAfter active learning (n={len(new_labeled_logits)}):')
print(f'  Temperature: {mean_temp_new:.4f} ± {std_temp_new:.4f}')
# NOTE(review): the label "Uncertainty reduced" is printed unconditionally;
# nothing here checks that std_temp_new is actually smaller than std_temp.
print(f'  Uncertainty reduced: {std_temp:.4f} → {std_temp_new:.4f}')

PART 1: ACTIVE LEARNING / UNCERTAINTY SAMPLING

Using uncertainty to select which samples to label
This demonstrates how Bayesian uncertainty can guide data collection



Using cache found in /Users/Studies/.cache/torch/hub/chenyaofo_pytorch-cifar-models_master


Step 1: Start with small validation set (n=100)
Step 2: Fit Bayesian model and get uncertainty
Building...



Building: found in cache, done.Messages from stanc:
    is provided, or the prior(s) depend on data variables. In the later case,
    this may be a false positive.
Sampling:   0%
Sampling:  25% (1500/6000)
Sampling:  50% (3000/6000)
Sampling:  75% (4500/6000)
Sampling: 100% (6000/6000)
Sampling: 100% (6000/6000), done.
Messages received during sampling:
  Gradient evaluation took 4.7e-05 seconds
  1000 transitions using 10 leapfrog steps per transition would take 0.47 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 4.5e-05 seconds
  1000 transitions using 10 leapfrog steps per transition would take 0.45 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 3.8e-05 seconds
  1000 transitions using 10 leapfrog steps per transition would take 0.38 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 4.6e-05 seconds
  1000 transitions using 10 leapfrog steps per transition would take 0.46 seconds.
  Adjust your expec

Initial temperature estimate: 0.9705 ± 0.2566

Step 3: Compute prediction uncertainty for remaining unlabeled samples
Computed uncertainty for 1000 unlabeled samples
Uncertainty range: [0.0001, 0.1091]

Step 4: Select top 100 most uncertain samples
Step 5: Add selected samples and refit
Building...



Building: found in cache, done.Messages from stanc:
    is provided, or the prior(s) depend on data variables. In the later case,
    this may be a false positive.
Sampling:   0%
Sampling:  25% (1500/6000)
Sampling:  50% (3000/6000)
Sampling:  75% (4500/6000)
Sampling:  78% (4700/6000)
Sampling: 100% (6000/6000)
Sampling: 100% (6000/6000), done.
Messages received during sampling:
  Gradient evaluation took 6.7e-05 seconds
  1000 transitions using 10 leapfrog steps per transition would take 0.67 seconds.
  Adjust your expectations accordingly!
  Gradient evaluation took 6.5e-05 seconds
  1000 transitions using 10 leapfrog steps per transition would take 0.65 seconds.
  Adjust your expectations accordingly!
  Informational Message: The current Metropolis proposal is about to be rejected because of the following issue:
  Exception: gamma_lpdf: Random variable is 0, but must be positive finite! (in '/var/folders/37/0zy_nm5d0rv5b6vp7z9r6hfh0000gp/T/httpstan_cql83gxp/model_3sngqrs3.stan', l


Results:
Initial (n=100, random):     0.9705 ± 0.2566
After active learning (n=200): 1.4687 ± 0.1289

Uncertainty reduction:
HDI width: 0.9744 → 0.5120
Reduction: 47.5%

Active learning selected informative samples, reducing uncertainty faster!


In [None]:
# Banner for Part 2, followed by the description and caveat about the "OOD" set.
print(
    '=' * 60,
    'PART 2: OUT-OF-DISTRIBUTION ANALYSIS',
    '=' * 60,
    '\nTesting calibration on out-of-distribution data',
    'Using CIFAR-10 test set as "OOD" (different from training distribution)',
    '\nNote: For true OOD, we would use a different dataset (e.g., CIFAR-100)',
    'Here we simulate by using a different subset with potential distribution shift\n',
    sep='\n',
)
def compute_ece(probs, labels, n_bins=10):
    """Expected Calibration Error over equal-width confidence bins.

    Each sample's confidence is its largest class probability.  Samples are
    grouped into ``n_bins`` half-open bins ``(lower, upper]`` spanning [0, 1];
    every non-empty bin contributes ``|mean confidence - accuracy|`` weighted
    by the fraction of samples it holds.

    Args:
        probs: 2-D array of class probabilities, one row per sample.
        labels: True class index for each row of ``probs``.
        n_bins: Number of equal-width confidence bins.

    Returns:
        The weighted sum of per-bin calibration gaps.
    """
    edges = np.linspace(0, 1, n_bins + 1)

    confidence = np.max(probs, axis=1)
    hits = np.argmax(probs, axis=1) == labels

    ece = 0
    for b in range(n_bins):
        mask = (confidence > edges[b]) & (confidence <= edges[b + 1])
        weight = mask.mean()
        # Skip bins that hold no samples.
        if not weight > 0:
            continue
        gap = np.abs(confidence[mask].mean() - hits[mask].mean())
        ece += gap * weight

    return ece
def compute_brier_score(probs, labels):
    """Mean multi-class Brier score.

    For every sample the squared differences between the predicted
    probabilities and the one-hot encoding of the true class are summed over
    classes; the result is averaged over samples.

    Args:
        probs: 2-D array-like of class probabilities, one row per sample.
        labels: Integer class index (0-based) for each row of ``probs``.

    Returns:
        The mean per-sample Brier score.
    """
    probs = np.asarray(probs)
    labels = np.asarray(labels)
    # Take the class count from the input instead of assuming 10 classes, so
    # the metric also works for problems with a different number of classes.
    n_classes = probs.shape[1]
    one_hot = np.eye(n_classes)[labels]
    return np.mean(np.sum((probs - one_hot) ** 2, axis=1))
# Load the stored baseline results and reuse their fitted temperature.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load this
# file if it was produced by this project's own pipeline.
baseline_results = np.load('./data/results/baseline_results.npy', allow_pickle=True).item()
calibrated_temp = baseline_results['calibrated_temp']
# Stand-in "OOD" set: 1000 test samples drawn at random without replacement.
ood_size = 1000
ood_indices = np.random.choice(len(logits_test), ood_size, replace=False)
logits_ood = logits_test[ood_indices]
labels_ood = labels_test[ood_indices]
print('Evaluating calibration on OOD data:')
print(f'OOD set size: {ood_size}\n')
# Metrics with the raw logits (no temperature applied).
probs_ood_uncal = F.softmax(torch.tensor(logits_ood), dim=1).numpy()
ece_ood_uncal = compute_ece(probs_ood_uncal, labels_ood)
brier_ood_uncal = compute_brier_score(probs_ood_uncal, labels_ood)
print('Uncalibrated on OOD:')
print(f'  ECE: {ece_ood_uncal:.4f}')
print(f'  Brier: {brier_ood_uncal:.4f}')
# Same metrics after dividing the logits by the stored temperature.
probs_ood_cal = F.softmax(torch.tensor(logits_ood / calibrated_temp), dim=1).numpy()
ece_ood_cal = compute_ece(probs_ood_cal, labels_ood)
brier_ood_cal = compute_brier_score(probs_ood_cal, labels_ood)
print('\nCalibrated on OOD (using temperature from validation):')
print(f'  ECE: {ece_ood_cal:.4f}')
print(f'  Brier: {brier_ood_cal:.4f}')
baseline = baseline_results['results']
ece_id_cal = baseline["Temperature Scaling"]["ece"]
print('\nComparison with in-distribution (test set):')
print(f'  ECE (ID, uncal): {baseline["Uncalibrated"]["ece"]:.4f}')
print(f'  ECE (ID, cal):   {ece_id_cal:.4f}')
print(f'  ECE (OOD, uncal): {ece_ood_uncal:.4f}')
print(f'  ECE (OOD, cal):   {ece_ood_cal:.4f}')
print('\nKey observation:')
# Report the conclusion that the numbers support instead of always printing
# the "degrades" warning: compare calibrated ECE on the OOD sample with the
# stored calibrated in-distribution ECE.
if ece_ood_cal > ece_id_cal:
    print('⚠ Calibration degrades on OOD data')
    print('  Temperature scaling may not generalize well to OOD')
    print('  Consider domain adaptation or OOD-specific calibration')
else:
    print('✓ Calibration on OOD data is no worse than in-distribution')
    print('  The validation temperature transfers to this OOD sample')

PART 2: OUT-OF-DISTRIBUTION ANALYSIS

Testing calibration on out-of-distribution data
Using CIFAR-10 test set as "OOD" (different from training distribution)

Note: For true OOD, we would use a different dataset (e.g., CIFAR-100)
Here we simulate by using a different subset with potential distribution shift

Evaluating calibration on OOD data:
OOD set size: 1000

Uncalibrated on OOD:
  ECE: 0.0408
  Brier: 0.0983

Calibrated on OOD (using temperature from validation):
  ECE: 0.0151
  Brier: 0.0885

Comparison with in-distribution (test set):
  ECE (ID, uncal): 0.0386
  ECE (ID, cal):   0.0094
  ECE (OOD, uncal): 0.0408
  ECE (OOD, cal):   0.0151

Key observation:
⚠ Calibration degrades on OOD data
  Temperature scaling may not generalize well to OOD
  Consider domain adaptation or OOD-specific calibration


In [None]:
# Persist the headline numbers from both experiments for later use.
print('=' * 60, 'SAVING APPLICATION RESULTS', '=' * 60, sep='\n')
os.makedirs('./data/results', exist_ok=True)
application_results = dict(
    # Labelled-pool size and posterior std of the temperature, before and
    # after the uncertainty-guided additions.
    active_learning=dict(
        initial_size=initial_size,
        initial_std=std_temp,
        final_size=len(new_labeled_logits),
        final_std=std_temp_new,
    ),
    # ECE and Brier score on the OOD sample, without and with temperature
    # scaling.
    ood_analysis=dict(
        ece_ood_uncal=ece_ood_uncal,
        ece_ood_cal=ece_ood_cal,
        brier_ood_uncal=brier_ood_uncal,
        brier_ood_cal=brier_ood_cal,
    ),
)
# The payload is a dict, so np.save stores it as a pickled object array.
np.save('./data/results/application_results.npy', application_results, allow_pickle=True)
print(
    '✓ Application results saved',
    '\nNext step: Run Notebook 8 for Results and Visualization',
    sep='\n',
)

SAVING APPLICATION RESULTS
✓ Application results saved

Next step: Run Notebook 8 for Results and Visualization
