In a new python environment with python>=3.10

In [None]:
!pip install "torch_uncertainty[image] @ git+https://github.com/ENSTA-U2IS-AI/torch-uncertainty@dev"

In [None]:
# Training hyper-parameters used by the cells below.
batch_size = 10  # mini-batch size passed to every DataLoader
learning_rate = 1e-3  # initial learning rate handed to the optimizer
weight_decay = 2e-4  # weight-decay coefficient handed to the optimizer
lr_decay_epochs = 20  # StepLR step size: epochs between two learning-rate decays
lr_decay = 0.1  # StepLR gamma: factor applied to the learning rate at each decay
nb_epochs = 50  # default epoch count (the training cell re-assigns it before looping)

In [None]:
import torch
from einops import rearrange
from torchvision import tv_tensors
from torchvision.transforms import v2
from torchvision.transforms.v2 import functional as F

from torch_uncertainty.datasets import MUAD

# Channel statistics given to Normalize; the visualisation cells further down
# hard-code the same values to undo the normalisation.
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)


def _resize():
    """Return a fresh resize step bringing every sample to 256x512."""
    return v2.Resize(size=(256, 512), antialias=True)


def _cast_and_normalize():
    """Return fresh ToDtype and Normalize steps shared by both pipelines."""
    return [
        v2.ToDtype(
            dtype={
                tv_tensors.Image: torch.float32,
                tv_tensors.Mask: torch.int64,
                "others": None,
            },
            scale=True,
        ),
        v2.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]


def _muad_split(split, transforms):
    """Build the small semantic MUAD dataset for one split."""
    return MUAD(
        root="./data",
        target_type="semantic",
        version="small",
        split=split,
        transforms=transforms,
        download=True,
    )


# The training pipeline is the validation pipeline plus a random horizontal flip.
train_transform = v2.Compose(
    [_resize(), v2.RandomHorizontalFlip(), *_cast_and_normalize()]
)
val_transform = v2.Compose([_resize(), *_cast_and_normalize()])

train_set = _muad_split("train", train_transform)
val_set = _muad_split("val", val_transform)
# The test split reuses the deterministic validation pipeline.
test_set = _muad_split("test", val_transform)

Let us look at the first sample of the training set. The first element is the input image and the second is the target (ground truth).

In [None]:
# Fetch the first (image, target) pair. train_set applies train_transform,
# which includes a random horizontal flip, so the sample may come out mirrored.
sample = train_set[0]
img, tgt = sample
# Show both tensor shapes as the cell output.
img.size(), tgt.size()

Visualize the input sample (an RGB image)

In [None]:
# Revert the Normalize step (x * std + mean, per channel), then rescale the
# float image to uint8 so it can be displayed.
mean = torch.tensor([0.485, 0.456, 0.406], device=img.device)
std = torch.tensor([0.229, 0.224, 0.225], device=img.device)
per_channel_std = std[:, None, None]
per_channel_mean = mean[:, None, None]
img = img * per_channel_std + per_channel_mean
img = F.to_dtype(img, torch.uint8, scale=True)
F.to_pil_image(img)

Visualize the same image above but segmented (our goal)

In [None]:
from torchvision.utils import draw_segmentation_masks

# Give the 255 label its own index (21) so it gets a boolean mask and a colour
# like every other class.
tmp_tgt = tgt.masked_fill(tgt == 255, 21)
# One boolean mask per index 0..21, obtained by broadcasting the comparison.
class_ids = torch.arange(22, device=tgt.device)[:, None, None]
tgt_masks = tmp_tgt == class_ids
img_segmented = draw_segmentation_masks(
    img, tgt_masks, alpha=1, colors=val_set.color_palette
)
F.to_pil_image(img_segmented)

Below is the complete list of classes in MUAD, presented as:

1.   Class Name
2.   Train ID
3.   Segmentation Color in RGB format [R,G, B].

In [None]:
# Print the name, train ID and colour of every class exposed by the dataset.
for muad_class in train_set.classes:
    class_name, train_id, color = (
        muad_class.name,
        muad_class.id,
        muad_class.color,
    )
    print(f"Class: {class_name}, Train ID: {train_id}, Color: {color}")

Here is a more comprehensive overview of the different classes (while training, non-labeled data will use train ID 21 and not 255):


| **class names**                       | **ID** |
|----------------------------------------|---------|
| road                                   | 0       |
| sidewalk                               | 1       |
| building                               | 2       |
| wall                                   | 3       |
| fence                                  | 4       |
| pole                                   | 5       |
| traffic light                          | 6       |
| traffic sign                           | 7       |
| vegetation                             | 8       |
| terrain                                | 9       |
| sky                                    | 10      |
| person                                 | 11      |
| rider                                  | 12      |
| car                                    | 13      |
| truck                                  | 14      |
| bus                                    | 15      |
| train                                  | 16      |
| motorcycle                             | 17      |
| bicycle                                | 18      |
| bear deer cow                          | 19      |
| garbage_bag stand_food trash_can       | 20      |


We will feed our DNN the first raw image of the road view and as target it will be the dark image below and not the colored one (second image)

In [None]:
# Cast the label map to uint8 (no rescaling requested) and view it as an image.
im = F.to_pil_image(F.to_dtype(tgt, torch.uint8))
im

In [None]:
# Size of the image built from the target in the previous cell.
im.size

**Why is the target image dark and what's the bright part ?** **(hint : print the numpy array)**

**answer:** The target is dark because it is not an RGB image but a class label map, where pixel values are small integers (0–21), which appear nearly black in grayscale. The bright regions correspond to pixels with value 255, which indicate ignored / unlabeled areas in the segmentation mask.

In [None]:
import numpy as np

# Inspect the raw label values behind the dark target image.
arr = tgt.squeeze(0).cpu().numpy()
print(arr.dtype, arr.min(), arr.max())
print("unique values:", np.unique(arr)[:50], "...")
# Pixel counts for the bright value (255) and for label 0.
for caption, value in (("count of 255:", 255), ("count of 0:", 0)):
    print(caption, (arr == value).sum())

**Q3/ Please study the dataset a bit. What is it about?**

**answer:** The MUAD dataset is a semantic segmentation dataset for autonomous driving scenes, where each image corresponds to an urban road view and each pixel is assigned a semantic class (road, vehicle, pedestrian, sky, vegetation, etc.). It is designed to train and evaluate models that understand scene layout at pixel level, a core requirement for perception in self-driving and driver-assistance systems.


In [None]:
import numpy as np
import torch
from torch.utils.data import DataLoader

# Settings shared by the three loaders.
_loader_kwargs = {"batch_size": batch_size, "num_workers": 4}

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_set, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_set, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_set, shuffle=False, **_loader_kwargs)


In [None]:
def enet_weighing(dataloader, num_classes, c=1.02):
    """Compute per-class loss weights as described in the ENet paper.

        w_class = 1 / (ln(c + p_class)),

    where c is usually 1.02 and p_class is the propensity score of that
    class:

        propensity_score = freq_class / total_pixels.

    Pixels labelled 255 and pixels whose label falls outside
    ``[0, num_classes)`` are excluded from both the per-class counts and the
    total, so the result always has exactly ``num_classes`` entries.

    References:
        https://arxiv.org/abs/1606.02147

    Args:
        dataloader (``data.Dataloader``): An iterable of ``(input, label)``
            batches; only the label tensors are read.
        num_classes (``int``): The number of classes.
        c (``float``, optional): An additional hyper-parameter which restricts
            the interval of values for the weights. Default: 1.02.

    Returns:
        ``numpy.ndarray``: Array of shape ``(num_classes,)`` with one weight
        per class.

    Raises:
        ValueError: If no countable pixel is found (e.g. empty dataloader).

    """
    class_count = np.zeros(num_classes, dtype=np.int64)
    total = 0
    for _, label in dataloader:
        flat_label = label.cpu().numpy().ravel()

        # Drop ignored pixels (255) and any label outside the class range.
        # Without the range check np.bincount would return more than
        # num_classes bins and the accumulated counts would change length.
        countable = (
            (flat_label != 255) & (flat_label >= 0) & (flat_label < num_classes)
        )
        flat_label = flat_label[countable]

        # Accumulate the per-class pixel counts and the total pixel count.
        class_count += np.bincount(flat_label, minlength=num_classes)
        total += flat_label.size

    if total == 0:
        raise ValueError(
            "enet_weighing found no labelled pixel in the dataloader"
        )

    # Compute propensity score and then the weights for each class
    propensity_score = class_count / total
    return 1 / np.log(c + propensity_score)

In [None]:
print("\nComputing class weights...")
print("(this can take a while depending on the dataset size)")
# 19 matches the num_classes value used when the model is built further down.
class_weights = enet_weighing(train_loader, 19)
# Convert to a float32 GPU tensor so it can be passed as the loss `weight`.
class_weights = torch.from_numpy(class_weights).float().cuda()
print("Class weights:", class_weights)

**Q4/ why do we need to evaluate the class_weights?**

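**answer:** Because the classes are imbalanced: frequent classes (road, sky, ...) would otherwise dominate the loss, so rarer classes are given larger weights.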

## C. building the DNN

**Q5/ Do we really use Unet? What did I change :)? (that is hard)**

**answer:** You add 2d dropout and bilinear instead of transposed conv.


**Q6/Do we need a backbone with Unet?**

**answer:** No. It could just help to learn faster if the domain gap is small.





In [None]:
from torch import nn


class DoubleConv(nn.Module):
    """Two stacked 3x3 convolutions, each followed by BatchNorm and ReLU."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        stages = []
        # The first stage maps in_ch -> out_ch, the second keeps out_ch.
        # Layers are appended as conv, BN, ReLU twice, so the Sequential
        # indices (and therefore the checkpoint keys) are 0..5.
        for stage_in in (in_ch, out_ch):
            stages.extend(
                (
                    nn.Conv2d(stage_in, out_ch, kernel_size=3, padding=1),
                    nn.BatchNorm2d(out_ch),
                    nn.ReLU(inplace=True),
                )
            )
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through both conv/BN/ReLU stages."""
        return self.conv(x)


class InConv(nn.Module):
    """Input stage: a single DoubleConv mapping ``in_ch`` to ``out_ch`` channels."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.conv = DoubleConv(in_ch, out_ch)

    def forward(self, x):
        """Apply the DoubleConv to ``x``."""
        return self.conv(x)


class Down(nn.Module):
    """Encoder stage: 2x2 max-pooling followed by a DoubleConv."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        # Pool first, then convolve from in_ch to out_ch channels.
        self.mpconv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_ch, out_ch)
        )

    def forward(self, x):
        """Pool then convolve ``x``."""
        return self.mpconv(x)


class Up(nn.Module):
    """Decoder stage: upsample, align with the skip tensor, concatenate, DoubleConv.

    Args:
        in_ch: Channel count after concatenation (skip + upsampled tensor).
        out_ch: Channel count produced by the DoubleConv.
        bilinear: If True, upsample x2 with bilinear resizing; otherwise use
            the learned transposed convolution.
    """

    def __init__(self, in_ch, out_ch, bilinear=True):
        super().__init__()
        self.bilinear = bilinear

        # Registered even when bilinear=True so that the parameter names stored
        # in saved checkpoints do not depend on the flag.
        self.up = nn.ConvTranspose2d(in_ch // 2, in_ch // 2, 2, stride=2)

        self.conv = DoubleConv(in_ch, out_ch)

    def forward(self, x1, x2):
        """Upsample ``x1``, pad it to the size of ``x2`` and fuse both tensors.

        Args:
            x1: Tensor coming from the previous (coarser) stage.
            x2: Skip tensor whose last two dimensions ``x1`` is padded to.

        Returns:
            The DoubleConv output of ``cat([x2, x1], dim=1)``.
        """
        if self.bilinear:
            x1 = F.resize(x1, size=[2*x1.size()[2],2*x1.size()[3]],
                          interpolation=v2.InterpolationMode.BILINEAR)
        else:
            x1 = self.up(x1)

        # Tensors are indexed as (N, C, H, W): axis 2 is height, axis 3 is width.
        diff_y = x2.size()[2] - x1.size()[2]
        diff_x = x2.size()[3] - x1.size()[3]

        # `F` in this file is torchvision.transforms.v2.functional, whose pad()
        # reads a 4-element list as [left, top, right, bottom]. The list below
        # is written in torch.nn.functional.pad order (left, right, top,
        # bottom), so call that function explicitly.
        x1 = nn.functional.pad(x1, [diff_x // 2, diff_x - diff_x // 2,
                                    diff_y // 2, diff_y - diff_y // 2])

        # for padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd

        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class OutConv(nn.Module):
    """Output head: a 1x1 convolution mapping ``in_ch`` to ``out_ch`` channels."""

    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, 1)

    def forward(self, x):
        """Apply the 1x1 convolution to ``x``."""
        return self.conv(x)

# Please note that a dropout layer has been added so that MC Dropout can be used.

class UNet(nn.Module):
    """U-Net style encoder/decoder with dropout after every decoder stage.

    Args:
        classes: Number of output channels of the final 1x1 convolution.
    """

    def __init__(self, classes):
        super().__init__()
        # Encoder
        self.inc = InConv(3, 32)
        self.down1 = Down(32, 64)
        self.down2 = Down(64, 128)
        self.down3 = Down(128, 256)
        self.down4 = Down(256, 256)
        # Decoder (in_ch counts the concatenated skip + upsampled channels)
        self.up1 = Up(512, 128)
        self.up2 = Up(256, 64)
        self.up3 = Up(128, 32)
        self.up4 = Up(64, 32)
        # A single Dropout2d module reused after each decoder stage.
        self.dropout = nn.Dropout2d(0.1)
        self.outc = OutConv(32, classes)

    def forward(self, x):
        """Return the per-pixel class logits for the input batch ``x``."""
        # Encoder: keep every intermediate tensor as a skip connection.
        skip1 = self.inc(x)
        skip2 = self.down1(skip1)
        skip3 = self.down2(skip2)
        skip4 = self.down3(skip3)
        decoded = self.down4(skip4)

        # Decoder: each stage fuses the matching skip, then applies dropout.
        decoder_path = (
            (self.up1, skip4),
            (self.up2, skip3),
            (self.up3, skip2),
            (self.up4, skip1),
        )
        for up_stage, skip in decoder_path:
            decoded = self.dropout(up_stage(decoded, skip))

        return self.outc(decoded)

## D. Utility functions

In [None]:
import matplotlib.pyplot as plt

# Two colours from Colorbrewer Paired_12, rescaled from 0-255 integers to
# 0-1 float triples.
_RGB_255 = ((31, 120, 180), (51, 160, 44))
colors = [tuple(channel / 255 for channel in rgb) for rgb in _RGB_255]

def plot_losses(train_history, val_history):
    """Plot the per-epoch training and validation losses on one figure.

    Args:
        train_history: Sequence with one training loss per epoch.
        val_history: Sequence with one validation loss per epoch.
    """
    # Epochs are numbered from 1 on the x axis.
    epochs = np.arange(1, len(train_history) + 1)
    curves = (
        (train_history, colors[0], "Training loss"),
        (val_history, colors[1], "Validation loss"),
    )

    plt.figure(figsize=(8, 6))
    for history, curve_color, curve_label in curves:
        plt.plot(epochs, history, color=curve_color, label=curve_label, linewidth=2)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend(loc="upper right")
    plt.title("Evolution of the training and validation loss")
    plt.show()

def plot_accu(train_history, val_history):
    """Plot the per-epoch training and validation mean IoU on one figure.

    Args:
        train_history: Sequence with one training mIoU per epoch.
        val_history: Sequence with one validation mIoU per epoch.
    """
    # Epochs are numbered from 1 on the x axis.
    epochs = np.arange(1, len(train_history) + 1)
    curves = (
        (train_history, colors[0], "Training miou"),
        (val_history, colors[1], "Validation miou"),
    )

    plt.figure(figsize=(8, 6))
    for history, curve_color, curve_label in curves:
        plt.plot(epochs, history, color=curve_color, label=curve_label, linewidth=2)
    plt.xlabel("Epoch")
    plt.ylabel("Mean IoU")
    plt.legend(loc="upper right")
    plt.title("Evolution of Miou")
    plt.show()

**Q7/  what is the IoU?**

**answer:** Intersection over Union measures the overlap between the predicted segmentation and the ground truth by dividing their intersection by their union.


### Training function

**Q8/Please complete the training and the test function**

In [None]:
from torchmetrics.utilities.compute import _safe_divide


def train(model, data_loader, optim, criterion, metric, iteration_loss=False):
    """Run one training epoch on the GPU.

    Args:
        model: Network to optimise; switched to training mode.
        data_loader: Iterable of batches whose items 0 and 1 are the inputs
            and the labels.
        optim: Optimizer stepped once per batch.
        criterion: Loss called on the kept logits and labels.
        metric: Metric object; reset at the start and updated every batch.
        iteration_loss: If True, print the loss of every batch.

    Returns:
        Tuple ``(mean loss per batch, per-class IoU, metric.compute())``.
    """
    model.train()
    metric.reset()
    running_loss = 0.0
    for step, batch_data in enumerate(data_loader):
        inputs = batch_data[0].cuda()
        targets = batch_data[1].cuda()

        logits = model(inputs)

        # Move the channel axis last and collapse the others: one row per pixel.
        pixel_logits = logits.permute(0, 2, 3, 1).reshape(-1, logits.shape[1])
        pixel_targets = targets.view(-1)

        # Pixels labelled 255 take no part in the loss nor in the metric.
        keep = pixel_targets != 255
        kept_logits = pixel_logits[keep]
        kept_targets = pixel_targets[keep]

        loss = criterion(kept_logits, kept_targets)

        optim.zero_grad()
        loss.backward()
        optim.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        metric.update(kept_logits.detach(), kept_targets.detach())

        if iteration_loss:
            print("[Step: %d] Iteration loss: %.4f" % (step, batch_loss))

    # Per-class IoU = TP / (TP + FP + FN); NaN where the denominator is zero.
    tp, fp, _, fn = metric._final_state()
    iou_per_class = _safe_divide(tp, tp + fp + fn, zero_division=float("nan"))

    return running_loss / len(data_loader), iou_per_class, metric.compute()

### Validation function

In [None]:
def test(model, data_loader, criterion, metric, iteration_loss=False):
    """Evaluate the model for one pass over ``data_loader`` on the GPU.

    Args:
        model: Network to evaluate; switched to eval mode.
        data_loader: Iterable of batches whose items 0 and 1 are the inputs
            and the labels.
        criterion: Loss called on the kept logits and labels.
        metric: Metric object; reset at the start and updated every batch.
        iteration_loss: If True, print the loss of every batch.

    Returns:
        Tuple ``(mean loss per batch, per-class IoU, metric.compute())``.
    """
    model.eval()
    metric.reset()
    running_loss = 0.0
    for step, batch_data in enumerate(data_loader):
        inputs = batch_data[0].cuda()
        targets = batch_data[1].cuda()

        # No gradients are needed for evaluation.
        with torch.no_grad():
            logits = model(inputs)

            # Move the channel axis last and collapse the others: one row per pixel.
            pixel_logits = logits.permute(0, 2, 3, 1).reshape(-1, logits.shape[1])
            pixel_targets = targets.view(-1)

            # Pixels labelled 255 take no part in the loss nor in the metric.
            keep = pixel_targets != 255

            loss = criterion(pixel_logits[keep], pixel_targets[keep])

        batch_loss = loss.item()
        running_loss += batch_loss

        metric.update(pixel_logits[keep], pixel_targets[keep])

        if iteration_loss:
            print("[Step: %d] Iteration loss: %.4f" % (step, batch_loss))

    # Per-class IoU = TP / (TP + FP + FN); NaN where the denominator is zero.
    tp, fp, _, fn = metric._final_state()
    iou_per_class = _safe_divide(tp, tp + fp + fn, zero_division=float("nan"))

    return running_loss / len(data_loader), iou_per_class, metric.compute()

## E. Training Process

**Q9/ please train your DNN and comment?**



In [None]:
from torch import optim
from torch.optim import lr_scheduler

from torch_uncertainty.metrics import MeanIntersectionOverUnion

print("\nTraining...\n")
num_classes = 19
# Initialize the UNet, the loss, the optimizer, the scheduler and the metric.

# We use the CrossEntropyLoss loss function, as it is the one most
# frequently used for classification problems with multiple classes, which
# fits this problem. This criterion combines LogSoftMax and NLLLoss.

model = UNet(classes=num_classes).cuda()
# The class weights computed earlier are passed as `weight`; label 255 is ignored.
criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=255)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Multiply the learning rate by lr_decay every lr_decay_epochs scheduler steps.
scheduler = lr_scheduler.StepLR(optimizer, step_size=lr_decay_epochs, gamma=lr_decay)
metric = MeanIntersectionOverUnion(num_classes=num_classes, ignore_index=255).cuda()


In [None]:
# Start Training
# Per-epoch histories, later used for the loss / mIoU plots.
train_losses, val_losses = [], []
train_mious, val_mious = [], []
train_ious, val_ious = [], []

# Best validation mIoU seen so far and the matching weights.
best_val_miou = -1.0
best_state = None

# Train for 20 epochs instead of the nb_epochs value set in the parameter cell,
# because longer runs take too long on Google Colab.
# NOTE(review): the scheduler is stepped once per epoch, so if lr_decay_epochs
# is still 20 the first decay only happens after the last epoch of this run.
nb_epochs = 20
for epoch in range(1, nb_epochs + 1):
    tr_loss, tr_iou_per_class, tr_miou = train(
        model, train_loader, optimizer, criterion, metric, iteration_loss=False
    )
    va_loss, va_iou_per_class, va_miou = test(
        model, val_loader, criterion, metric, iteration_loss=False
    )

    scheduler.step()

    train_losses.append(tr_loss)
    val_losses.append(va_loss)
    # The mIoU may be a tensor or a plain number; store a Python float.
    train_mious.append(tr_miou.item() if hasattr(tr_miou, "item") else float(tr_miou))
    val_mious.append(va_miou.item() if hasattr(va_miou, "item") else float(va_miou))
    train_ious.append(tr_iou_per_class.detach().cpu())
    val_ious.append(va_iou_per_class.detach().cpu())

    # The learning rate printed here is the one in effect after scheduler.step().
    print(
        f"Epoch [{epoch:03d}/{nb_epochs:03d}] "
        f"lr={scheduler.get_last_lr()[0]:.2e} | "
        f"train: loss={tr_loss:.4f}, mIoU={train_mious[-1]:.4f} | "
        f"val: loss={va_loss:.4f}, mIoU={val_mious[-1]:.4f}"
    )

    if val_mious[-1] > best_val_miou:
        best_val_miou = val_mious[-1]
        # copy=True forces a real copy even for tensors already on the CPU, so
        # the snapshot can never alias weights that later epochs keep updating.
        best_state = {
            k: v.detach().to("cpu", copy=True)
            for k, v in model.state_dict().items()
        }

# restore best model (optional but usually expected)
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\nBest val mIoU: {best_val_miou:.4f}")

# save model
torch.save(model.state_dict(), "model2.pth")

Load a model

In [None]:
# Rebuild the architecture, restore the weights stored in "model1.pth" and
# move the model to the GPU.
model = UNet(19)
checkpoint = torch.load("model1.pth")
model.load_state_dict(checkpoint)
model = model.to("cuda")

# III. Evaluation of the Trained DNN on the test set

## A. classical evaluations

**Q10/ Please plot the loss and mIoU and comment on them.**

**answer:** Both the training and validation loss curves show a similar downward trend, stabilizing around 0.3. However, there is a notable difference in the mean Intersection over Union (mIoU) values: the training mIoU reaches approximately 0.75, whereas the validation mIoU plateaus at around 0.65. This discrepancy indicates overfitting to the training dataset.

In [None]:
# Turn the per-epoch histories collected during training into arrays.
train_loss_history, val_loss_history = np.array(train_losses), np.array(val_losses)
train_miou_history, val_miou_history = np.array(train_mious), np.array(val_mious)
plot_losses(train_loss_history, val_loss_history)

In [None]:
# Plot the training and validation mIoU histories built in the previous cell.
plot_accu(train_miou_history, val_miou_history)

**Q11/ what should we have done to avoid overfitting?**

**answer:** Early stopping, data augmentation, stronger regularization / dropout

In [None]:
# Evaluate the model on the whole test set.
loss, iou, miou = test(model, test_loader, criterion, metric)
print(">>>> [FINAL TEST on the test set: ] Avg. loss: ", loss ," | Mean IoU: ", miou)
# Per-class IoU, labelled with the names of the classes whose id is below
# num_classes. strict=True raises if the two sequences differ in length.
class_encoding = {c.name: c.id for c in train_set.classes if c.id < num_classes}
for key, class_iou in zip(class_encoding, iou, strict=True):
    print(f"{key}: {class_iou:.4f}")

## B. Uncertainty evaluations with MCP
Here you will just use as confidence score the Maximum class probability (MCP)


In [None]:
# Run the model on the first test sample.
sample_idx = 0
img, target = test_set[sample_idx]

# Add a batch dimension and move both tensors to the GPU.
batch_img = img.unsqueeze(0).cuda()
batch_target = target.unsqueeze(0).cuda()
model.eval()
with torch.no_grad():
    outputs = model(batch_img)
    # Softmax over the class axis, then drop the batch dimension.
    outputs_proba = outputs.softmax(dim=1).squeeze(0)
    # Per pixel: highest class probability (the MCP) and its class index.
    confidence, pred = outputs_proba.max(0)

In [None]:
# Undo normalization on the image and convert to uint8.
mean = torch.tensor([0.485, 0.456, 0.406], device=img.device)
std = torch.tensor([0.229, 0.224, 0.225], device=img.device)
img = img * std[:, None, None] + mean[:, None, None]
img = F.to_dtype(img, torch.uint8, scale=True)

# Ground truth: give the 255 label its own index (21) so it gets a mask and a
# colour, then build one boolean mask per index 0..21.
tmp_target = target.masked_fill(target == 255, 21)
target_masks = tmp_target == torch.arange(22, device=target.device)[:, None, None]
img_segmented = draw_segmentation_masks(img, target_masks, alpha=1, colors=test_set.color_palette)

# `pred` was computed from a `.cuda()` batch while `img` comes straight from the
# dataset, so move the masks to the image's device before drawing on it.
pred_masks = (pred == torch.arange(22, device=pred.device)[:, None, None]).to(img.device)

pred_img = draw_segmentation_masks(img, pred_masks, alpha=1, colors=test_set.color_palette)

img = F.to_pil_image(img)
img_segmented = F.to_pil_image(img_segmented)
# Bring the confidence map back to the CPU before converting it to an image.
confidence_img = F.to_pil_image(confidence.cpu())
pred_img = F.to_pil_image(pred_img)

# Left to right: input, ground truth, prediction, confidence map.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(30, 15))
ax1.imshow(img)
ax2.imshow(img_segmented)
ax3.imshow(pred_img)
ax4.imshow(confidence_img)
plt.show()

**Q12/ The last image is related to the confidence score of the DNN. Can you explain why? What do the bright areas represent and what do the dark areas represent?**

**answer:** The confidence map shows the maximum predicted class probability per pixel (MCP); bright areas correspond to pixels where the model is very confident in its prediction (high max softmax value), while dark areas indicate uncertain regions, typically near boundaries, small objects, or confusing part.

### Now let's load the OOD test set

In [None]:
# Load the "ood" split with the same deterministic pipeline as validation/test.
test_ood_set = MUAD(root="./data", target_type="semantic", version="small", split="ood" , transforms=val_transform, download=True)
# Display the dataset object as the cell output.
test_ood_set

In [None]:
# Run the model on the first sample of the OOD split.
sample_idx = 0
img, target = test_ood_set[sample_idx]

# Add a batch dimension and move both tensors to the GPU.
batch_img = img.unsqueeze(0).cuda()
batch_target = target.unsqueeze(0).cuda()
model.eval()
with torch.no_grad():
    outputs = model(batch_img)
    # Softmax over the class axis, then drop the batch dimension.
    outputs_proba = outputs.softmax(dim=1).squeeze(0)
    # Per pixel: highest class probability (the MCP) and its class index.
    confidence, pred = outputs_proba.max(0)

In [None]:
# Undo normalization on the image and convert to uint8.
mean = torch.tensor([0.485, 0.456, 0.406], device=img.device)
std = torch.tensor([0.229, 0.224, 0.225], device=img.device)
img = img * std[:, None, None] + mean[:, None, None]
img = F.to_dtype(img, torch.uint8, scale=True)

# Ground truth: give the 255 label its own index (21) so it gets a mask and a
# colour, then build one boolean mask per index 0..21.
tmp_target = target.masked_fill(target == 255, 21)
target_masks = tmp_target == torch.arange(22, device=target.device)[:, None, None]
img_segmented = draw_segmentation_masks(img, target_masks, alpha=1, colors=test_set.color_palette)

# `pred` was computed from a `.cuda()` batch while `img` comes straight from the
# dataset, so move the masks to the image's device before drawing on it.
pred_masks = (pred == torch.arange(22, device=pred.device)[:, None, None]).to(img.device)

pred_img = draw_segmentation_masks(img, pred_masks, alpha=1, colors=test_set.color_palette)

img_pil = F.to_pil_image(img)
img_segmented = F.to_pil_image(img_segmented)
# Bring the confidence map back to the CPU before converting it to an image.
confidence_img = F.to_pil_image(confidence.cpu())
pred_img = F.to_pil_image(pred_img)

# Left to right: input, ground truth, prediction, confidence map.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(30, 15))
ax1.imshow(img_pil)
ax2.imshow(img_segmented)
ax3.imshow(pred_img)
ax4.imshow(confidence_img)
plt.show()

**According to the output is the model confident when it comes to labeling the bear and goat ? How about the bench ?**

**answer:** The model seems to be as confident as other elements of same size for the bear and the goat (except around the boundary)


**Q12 bis/ The last image is the related to the confidence score of the DNN. Can you explain why?**
**Are you happy with this image?**

**answer:** The image is relatively bright because MCP yields high softmax probabilities even where there are labeling errors.
No, this is not good: it means the model is overconfident and not well calibrated.


## C. Uncertainty evaluations with Temperature Scaling
**Q13/ please implement a temperature scaling using torch_uncertainty**

Before temperature scaling

In [None]:
from torch_uncertainty.metrics import CalibrationError

# Calibration error of the un-scaled model on the validation set: each pixel's
# confidence is compared with whether its prediction is correct (0/1), which
# is why the metric is configured as a binary task.
model.eval()
ece_metric = CalibrationError(task="binary", norm="l1").cuda()

# Per-batch confidences and correctness flags of the kept pixels.
all_confidences = []
all_correct = []

with torch.no_grad():
    for x, y in val_loader:
        x = x.cuda()
        # Drop axis 1 of the labels so they line up with `pred` below.
        y = y.cuda().squeeze(1)

        logits = model(x)
        probs = logits.softmax(dim=1)
        # Per pixel: highest class probability and its class index.
        conf, pred = probs.max(1)

        # Keep labelled pixels only (the `y < 19` test already excludes 255).
        mask = (y != 255) & (y < 19)
        correct = (pred == y) & mask

        all_confidences.append(conf[mask])
        all_correct.append(correct[mask])

confidences = torch.cat(all_confidences)
correct = torch.cat(all_correct)

ece_before = ece_metric(confidences, correct.float())
print("ECE before TS:", ece_before.item())


In [None]:
# Run the un-scaled model on the first OOD sample and display the result.
sample_idx = 0
img, target = test_ood_set[sample_idx]

# Add a batch dimension and move both tensors to the GPU.
batch_img = img.unsqueeze(0).cuda()
batch_target = target.unsqueeze(0).cuda()
model.eval()

with torch.no_grad():
    outputs = model(batch_img)
    outputs_proba = outputs.softmax(dim=1)
    outputs_proba = outputs_proba.squeeze(0)
    # Per pixel: highest class probability (the MCP) and its class index.
    confidence, pred = outputs_proba.max(0)

# Undo normalization on the image and convert to uint8.
mean = torch.tensor([0.485, 0.456, 0.406], device=img.device)
std = torch.tensor([0.229, 0.224, 0.225], device=img.device)
img = img * std[:, None, None] + mean[:, None, None]
img = F.to_dtype(img, torch.uint8, scale=True)

# Ground truth: give the 255 label its own index (21) so it gets a mask and a
# colour, then build one boolean mask per index 0..21.
tmp_target = target.masked_fill(target == 255, 21)
target_masks = tmp_target == torch.arange(22, device=target.device)[:, None, None]
img_segmented = draw_segmentation_masks(img, target_masks, alpha=1, colors=test_set.color_palette)

# `pred` lives on the GPU while `img` comes straight from the dataset, so move
# the masks to the image's device before drawing on it.
pred_masks = (pred == torch.arange(22, device=pred.device)[:, None, None]).to(img.device)
pred_img = draw_segmentation_masks(img, pred_masks, alpha=1, colors=test_set.color_palette)

img_pil = F.to_pil_image(img)
img_segmented = F.to_pil_image(img_segmented)
# Bring the confidence map back to the CPU before converting it to an image.
confidence_img = F.to_pil_image(confidence.cpu())
pred_img = F.to_pil_image(pred_img)

# Left to right: input, ground truth, prediction, confidence map.
fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(30, 15))
ax1.imshow(img_pil)
ax2.imshow(img_segmented)
ax3.imshow(pred_img)
ax4.imshow(confidence_img)
plt.show()


**Looking at the two graphs above, comment on the MCP uncertainty result: is the model overconfident or calibrated?**

**answer:** The MCP uncertainty shows that the model tends to be overconfident before TS.

After temperature scaling

In [None]:
from torch_uncertainty.post_processing import TemperatureScaler

class SegmentationLogitLoader:
    """Adapt a segmentation DataLoader for temperature-scaling calibration.

    Iterating yields ``(img, labels)`` pairs on the GPU, with axis 1 of the
    labels removed and every 255 label replaced by 0.

    Args:
        dataloader: Iterable of ``(img, labels)`` batches; must support ``len``.
        num_classes: Number of classes; every label must be below this value.
    """

    def __init__(self, dataloader, num_classes):
        self.dataloader = dataloader
        self.num_classes = num_classes

    def __iter__(self):
        """Yield GPU batches with 255 labels re-mapped to class 0.

        Raises:
            ValueError: If a batch still contains a label >= ``num_classes``.
        """
        for img, labels in self.dataloader:
            img = img.cuda()
            labels = labels.squeeze(1).cuda()

            # NOTE(review): ignored pixels (255) become class 0 here, so they
            # are treated as genuine class-0 targets by whatever consumes
            # these batches. Confirm this bias on the fitted temperature is
            # acceptable, or switch to a label value the calibration loss
            # ignores.
            labels = torch.where(labels == 255, 0, labels)

            # Explicit check instead of `assert`, which is stripped under -O.
            max_label = int(labels.max())
            if max_label >= self.num_classes:
                raise ValueError(
                    f"Label max {max_label} >= num_classes {self.num_classes}"
                )

            yield img, labels

    def __len__(self):
        """Return the number of batches of the wrapped dataloader."""
        return len(self.dataloader)

# Wrap the trained model and fit the temperature on the validation batches
# produced by SegmentationLogitLoader.
scaler = TemperatureScaler(model=model, device="cuda")
scaler.fit(SegmentationLogitLoader(val_loader, num_classes), progress=True)
print(f"Learned temperatures: {scaler.temperature[0].item()}")


Now let's see the new confidence score image after scaling

In [None]:
# Same OOD sample as before, this time passed through the fitted scaler.
with torch.no_grad():
    outputs = scaler(batch_img)
    outputs_proba = outputs.softmax(dim=1)
    outputs_proba = outputs_proba.squeeze(0)
    # Per pixel: highest class probability (the MCP) and its class index.
    confidence, pred = outputs_proba.max(0)

# `pred` lives on the GPU while `img` (the uint8 image built before scaling)
# does not, so move the masks to the image's device before drawing on it.
pred_masks = (pred == torch.arange(22, device=pred.device)[:, None, None]).to(img.device)
pred_img = draw_segmentation_masks(img, pred_masks, alpha=1, colors=test_set.color_palette)

# Bring the confidence map back to the CPU before converting it to an image.
confidence_img = F.to_pil_image(confidence.cpu())
pred_img = F.to_pil_image(pred_img)

# Left to right: input, prediction, confidence map.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(24, 12))
ax1.imshow(img_pil)
ax2.imshow(pred_img)
ax3.imshow(confidence_img)
plt.show()


**Did the model get more confident, or is it better calibrated? Comment on the temperature scaling graphs and results.**

**answer:** The model becomes a bit less confident when classifying the bear or the goat.


## D. Uncertainty evaluations with MC Dropout

Let us implement **MC dropout**. This technique, described in [this paper](https://arxiv.org/abs/1506.02142), allows us to obtain a better confidence score by keeping dropout active at test time.



**Q14/ Please implement MC Dropout using torch_uncertainty**

In [None]:
from torch_uncertainty.models.wrappers.mc_dropout import mc_dropout

def predict_with_mc_dropout(mc_model, image, num_estimators=20):
    """Run stochastic inference and summarise the estimators' predictions.

    ``mc_model(image)`` may return either a 5-D tensor ``[T, B, C, H, W]`` or
    a 4-D tensor in which the ``T`` estimators are stacked along the first
    axis (``[T * B, C, H, W]``).

    The 4-D case assumes estimator-major stacking (all samples of estimator
    0 first, then estimator 1, ...), i.e. what concatenating the
    per-estimator outputs along axis 0 gives — confirm against the wrapper
    in use.

    Args:
        mc_model: Callable returning the stacked logits for ``image``.
        image: Input batch forwarded to ``mc_model``.
        num_estimators: Number of estimators ``T`` used to split a 4-D
            output. If the first axis is not a multiple of it, every row is
            treated as one estimator of a single sample (previous behaviour).

    Returns:
        Tuple ``(mean_pred, variance, entropy)``:
        ``mean_pred`` ``[B, C, H, W]`` is the mean softmax probability,
        ``variance`` ``[B, H, W]`` the variance across estimators summed over
        classes, and ``entropy`` ``[B, H, W]`` the entropy of ``mean_pred``.
    """
    with torch.no_grad():
        logits = mc_model(image)

        if logits.dim() == 4:
            stacked = logits.shape[0]
            if num_estimators > 0 and stacked % num_estimators == 0:
                # [T * B, C, H, W] -> [T, B, C, H, W]
                logits = logits.reshape(
                    num_estimators, stacked // num_estimators, *logits.shape[1:]
                )
            else:
                # Fallback: [T, C, H, W] -> [T, 1, C, H, W]
                logits = logits.unsqueeze(1)

        # Softmax over the class axis of [T, B, C, H, W].
        probs = torch.softmax(logits, dim=2)

    mean_pred = probs.mean(dim=0)  # [B, C, H, W]
    variance = probs.var(dim=0).sum(dim=1)  # [B, H, W]
    # The small constant keeps log() finite where a probability is exactly 0.
    entropy = -(mean_pred * torch.log(mean_pred + 1e-10)).sum(dim=1)  # [B, H, W]

    return mean_pred, variance, entropy

# Average confidence / entropy / variance for each number of estimators T,
# filled by the loop and compared in the table printed afterwards.
results = {}

for T in [3, 20]:
    print(f"\n{'='*60}")
    print(f"MC Dropout with T={T} estimators")
    print(f"{'='*60}")

    # Wrap the trained model so that one call yields T stochastic predictions.
    mc_model_T = mc_dropout(
        core_model=model,
        num_estimators=T,
        last_layer=False,
        on_batch=False,
        task="segmentation"
    )
    mc_model_T.eval()
    mc_model_T.cuda()

    # val_loader does not shuffle, so the same first sample is used for each T.
    img, label = next(iter(val_loader))
    img = img[:1].cuda()
    label = label[:1].cuda()

    mean_pred, variance, entropy = predict_with_mc_dropout(mc_model_T, img, T)
    confidence, pred_class = mean_pred.max(dim=1)  # [1, H, W]

    # avg metrics
    avg_entropy = entropy.mean().item()
    avg_variance = variance.mean().item()
    avg_confidence = confidence.mean().item()

    results[T] = {
        'confidence': avg_confidence,
        'entropy': avg_entropy,
        'variance': avg_variance
    }

    print(f"Average confidence: {avg_confidence:.4f}")
    print(f"Average entropy: {avg_entropy:.4f}")
    print(f"Average variance: {avg_variance:.6f}")

    # 2x3 grid: input, ground truth, prediction / confidence, entropy, variance.
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))
    axes = axes.flatten()

    # Undo the normalisation (x * std + mean) and clip to [0, 1] for display.
    img_np = img[0].cpu().permute(1, 2, 0).numpy()
    mean_norm = np.array([0.485, 0.456, 0.406])
    std_norm = np.array([0.229, 0.224, 0.225])
    img_np = std_norm * img_np + mean_norm
    img_np = np.clip(img_np, 0, 1)

    axes[0].imshow(img_np)
    axes[0].set_title('Input Image')
    axes[0].axis('off')

    gt_img = label[0].cpu().squeeze().numpy()
    axes[1].imshow(gt_img, cmap='tab20', vmin=0, vmax=20)
    axes[1].set_title('Ground Truth')
    axes[1].axis('off')

    pred_img = pred_class[0].cpu().numpy()
    print(f"pred_img shape for imshow: {pred_img.shape}")  # Should be (H, W)
    axes[2].imshow(pred_img, cmap='tab20', vmin=0, vmax=20)
    axes[2].set_title(f'Prediction (T={T})')
    axes[2].axis('off')

    conf_img = confidence[0].cpu().numpy()
    im3 = axes[3].imshow(conf_img, cmap='hot', vmin=0, vmax=1)
    axes[3].set_title(f'Confidence\n(avg={avg_confidence:.3f})')
    axes[3].axis('off')
    plt.colorbar(im3, ax=axes[3], fraction=0.046)

    entropy_img = entropy[0].cpu().numpy()
    im4 = axes[4].imshow(entropy_img, cmap='viridis')
    axes[4].set_title(f'Uncertainty (Entropy)\n(avg={avg_entropy:.3f})')
    axes[4].axis('off')
    plt.colorbar(im4, ax=axes[4], fraction=0.046)

    variance_img = variance[0].cpu().numpy()
    im5 = axes[5].imshow(variance_img, cmap='plasma')
    axes[5].set_title(f'Variance\n(avg={avg_variance:.4f})')
    axes[5].axis('off')
    plt.colorbar(im5, ax=axes[5], fraction=0.046)

    # Save one figure per T before showing it.
    plt.tight_layout()
    plt.savefig(f'mc_dropout_T{T}.png', dpi=150, bbox_inches='tight')
    plt.show()

# Side-by-side table of the averages stored in `results` for T=3 and T=20.
print(f"\n{'='*60}")
print("COMPARISON: T=3 vs T=20")
print(f"{'='*60}")
print(f"{'Metric':<20} {'T=3':<12} {'T=20':<12} {'Change'}")
print(f"{'-'*60}")

# The loop variable is deliberately not called `metric`: that name is the
# global mIoU metric object passed to train()/test(), and rebinding it to a
# string would break any later evaluation cell.
for metric_name in ['confidence', 'entropy', 'variance']:
    val_3 = results[3][metric_name]
    val_20 = results[20][metric_name]
    # Relative change in percent; undefined (NaN) when the T=3 value is 0.
    change = ((val_20 - val_3) / val_3) * 100 if val_3 != 0 else float("nan")
    print(f"{metric_name.capitalize():<20} {val_3:<12.4f} {val_20:<12.4f} {change:+.2f}%")


**Try the MC dropout code with a low number of estimators T (e.g. 3) and a high number (e.g. 20). Explain the difference seen on the confidence image: is the model getting more or less confident?**

**answer:** With T=20, the model becomes slightly less confident compared to T=3, which is actually a good thing because more estimators give a more reliable measure of where the model is truly uncertain, especially at object boundaries and difficult areas.

## E. Uncertainty evaluations with Deep Ensembles
**Q15/ Please implement [Deep Ensembles](https://papers.nips.cc/paper/2017/file/9ef2ed4b7fd2c810847ffa5fa85bce38-Paper.pdf).**


1.   You need to train 3 DNNs and save them. (Go back to the training cell above, then train and save 3 different models.)
2.   Use TorchUncertainty to get predictions

You have two options either train several models using the code above or use TU to train the ensemble of models in parallel.

In [None]:
from torch_uncertainty.models import deep_ensembles

# Checkpoints of the separately trained U-Nets that make up the ensemble.
saved_model_paths = ['model1.pth', 'model2.pth', 'model3.pth']
ensemble_models = []

for member_idx, ckpt_path in enumerate(saved_model_paths, start=1):
    # Rebuild the architecture, restore its weights, then switch to inference mode on the GPU.
    member = UNet(num_classes)
    member.load_state_dict(torch.load(ckpt_path))
    member.eval().cuda()
    ensemble_models.append(member)
    print(f"Loaded model {member_idx} from {ckpt_path}")

# Wrap the individual members into a single TorchUncertainty ensemble module.
ensemble = deep_ensembles(ensemble_models, task="segmentation")
ensemble.eval().cuda()

print(f"\nDeep Ensemble created with {len(ensemble_models)} models")

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# 2x3 grid: input / ground truth / prediction on top, uncertainty maps below.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Undo the v2.Normalize(mean, std) applied by the dataset transforms so the
# first image of the batch can be displayed in [0, 1].
mean_norm = np.array([0.485, 0.456, 0.406])
std_norm = np.array([0.229, 0.224, 0.225])
img_np = img[0].cpu().permute(1, 2, 0).numpy()
img_np = np.clip(std_norm * img_np + mean_norm, 0, 1)

axes[0].imshow(img_np)
axes[0].set_title('Input Image')
axes[0].axis('off')

# Class-index maps share one categorical colour scale.
class_maps = (
    (label[0].cpu().squeeze(), 'Ground Truth'),
    (pred_class[0].cpu(), f'Ensemble Prediction ({len(ensemble_models)} models)'),
)
for ax, (class_map, caption) in zip(axes[1:3], class_maps):
    ax.imshow(class_map, cmap='tab20', vmin=0, vmax=20)
    ax.set_title(caption)
    ax.axis('off')

# Scalar maps: each one gets its own colormap and colorbar.
scalar_maps = (
    (confidence, f'Confidence (avg={confidence.mean().item():.3f})',
     {'cmap': 'hot', 'vmin': 0, 'vmax': 1}),
    (entropy, f'Uncertainty/Entropy (avg={entropy.mean().item():.3f})',
     {'cmap': 'viridis'}),
    (variance, f'Variance (avg={variance.mean().item():.4f})',
     {'cmap': 'plasma'}),
)
for ax, (values, caption, imshow_kwargs) in zip(axes[3:], scalar_maps):
    mappable = ax.imshow(values[0].cpu(), **imshow_kwargs)
    ax.set_title(caption)
    ax.axis('off')
    plt.colorbar(mappable, ax=ax, fraction=0.046)

plt.tight_layout()
plt.show()


Test your ensemble obtained either using option 1 or 2.

In [None]:
with torch.no_grad():
    # Take one validation batch and move it to the GPU.
    img, label = next(iter(val_loader))
    img = img.cuda()
    label = label.cuda()

    ensemble_output = ensemble(img)

    # NOTE(review): TorchUncertainty's deep-ensembles wrapper is documented to
    # concatenate the member outputs along the batch axis, i.e. it returns
    # (num_models * batch_size, num_classes, H, W) — confirm against the installed
    # version. Restore an explicit leading member axis so the reductions below act
    # on the intended dimensions; a 5-D output is left untouched.
    if ensemble_output.dim() == 4:
        ensemble_output = ensemble_output.reshape(
            -1, img.size(0), *ensemble_output.shape[1:]
        )

    # Per-member class probabilities: (num_models, batch, classes, H, W).
    all_probs = torch.softmax(ensemble_output, dim=2)

    # Deep Ensembles combine members as a uniform mixture, i.e. by averaging
    # their predicted probabilities (not their logits).
    probs = all_probs.mean(dim=0)  # (batch, classes, H, W)
    pred_class = probs.argmax(dim=1)
    confidence, _ = probs.max(dim=1)
    entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=1)

    # Member disagreement: variance over the member axis, averaged over classes.
    variance = all_probs.var(dim=0).mean(dim=1)

print(f"Prediction shape: {pred_class.shape}")
print(f"Confidence shape: {confidence.shape}")
print(f"Entropy shape: {entropy.shape}")
print(f"Variance shape: {variance.shape}")

Save the ensemble model

In [None]:
# Persist the state dict of the whole ensemble wrapper in a single checkpoint file.
final_model_path = "ensemble.pth"
ensemble_weights = ensemble.state_dict()
torch.save(ensemble_weights, final_model_path)
print(f"Model saved to {final_model_path}")

## F. Uncertainty evaluations with Packed-Ensembles
**Q\15 Please read [Packed-Ensembles](https://arxiv.org/pdf/2210.09184). Then implement a Packed-Ensembles U-Net, train it, and evaluate its uncertainty.**


In [None]:
class PackedUp(nn.Module):
    """Decoder stage: upsample, fuse the skip connection pack by pack, then refine.

    The upsampled features and the skip features are both projected to ``out_ch``
    channels, split into ``pack`` equal channel groups, concatenated group by
    group, merged back to ``out_ch`` channels with a 1x1 convolution and finally
    passed through ``DoubleConv``.

    NOTE(review): every convolution here is dense (no ``groups=`` argument), so
    the packs are mixed by ``merge`` and ``conv``. Confirm whether grouped
    convolutions were intended, as in the Packed-Ensembles paper.

    Args:
        in_ch: Number of channels of the low-resolution input ``x``.
        skip_ch: Number of channels of the encoder skip tensor.
        out_ch: Number of channels produced by the stage; must be a multiple
            of ``pack``.
        pack: Number of channel groups.
        bilinear: If True, upsample with bilinear interpolation followed by a
            1x1 projection; otherwise use a stride-2 transposed convolution.

    Raises:
        ValueError: If ``pack`` is not positive or ``out_ch`` is not a multiple
            of ``pack``.
    """

    def __init__(self, in_ch, skip_ch, out_ch, pack=2, bilinear=True):
        super().__init__()
        # Explicit check instead of ``assert`` so it still runs under ``python -O``.
        if pack < 1 or out_ch % pack != 0:
            raise ValueError(
                f"out_ch ({out_ch}) must be a multiple of pack ({pack})"
            )

        self.pack = pack
        self.bilinear = bilinear

        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False)
            self.up_proj = nn.Conv2d(in_ch, out_ch, 1, bias=False)
        else:
            # The transposed convolution already maps to out_ch channels.
            self.up = nn.ConvTranspose2d(in_ch, out_ch, 2, stride=2)
            self.up_proj = nn.Identity()

        self.skip_proj = nn.Conv2d(skip_ch, out_ch, 1, bias=False)
        self.merge = nn.Conv2d(2 * out_ch, out_ch, 1, bias=False)
        self.conv = DoubleConv(out_ch, out_ch)

    def forward(self, x, skip):
        """Fuse upsampled ``x`` with ``skip`` and return the refined feature map."""
        x = self.up_proj(self.up(x))
        skip = self.skip_proj(skip)

        # Odd input sizes can leave the upsampled map off by a pixel; align it
        # with the skip tensor. ``nn.functional`` is used explicitly so this does
        # not depend on what the notebook-level alias ``F`` currently points to.
        if x.shape[-2:] != skip.shape[-2:]:
            x = nn.functional.interpolate(
                x, size=skip.shape[-2:], mode="bilinear", align_corners=False
            )

        batch, channels, height, width = x.shape
        per_pack = channels // self.pack
        # ``reshape`` (rather than ``view``) also accepts non-contiguous tensors.
        x = x.reshape(batch, self.pack, per_pack, height, width)
        skip = skip.reshape(batch, self.pack, per_pack, height, width)

        # Concatenate inside each pack, then flatten packs back into channels.
        fused = torch.cat([x, skip], dim=2).reshape(batch, -1, height, width)
        return self.conv(self.merge(fused))

class PackedUNet(nn.Module):
    """U-Net whose decoder is built from ``PackedUp`` stages.

    The encoder is a ``DoubleConv`` stem followed by four ``Down`` stages; the
    decoder applies four ``PackedUp`` stages, each followed by the shared
    ``Dropout2d(0.1)`` layer, and ``OutConv`` maps to ``classes`` channels.

    Args:
        classes: Number of output channels passed to ``OutConv``.
        base_ch: Channel width of the first encoder stage.
        pack: Number of channel groups forwarded to every ``PackedUp`` stage.
    """

    def __init__(self, classes, base_ch=32, pack=2):
        super().__init__()

        # Widths double over the first four stages; the bottleneck keeps the last width.
        w1, w2, w3, w4 = (base_ch * 2 ** level for level in range(4))
        w5 = w4

        # Encoder
        self.inc = DoubleConv(3, w1)
        self.down1 = Down(w1, w2)
        self.down2 = Down(w2, w3)
        self.down3 = Down(w3, w4)
        self.down4 = Down(w4, w5)

        # Decoder: (input channels, skip channels, output channels)
        self.up1 = PackedUp(w5, w4, w3, pack=pack)
        self.up2 = PackedUp(w3, w3, w2, pack=pack)
        self.up3 = PackedUp(w2, w2, w1, pack=pack)
        self.up4 = PackedUp(w1, w1, w1, pack=pack)

        self.dropout = nn.Dropout2d(0.1)
        self.outc = OutConv(w1, classes)

    def forward(self, x):
        """Return the output of ``OutConv`` for the input batch ``x``."""
        enc1 = self.inc(x)
        enc2 = self.down1(enc1)
        enc3 = self.down2(enc2)
        enc4 = self.down3(enc3)
        out = self.down4(enc4)

        # Each decoder stage consumes the matching encoder output, deepest first.
        decoder = (
            (self.up1, enc4),
            (self.up2, enc3),
            (self.up3, enc2),
            (self.up4, enc1),
        )
        for up_stage, skip in decoder:
            out = self.dropout(up_stage(out, skip))
        return self.outc(out)

In [None]:
# Packed U-Net with 4 packs, placed on the GPU.
model = PackedUNet(classes=num_classes, pack=4).cuda()

# Class-weighted cross-entropy; label 255 is excluded from the loss.
criterion = nn.CrossEntropyLoss(
    weight=class_weights,
    ignore_index=255,
)

optimizer = optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    weight_decay=weight_decay,
)

# Multiply the learning rate by lr_decay every lr_decay_epochs scheduler steps.
scheduler = lr_scheduler.StepLR(
    optimizer,
    step_size=lr_decay_epochs,
    gamma=lr_decay,
)

# mIoU metric on the GPU, also ignoring label 255.
metric = MeanIntersectionOverUnion(
    num_classes=num_classes,
    ignore_index=255,
).cuda()

In [None]:
# Train the Packed U-Net, record per-epoch metrics and keep the weights of the
# epoch with the best validation mIoU.
train_losses, val_losses = [], []
train_mious, val_mious = [], []
train_ious, val_ious = [], []

best_val_miou = -1.0
best_state = None

# Shortened to 20 epochs (instead of the notebook-level nb_epochs) because longer
# runs take too long on Google Colab.
# NOTE(review): the StepLR scheduler for this model uses step_size=lr_decay_epochs;
# if that is still 20, the decay only happens after the last epoch — confirm intended.
nb_epochs = 20
for epoch in range(1, nb_epochs + 1):
    tr_loss, tr_iou_per_class, tr_miou = train(
        model, train_loader, optimizer, criterion, metric, iteration_loss=False
    )
    va_loss, va_iou_per_class, va_miou = test(
        model, val_loader, criterion, metric, iteration_loss=False
    )

    scheduler.step()

    train_losses.append(tr_loss)
    val_losses.append(va_loss)
    # mIoU may come back as a tensor or as a plain number; store Python floats.
    train_mious.append(tr_miou.item() if hasattr(tr_miou, "item") else float(tr_miou))
    val_mious.append(va_miou.item() if hasattr(va_miou, "item") else float(va_miou))
    train_ious.append(tr_iou_per_class.detach().cpu())
    val_ious.append(va_iou_per_class.detach().cpu())

    print(
        f"Epoch [{epoch:03d}/{nb_epochs:03d}] "
        f"lr={scheduler.get_last_lr()[0]:.2e} | "
        f"train: loss={tr_loss:.4f}, mIoU={train_mious[-1]:.4f} | "
        f"val: loss={va_loss:.4f}, mIoU={val_mious[-1]:.4f}"
    )

    if val_mious[-1] > best_val_miou:
        best_val_miou = val_mious[-1]
        # copy=True forces an independent snapshot even when the weights already
        # live on the CPU; otherwise later optimiser steps would overwrite it.
        best_state = {
            k: v.detach().to("cpu", copy=True) for k, v in model.state_dict().items()
        }

# Restore the best-performing weights before saving.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\nBest val mIoU: {best_val_miou:.4f}")

# NOTE(review): "model2.pth" is also one of the checkpoint names loaded as a plain
# UNet for the deep ensemble — confirm that overwriting it here is intended.
torch.save(model.state_dict(), "model2.pth")

In [None]:
# Rebuild the Packed U-Net and restore the trained weights for inference.
model = PackedUNet(num_classes, pack=4)
model.load_state_dict(torch.load("model2.pth", map_location="cpu"))
# The cells below feed CUDA tensors to `predict`, so the model must be on the GPU;
# eval mode disables the Dropout2d layers so a single forward pass is deterministic.
model = model.cuda().eval()

In [None]:
@torch.no_grad()
def predict(model, image):
    """Run one forward pass and derive prediction and uncertainty maps.

    Args:
        model: Callable returning class logits with the class axis at dim 1.
        image: Input passed to ``model`` unchanged.

    Returns:
        Tuple ``(pred_class, confidence, entropy, variance)``; each entry has
        the class axis (dim 1) reduced: arg-max class index, maximum softmax
        probability, entropy of the softmax distribution (natural log,
        stabilised with 1e-10) and variance of the softmax probabilities
        across classes.
    """
    class_probs = model(image).softmax(dim=1)
    log_probs = (class_probs + 1e-10).log()

    return (
        class_probs.argmax(dim=1),
        class_probs.max(dim=1).values,
        -(class_probs * log_probs).sum(dim=1),
        class_probs.var(dim=1),
    )


In [None]:
def _label_panel(ax, caption):
    """Set the panel title and hide its axes."""
    ax.set_title(caption)
    ax.axis('off')


def _heat_panel(ax, values, caption, **imshow_kwargs):
    """Draw a scalar map on ``ax`` together with its own colorbar."""
    mappable = ax.imshow(values, **imshow_kwargs)
    _label_panel(ax, caption)
    plt.colorbar(mappable, ax=ax, fraction=0.046)


# Run the Packed U-Net on the current batch; only the first sample is plotted.
pred_class, confidence, entropy, variance = predict(model, img)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Undo the v2.Normalize(mean, std) applied by the dataset transforms for display.
mean_norm = np.array([0.485, 0.456, 0.406])
std_norm = np.array([0.229, 0.224, 0.225])
img_np = img[0].cpu().permute(1, 2, 0).numpy()
img_np = np.clip(std_norm * img_np + mean_norm, 0, 1)

# Top row: input, ground truth, predicted classes.
axes[0].imshow(img_np)
_label_panel(axes[0], 'Input Image')

axes[1].imshow(label[0].cpu().squeeze(), cmap='tab20', vmin=0, vmax=20)
_label_panel(axes[1], 'Ground Truth')

axes[2].imshow(pred_class[0].cpu(), cmap='tab20', vmin=0, vmax=20)
_label_panel(axes[2], 'Prediction')

# Bottom row: scalar maps returned by `predict`, each with a colorbar.
_heat_panel(
    axes[3], confidence[0].cpu(),
    f'Confidence (avg={confidence.mean().item():.3f})',
    cmap='hot', vmin=0, vmax=1,
)
_heat_panel(
    axes[4], entropy[0].cpu(),
    f'Uncertainty/Entropy (avg={entropy.mean().item():.3f})',
    cmap='viridis',
)
_heat_panel(
    axes[5], variance[0].cpu(),
    f'Variance (avg={variance.mean().item():.4f})',
    cmap='plasma',
)

plt.tight_layout()
plt.show()


In [None]:
# Pick one sample from the OOD test set and run the Packed U-Net on it.
sample_idx = 0
img, target = test_ood_set[sample_idx]

# Add a leading batch axis and move both tensors to the GPU.
batch_img = img.unsqueeze(0).cuda()
batch_target = target.unsqueeze(0).cuda()

pred_class, confidence, entropy, variance = predict(model, batch_img)

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# Undo the v2.Normalize(mean, std) applied by the dataset transforms for display.
mean_norm = np.array([0.485, 0.456, 0.406])
std_norm = np.array([0.229, 0.224, 0.225])
img_np = batch_img[0].cpu().permute(1, 2, 0).numpy()
img_np = np.clip(std_norm * img_np + mean_norm, 0, 1)

class_style = {'cmap': 'tab20', 'vmin': 0, 'vmax': 20}
# One entry per panel: (data, title, imshow keyword arguments, draw a colorbar?)
panels = [
    (img_np, 'Input Image', {}, False),
    (batch_target[0].cpu().squeeze(), 'Ground Truth', class_style, False),
    (pred_class[0].cpu(), 'Prediction', class_style, False),
    (confidence[0].cpu(), f'Confidence (avg={confidence.mean().item():.3f})',
     {'cmap': 'hot', 'vmin': 0, 'vmax': 1}, True),
    (entropy[0].cpu(), f'Uncertainty/Entropy (avg={entropy.mean().item():.3f})',
     {'cmap': 'viridis'}, True),
    (variance[0].cpu(), f'Variance (avg={variance.mean().item():.4f})',
     {'cmap': 'plasma'}, True),
]

for ax, (data, caption, style, with_colorbar) in zip(axes, panels):
    drawn = ax.imshow(data, **style)
    ax.set_title(caption)
    ax.axis('off')
    if with_colorbar:
        plt.colorbar(drawn, ax=ax, fraction=0.046)

plt.tight_layout()
plt.show()


**Please conclude your report**

In this lab, we tested several techniques to improve segmentation and to handle OOD data. We compared deterministic methods like deep ensembles and temperature scaling with stochastic approaches such as MC Dropout and packed ensembles. The packed ensemble performed the best, giving accurate segmentations and the most reliable uncertainty estimates. MC Dropout also produced useful uncertainty maps, while temperature scaling mainly helped to calibrate the predictions. Overall, our experiments show that combining model design and uncertainty-aware methods can improve segmentation and better detect OOD regions.