# BDD100K Semantic Segmentation - U-Net + EfficientNet-B3

**Context:** Comparative study of semantic segmentation architectures for autonomous driving

## Table of Contents
1. Setup & Installation
2. Imports
3. Configuration
4. Data Loading
5. Model Architecture
6. Training
7. Evaluation
8. Visualization

##  Imports

Import all required libraries.

In [None]:
# Core PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast, GradScaler
from torch.optim.lr_scheduler import ReduceLROnPlateau

# Data augmentation
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Segmentation model
import segmentation_models_pytorch as smp
from segmentation_models_pytorch.losses import FocalLoss
from segmentation_models_pytorch.losses import DiceLoss

# Utils
from tqdm import tqdm
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import os

ModuleNotFoundError: No module named 'segmentation_models_pytorch'

## Configuration

Define class mappings and hyperparameters.

In [None]:

# BDD100K semantic classes: label id -> human-readable name.
# The position of each name in the list below is its label id.
classes = dict(enumerate([
    'road',
    'sidewalk',
    'building',
    'wall',
    'fence',
    'pole',
    'traffic light',
    'traffic sign',
    'vegetation',
    'terrain',
    'sky',
    'person',
    'rider',
    'car',
    'truck',
    'bus',
    'train',
    'motorcycle',
    'bicycle',
]))

# Class names in id order, used when printing per-class metrics.
class_names = [*classes.values()]

# Original label id -> contiguous training index (its position in `classes`).
class_mapping = {
    original_id: position for position, original_id in enumerate(classes)
}






In [None]:
# Hyperparameters
ignore_index = 255  # fill value for unmapped mask pixels; passed to the losses
image_size = 640    # side length used by the Resize transforms
batch_size = 8
num_classes = len(classes)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


## Data Transforms

Define augmentation pipelines for training and validation.

In [None]:
# Channel mean/std used for normalization (the standard ImageNet statistics).
_normalize_kwargs = {
    "mean": [0.485, 0.456, 0.406],
    "std": [0.229, 0.224, 0.225],
}

# Training pipeline: resize, random flip and photometric/noise/blur
# augmentations, then normalization and conversion to a tensor.
train_transform = A.Compose(
    [
        A.Resize(height=image_size, width=image_size),
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(p=0.3),
        A.GaussNoise(p=0.2),
        A.Blur(blur_limit=3, p=0.2),
        A.Normalize(**_normalize_kwargs),
        ToTensorV2(),
    ]
)

# Validation pipeline: deterministic resize + normalization only.
val_transform = A.Compose(
    [
        A.Resize(height=image_size, width=image_size),
        A.Normalize(**_normalize_kwargs),
        ToTensorV2(),
    ]
)

## Dataset Preparation


Organize your dataset as follows:
```
data/
├── images/
│   ├── train/
│   └── val/
└── labels/
    ├── train/
    └── val/
```

**Set your dataset path:**
- Local: `DATA_ROOT = "data"`
- Kaggle: `DATA_ROOT = "/kaggle/input/bdd10k"`
- Colab: `DATA_ROOT = "/content/drive/MyDrive/bdd10k"`


In [None]:

class BDDDataset(Dataset):
    """Paired image / segmentation-mask dataset read from two directories.

    Images and masks are paired by position after sorting each directory
    listing, so both directories must contain the same number of files and
    sort into the same order.

    Args:
        images_dir: Directory containing the input images.
        masks_dir: Directory containing the label masks.
        transform: Optional callable invoked as
            ``transform(image=..., mask=...)`` returning a mapping with
            ``'image'`` and ``'mask'`` entries. The returned mask must
            support ``.long()`` (e.g. a torch tensor).

    Raises:
        ValueError: If the two directories hold a different number of files,
            because positional pairing would then be misaligned.
    """

    def __init__(self, images_dir, masks_dir, transform=None):
        self.images_dir = images_dir
        self.masks_dir = masks_dir
        self.images = sorted(os.listdir(images_dir))
        self.masks = sorted(os.listdir(masks_dir))
        self.transform = transform

        # Pairing is purely positional; a count mismatch guarantees either a
        # late IndexError or silently wrong image/mask pairs, so fail early.
        if len(self.images) != len(self.masks):
            raise ValueError(
                f"Image/mask count mismatch: {len(self.images)} files in "
                f"{images_dir!r} vs {len(self.masks)} files in {masks_dir!r}"
            )

    def __len__(self):
        """Return the number of image/mask pairs."""
        return len(self.images)

    def remap_mask_numpy(self, mask):
        """Translate raw label ids to training indices.

        Every pixel whose value is a key of the module-level ``class_mapping``
        is replaced by the mapped index; all other pixels are set to the
        module-level ``ignore_index``. The result is an int64 array with the
        same shape as ``mask``.
        """
        remapped = np.full_like(mask, ignore_index, dtype=np.int64)
        for original_id, new_id in class_mapping.items():
            remapped[mask == original_id] = new_id
        return remapped

    def __getitem__(self, idx):
        """Load, remap and (optionally) transform the pair at position ``idx``.

        Returns:
            ``(image, mask)``. Without a transform these are numpy arrays
            (RGB image and int64 mask); with a transform they are whatever
            the transform returns, with the mask cast via ``.long()``.
        """
        img_path = os.path.join(self.images_dir, self.images[idx])
        mask_path = os.path.join(self.masks_dir, self.masks[idx])

        # Context managers release the underlying file handles promptly.
        with Image.open(img_path) as img_file:
            rgb = img_file.convert("RGB")
        with Image.open(mask_path) as mask_file:
            # Nearest-neighbour keeps label ids intact when sizes differ.
            mask = np.array(mask_file.resize(rgb.size, Image.NEAREST))
        image = np.array(rgb)

        mask = self.remap_mask_numpy(mask)
        if self.transform:
            transformed = self.transform(image=image, mask=mask)
            image = transformed['image']
            mask = transformed['mask'].long()

        return image, mask

# Dataset location
DATA_ROOT = "data"  # Change this to your dataset path


def _make_split(split, transform):
    """Build the BDDDataset for one split ("train" or "val") under DATA_ROOT."""
    return BDDDataset(
        images_dir=f"{DATA_ROOT}/images/{split}",
        masks_dir=f"{DATA_ROOT}/labels/{split}",
        transform=transform,
    )


train_dataset = _make_split("train", train_transform)
val_dataset = _make_split("val", val_transform)

# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

##  Loss Function

Compute class weights and define combined loss.

In [None]:

def compute_class_weights(dataloader, num_classes):
    """Derive inverse-frequency class weights from the masks of a dataloader.

    Pixels are counted per class over every batch yielded by ``dataloader``.
    Each class weight is ``1 / (count + 1)``, and the weights are then
    rescaled so that they sum to ``num_classes``.

    Args:
        dataloader: Iterable yielding ``(images, masks)`` batches; only the
            masks are used.
        num_classes: Number of classes; labels outside ``[0, num_classes)``
            (such as the ignore label) are not counted.

    Returns:
        A float tensor of shape ``(num_classes,)`` with the class weights.
    """
    # Integer accumulator: float32 cannot represent counts above 2**24 exactly.
    pixel_counts = torch.zeros(num_classes, dtype=torch.long)

    for _, masks in tqdm(dataloader, desc="Computing class weights"):
        labels = masks.long().flatten()
        # Keep only valid class ids so bincount never sees the ignore label.
        labels = labels[(labels >= 0) & (labels < num_classes)]
        # One histogram pass per batch instead of one full scan per class.
        pixel_counts += torch.bincount(labels, minlength=num_classes).cpu()

    # +1 keeps the weight finite for classes that never occur.
    weights = 1.0 / (pixel_counts.float() + 1)
    weights = weights / weights.sum() * num_classes
    return weights

# Count class frequencies over the training loader, then move the resulting
# weights to the device the model and loss will run on.
class_weights = compute_class_weights(
    dataloader=train_loader,
    num_classes=num_classes,
)
class_weights_tensor = class_weights.to(device=device)

class CombinedLoss(nn.Module):
    """Blend of focal, Dice and class-weighted cross-entropy losses.

    The forward pass returns ``0.5 * focal + 0.25 * dice + 0.25 * ce``.

    Args:
        class_weights_tensor: Per-class weights handed to
            ``nn.CrossEntropyLoss`` (the focal and Dice terms do not use them).
        ignore_index: Target value passed to all three losses as the label
            to ignore.
    """

    def __init__(self, class_weights_tensor, ignore_index=255):
        super().__init__()

        # The same ignore label is forwarded to every component loss.
        shared = {"ignore_index": ignore_index}

        self.focal_loss = FocalLoss(
            mode='multiclass', alpha=None, gamma=2.5, **shared
        )
        self.dice_loss = DiceLoss(mode='multiclass', **shared)
        self.ce_loss = nn.CrossEntropyLoss(weight=class_weights_tensor, **shared)

    def forward(self, outputs, targets):
        """Return the weighted sum of the three losses for one batch."""
        focal_term = 0.5 * self.focal_loss(outputs, targets)
        dice_term = 0.25 * self.dice_loss(outputs, targets)
        ce_term = 0.25 * self.ce_loss(outputs, targets)
        return focal_term + dice_term + ce_term





##  Model Architecture

U-Net with EfficientNet-B3 encoder (pretrained on ImageNet).

In [None]:
# U-Net decoder on top of an EfficientNet-B3 encoder initialised with the
# "imagenet" weights; one output channel per class.
model = smp.Unet(
    encoder_name="efficientnet-b3",
    encoder_weights="imagenet",
    in_channels=3,
    classes=num_classes,
).to(device)

# Use the module-level ignore_index (the same value the dataset writes into
# unmapped mask pixels) rather than a second hard-coded 255, so the two
# cannot drift apart.
criterion = CombinedLoss(class_weights_tensor, ignore_index=ignore_index)

##  Training - Phase 1

Freeze the encoder and train only the decoder and segmentation head.

In [None]:
# Phase 1: the encoder is excluded from gradient updates.
for encoder_param in model.encoder.parameters():
    encoder_param.requires_grad = False

# One parameter group per trainable part, both at the same learning rate.
optimizer = optim.Adam(
    [
        {'params': trainable_part.parameters(), 'lr': 1e-3}
        for trainable_part in (model.decoder, model.segmentation_head)
    ]
)

In [None]:


# Training configuration
num_epochs = 50
train_losses = []
val_losses = []
best_val_loss = float('inf')
best_state = None  # copy of the weights from the best validation epoch so far
patience = 6
patience_counter = 0

# Mixed precision is enabled only on CUDA. On CPU both autocast and the
# gradient scaler are switched off explicitly instead of requesting 'cuda'.
use_amp = device.type == 'cuda'
scaler = GradScaler(device.type, enabled=use_amp)

# `verbose` is deprecated for LR schedulers (since PyTorch 2.2), so it is not
# passed; learning-rate changes are reported explicitly after each step below.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-6)

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for (images, masks) in progress_bar:
        images = images.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()

        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(images)
            loss = criterion(outputs, masks)

        # With the scaler disabled these calls reduce to a plain
        # backward() / optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})

    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # Validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, masks in tqdm(val_loader, desc="Validation", leave=False):
            images = images.to(device)
            masks = masks.to(device)
            outputs = model(images)
            loss = criterion(outputs, masks)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Early stopping
    if val_loss < best_val_loss:
        improvement = best_val_loss - val_loss
        best_val_loss = val_loss
        patience_counter = 0
        # Snapshot the weights on the CPU so the best epoch can be restored
        # after training; later epochs may be worse.
        best_state = {
            key: value.detach().cpu().clone()
            for key, value in model.state_dict().items()
        }
        print(f"✓ Best model! Val: {val_loss:.4f} (↓ {improvement:.4f})")
    else:
        patience_counter += 1
        print(f"  Epoch {epoch+1}/{num_epochs} | Train: {train_loss:.4f} | Val: {val_loss:.4f} | Patience: [{patience_counter}/{patience}]")

        if patience_counter >= patience:
            print(f"\n⏸ Early stopping! No improvement for {patience} epochs")
            break

    previous_lrs = [group['lr'] for group in optimizer.param_groups]
    scheduler.step(val_loss)
    current_lrs = [group['lr'] for group in optimizer.param_groups]
    if current_lrs != previous_lrs:
        print(f"  Learning rate reduced: {previous_lrs} -> {current_lrs}")

# Leave the model holding the best-validation weights rather than whatever
# the final (possibly early-stopped) epoch produced.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"\n Phase 1 complete! Best val loss: {best_val_loss:.4f}")




##  Fine-tuning - Phase 2

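Strategy: unfreeze the encoder and fine-tune the entire model, using a lower learning rate for the encoder (1e-5) than for the decoder and segmentation head (5e-5).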

In [None]:
# Phase 2: make the encoder trainable again.
for encoder_param in model.encoder.parameters():
    encoder_param.requires_grad = True

# Fresh optimizer with one group per model part; the encoder gets the
# smallest learning rate.
optimizer = torch.optim.Adam(
    [
        {'params': model_part.parameters(), 'lr': part_lr}
        for model_part, part_lr in (
            (model.encoder, 1e-5),
            (model.decoder, 5e-5),
            (model.segmentation_head, 5e-5),
        )
    ]
)

In [None]:

# `verbose` is deprecated for LR schedulers (since PyTorch 2.2), so it is not
# passed; learning-rate changes are reported explicitly after each step below.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, min_lr=1e-7)

# Epoch numbering continues from the losses already recorded, so the
# histories in train_losses / val_losses form a single curve.
start_epoch = len(train_losses)
num_epochs = start_epoch + 30
patience = 6
patience_counter = 0
finetune_best_state = None  # weights of the best fine-tuning epoch, if any

# Mixed precision is enabled only on CUDA. On CPU both autocast and the
# gradient scaler are switched off explicitly instead of requesting 'cuda'.
use_amp = device.type == 'cuda'
scaler = GradScaler(device.type, enabled=use_amp)

for epoch in range(start_epoch, num_epochs):
    # Training
    model.train()
    train_loss = 0.0

    for images, masks in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images = images.to(device)
        masks = masks.to(device)

        optimizer.zero_grad()

        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(images)
            loss = criterion(outputs, masks)

        # With the scaler disabled these calls reduce to a plain
        # backward() / optimizer.step().
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item()

    train_loss /= len(train_loader)
    train_losses.append(train_loss)

    # Validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device)
            masks = masks.to(device)
            outputs = model(images)
            loss = criterion(outputs, masks)
            val_loss += loss.item()

    val_loss /= len(val_loader)
    val_losses.append(val_loss)

    # Early stopping (best_val_loss carries over from the earlier training
    # cell, so only epochs that beat it count as improvements).
    if val_loss < best_val_loss:
        improvement = best_val_loss - val_loss
        best_val_loss = val_loss
        patience_counter = 0
        # Snapshot the weights on the CPU so the best epoch can be restored
        # after training; later epochs may be worse.
        finetune_best_state = {
            key: value.detach().cpu().clone()
            for key, value in model.state_dict().items()
        }
        print(f"✓ Best model! Val: {val_loss:.4f} (↓ {improvement:.4f})")
    else:
        patience_counter += 1
        print(f"  Epoch {epoch+1}/{num_epochs} | Train: {train_loss:.4f} | Val: {val_loss:.4f} | Patience: [{patience_counter}/{patience}]")

        if patience_counter >= patience:
            print(f"\n⏸ Early stopping! No improvement for {patience} epochs")
            break

    previous_lrs = [group['lr'] for group in optimizer.param_groups]
    scheduler.step(val_loss)
    current_lrs = [group['lr'] for group in optimizer.param_groups]
    if current_lrs != previous_lrs:
        print(f"  Learning rate reduced: {previous_lrs} -> {current_lrs}")

# Restore the best fine-tuning epoch when there was one; if no epoch beat the
# previous best validation loss the model keeps its final weights.
if finetune_best_state is not None:
    model.load_state_dict(finetune_best_state)

print(f"\n Fine-tuning complete! Best val loss: {best_val_loss:.4f}")

##  Training Curves

In [None]:
# Plot the per-epoch training and validation losses on one set of axes.
plt.figure(figsize=(10, 5))
for loss_history, curve_label in (
    (train_losses, 'Train Loss'),
    (val_losses, 'Validation Loss'),
):
    plt.plot(loss_history, label=curve_label)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()



##  Evaluation

Calculate the per-class IoU and the overall pixel accuracy on the validation set (pixels labelled 255 are excluded from the accuracy).

In [None]:
def calculate_accuracy(model, val_loader, device, ignore_index=255):
    """Compute pixel accuracy (in percent) over a loader, skipping ignored pixels.

    Args:
        model: Callable module; ``model(images)`` must return per-class scores
            with the class dimension at index 1.
        val_loader: Iterable yielding ``(images, masks)`` batches.
        device: Device the batches are moved to before the forward pass.
        ignore_index: Mask value excluded from both the numerator and the
            denominator. Defaults to 255.

    Returns:
        Percentage of non-ignored pixels whose argmax prediction equals the
        mask label, or ``0.0`` when there are no non-ignored pixels (for
        example an empty loader).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device)
            masks = masks.long().to(device)

            preds = torch.argmax(model(images), dim=1)

            valid_mask = masks != ignore_index
            correct += ((preds == masks) & valid_mask).sum().item()
            total += valid_mask.sum().item()

    # Avoid ZeroDivisionError when nothing could be scored.
    if total == 0:
        return 0.0
    return correct / total * 100

def calculate_iou_per_class(model, val_loader, device, num_classes=num_classes, ignore_index=255):
    """Compute and print the IoU of every class plus the mean IoU.

    The loader is traversed once; intersections and unions for all classes
    are accumulated from the same forward pass. Pixels whose mask value is
    ``ignore_index`` are excluded, so predictions made on unlabeled pixels do
    not enlarge any class's union.

    Args:
        model: Callable module; ``model(images)`` must return per-class scores
            with the class dimension at index 1.
        val_loader: Iterable yielding ``(images, masks)`` batches.
        device: Device the batches are moved to before the forward pass.
        num_classes: Number of classes to evaluate (ids ``0..num_classes-1``).
        ignore_index: Mask value excluded from the metric. Defaults to 255.

    Returns:
        List with one IoU per class, in class-id order. A class that appears
        in neither the predictions nor the masks gets ``0.0`` and still
        counts towards the printed mean.
    """
    model.eval()
    intersections = [0] * num_classes
    unions = [0] * num_classes

    with torch.no_grad():
        for images, masks in val_loader:
            images = images.to(device)
            masks = masks.long().to(device)

            # Single forward pass per batch, shared by all classes.
            preds = torch.argmax(model(images), dim=1)
            valid = masks != ignore_index

            for class_id in range(num_classes):
                pred_class = (preds == class_id) & valid
                true_class = masks == class_id

                intersections[class_id] += (pred_class & true_class).sum().item()
                unions[class_id] += (pred_class | true_class).sum().item()

    iou_per_class = []
    for class_id in range(num_classes):
        union = unions[class_id]
        iou = intersections[class_id] / union if union > 0 else 0.0
        iou_per_class.append(iou)
        print(f"{class_names[class_id]:12}: IoU={iou:.3f}")

    mean_iou = sum(iou_per_class) / len(iou_per_class) if iou_per_class else 0.0
    print(f"\n Mean IoU: {mean_iou:.3f}")
    return iou_per_class





In [None]:
# Pixel accuracy is returned, not printed, by calculate_accuracy, so report
# it here; calculate_iou_per_class prints its own per-class table.
acc = calculate_accuracy(model, val_loader, device)
print(f"Pixel accuracy: {acc:.2f}%")
iou = calculate_iou_per_class(model, val_loader, device)

##  Visualizations

Show model predictions vs ground truth.

In [None]:
def denormalize(tensor):
    """Undo the mean/std normalization and clamp the result to [0, 1].

    Args:
        tensor: Tensor whose channel dimension (size 3) sits third from the
            end, e.g. ``(3, H, W)`` or ``(N, 3, H, W)``, so that it broadcasts
            against statistics shaped ``(3, 1, 1)``.

    Returns:
        ``tensor * std + mean`` clamped to ``[0, 1]``, on the same device as
        the input.
    """
    # Build the statistics on the input's device so GPU tensors also work.
    mean = torch.tensor([0.485, 0.456, 0.406], device=tensor.device).view(3, 1, 1)
    std = torch.tensor([0.229, 0.224, 0.225], device=tensor.device).view(3, 1, 1)
    return (tensor * std + mean).clamp(0, 1)




def show_predictions(model, dataset, indices=(20, 25, 125)):
    """Plot image, ground-truth mask and predicted mask for selected samples.

    One row is drawn per index, with three columns: the denormalized input
    image, the dataset mask, and the argmax of the model output.

    Args:
        model: Segmentation model; called on a single-image batch moved to
            the module-level ``device``.
        dataset: Indexable dataset returning ``(image, mask)`` where the
            image is a normalized ``(3, H, W)`` tensor.
        indices: Positions in ``dataset`` to display. Nothing is plotted when
            it is empty.
    """
    indices = list(indices)
    if not indices:
        return

    model.eval()
    # squeeze=False keeps `axes` two-dimensional even for a single row, so
    # the axes[i, j] indexing below works for any number of indices.
    fig, axes = plt.subplots(
        len(indices), 3, figsize=(12, 4 * len(indices)), squeeze=False
    )

    for i, idx in enumerate(indices):
        image, mask = dataset[idx]

        with torch.no_grad():
            pred = model(image.unsqueeze(0).to(device))
            pred = torch.argmax(pred, dim=1).squeeze().cpu()

        img_denorm = denormalize(image).permute(1, 2, 0).cpu().numpy()

        axes[i, 0].imshow(img_denorm)
        axes[i, 0].set_title("Original Image")
        axes[i, 0].axis('off')

        axes[i, 1].imshow(mask, cmap='tab20', vmin=0, vmax=18)
        axes[i, 1].set_title("Ground Truth")
        axes[i, 1].axis('off')

        axes[i, 2].imshow(pred, cmap='tab20', vmin=0, vmax=18)
        axes[i, 2].set_title("Model Prediction")
        axes[i, 2].axis('off')

    plt.tight_layout()
    plt.show()

# Visualize the default sample indices from the validation set.
show_predictions(model=model, dataset=val_dataset)