## Preprocessing dataset

### Here is a complete, efficient PyTorch pipeline to convert all Cityscapes images and their corresponding .json label files into .pt files for later fast loading and training (e.g., for DeepLabv3+ fine-tuning).

In [None]:
import os
import json
from PIL import Image
import numpy as np
import torch
from tqdm import tqdm

# === Define paths ===
# Directory scanned for input images (only files ending in .png are picked up).
IMAGE_DIR = 'images/'
# Directory holding the matching *_gtFine_polygons.json annotation files.
LABEL_DIR = 'labels/'
# Destination directory for the generated .pt samples.
OUTPUT_DIR = 'pt_data/'
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Import your label converter ===
# Ensure this function returns a tensor of shape [7, H, W]
from your_module import convert_label_to_multilabel_one_hot  

# === Helper: convert JSON annotation to label map ===
def parse_cityscapes_json_to_label(json_path, image_size):
    """
    Converts a Cityscapes polygon annotation JSON into a 2D label map [H, W].

    Polygons are painted in file order onto a zero-initialised 8-bit canvas,
    so later objects overwrite earlier ones wherever they overlap.

    Args:
        json_path: Path to the annotation file. It must contain an 'objects'
            list whose entries provide 'label' and 'polygon' keys.
        image_size: Canvas size as (width, height), i.e. PIL's ``Image.size``.

    Returns:
        torch.LongTensor of shape [H, W] with one class ID per pixel
        (0 where no polygon was drawn).
    """
    from PIL import ImageDraw

    with open(json_path, 'r') as f:
        data = json.load(f)

    # Mode 'L' is a single 8-bit channel, so class IDs must fit into 0..255.
    label_img = Image.new('L', image_size, 0)
    draw = ImageDraw.Draw(label_img)

    for obj in data['objects']:
        label = obj['label']
        # JSON yields [[x, y], ...]; Pillow documents the coordinates as a
        # sequence of 2-tuples (or a flat numeric list), so convert the pairs
        # and leave already-flat numeric entries untouched.
        polygon = [
            tuple(point) if isinstance(point, (list, tuple)) else point
            for point in obj['polygon']
        ]
        try:
            class_id = int(obj['id'])  # optional, depends on how IDs are stored
        except (KeyError, TypeError, ValueError):
            # No usable numeric 'id' on this object: fall back to the label
            # name. get_class_id_from_label_name is not defined in this
            # section and must be provided elsewhere (name -> ID mapping).
            class_id = get_class_id_from_label_name(label)
        draw.polygon(polygon, fill=class_id)

    return torch.from_numpy(np.array(label_img)).long()

# === Main pipeline ===
def preprocess_and_save_all():
    """
    Converts every PNG in IMAGE_DIR plus its polygon JSON into one .pt file.

    For each image the matching ``<base>_gtFine_polygons.json`` is looked up in
    LABEL_DIR; images without a label file are reported and skipped. Each saved
    file is a dict with:
        'image': float32 tensor [3, H, W] scaled to [0, 1]
        'label': output of convert_label_to_multilabel_one_hot for the
                 rasterised label map

    Images are stored at their original resolution (no resizing is applied).
    """
    image_suffix = '_leftImg8bit.png'

    # os.listdir returns entries in arbitrary order; sort so runs are reproducible.
    image_files = sorted(f for f in os.listdir(IMAGE_DIR) if f.endswith('.png'))

    for img_file in tqdm(image_files, desc="Processing Cityscapes data"):
        # --- Paths
        # Strip the suffix only when it really is the end of the file name.
        if img_file.endswith(image_suffix):
            base_name = img_file[:-len(image_suffix)]
        else:
            base_name = img_file
        image_path = os.path.join(IMAGE_DIR, img_file)
        json_path = os.path.join(LABEL_DIR, base_name + '_gtFine_polygons.json')

        if not os.path.exists(json_path):
            print(f"Missing label for {img_file}")
            continue

        # --- Load image; the context manager releases the file handle promptly
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image_tensor = torch.from_numpy(np.array(image)).permute(2, 0, 1).float() / 255.0  # [3, H, W]

        # --- Convert JSON to label tensor (image.size is (width, height))
        label_2d = parse_cityscapes_json_to_label(json_path, image.size)  # [H, W]

        # --- Convert to multilabel [7, H, W]
        multilabel_tensor = convert_label_to_multilabel_one_hot(label_2d)

        # --- Save as .pt
        torch.save({
            'image': image_tensor,
            'label': multilabel_tensor
        }, os.path.join(OUTPUT_DIR, base_name + '.pt'))

# Run the full conversion immediately when this code is executed.
preprocess_and_save_all()


### Suggested workflow (Deprecated)

#### 1. Load the Image

In [None]:
from PIL import Image
import torch
import numpy as np

# Placeholder path; point it at a real image before running.
image_path = "path/to/image.png"
# Force three RGB channels regardless of the mode the file is stored in.
image = Image.open(image_path).convert('RGB')
image_np = np.array(image)  # shape (H, W, 3)


#### 2. Parse JSON Label

Suppose each JSON contains a list of polygons with class IDs or names. Example JSON structure (pseudo):

{
  "objects": [
    {"label": "road", "polygon": [[x1, y1], [x2, y2], ...]},
    {"label": "person", "polygon": [[x1, y1], [x2, y2], ...]},
    ...
  ]
}


Load it:

In [None]:
import json

# Placeholder path; point it at the annotation file that belongs to the image.
json_path = "path/to/label.json"
with open(json_path) as f:
    # Parsed annotation; the rasterization step reads its "objects" list.
    label_data = json.load(f)


#### 3. Initialize an Empty Label Mask

In [None]:
# Height and width are the first two axes of the (H, W, 3) image array.
height, width = image_np.shape[:2]
# Start from an all-zero mask; pixels never covered by a polygon stay 0.
label_mask = np.zeros((height, width), dtype=np.uint8)  # or int64 if needed


#### 4. Rasterize Polygons into Label Mask

In [None]:
import cv2

# A mapping from label names to original Cityscapes IDs, as in your CLASS_MAPPING keys
LABEL_NAME_TO_ID = {
    "road": 7,
    "sidewalk": 8,
    "building": 11,
    "wall": 12,
    "fence": 13,
    "pole": 17,
    "traffic sign": 19,
    "traffic light": 20,
    "vegetation": 21,
    "terrain": 22,
    "sky": 23,
    "person": 24,
    "rider": 25,
    "car": 26,
    "truck": 27,
    "bus": 28,
    "train": 31,
    "motorcycle": 32,
    "bicycle": 33,
}

# Paint every annotated object into label_mask, in file order, so later
# polygons overwrite earlier ones where they overlap.
for obj in label_data["objects"]:
    label_name = obj["label"]
    # Polygon vertices as an int32 array, built before the label check below.
    polygon = np.array(obj["polygon"], dtype=np.int32)

    # Objects whose label is not in the table are skipped and leave the mask untouched.
    if label_name in LABEL_NAME_TO_ID:
        class_id = LABEL_NAME_TO_ID[label_name]

        # cv2.fillPoly expects a list of polygons (each polygon a numpy array)
        cv2.fillPoly(label_mask, [polygon], class_id)


#### 5. Convert Label Mask to PyTorch Tensor

In [None]:
# Wrap the uint8 mask as a tensor and cast it to int64 (torch.long).
label_tensor = torch.from_numpy(label_mask).long()

#### 6. Use Your Existing Function

In [None]:
# Expand the ID map into the macro-class channels plus the auxiliary object channel.
multilabel_one_hot = convert_label_to_multilabel_one_hot(label_tensor)
# multilabel_one_hot has shape [7, H, W]

### Dataset Loader for Preprocessed Tensors

In [None]:
from torch.utils.data import Dataset
import torch
import os

class PrecomputedCityscapesDataset(Dataset):
    """
    Dataset over preprocessed ``.pt`` samples stored in a single directory.

    Every ``.pt`` file is expected to contain a dict with an 'image' and a
    'label' entry (the layout written by the preprocessing step).

    Args:
        root_dir: Directory containing the ``.pt`` files; other files are ignored.
    """

    def __init__(self, root_dir):
        self.root_dir = root_dir
        # os.listdir returns entries in arbitrary, platform-dependent order.
        # Sorting keeps the index -> sample mapping stable across runs and
        # machines, which matters for reproducible splits and debugging.
        self.files = sorted(f for f in os.listdir(root_dir) if f.endswith(".pt"))

    def __len__(self):
        """Return the number of ``.pt`` samples found in ``root_dir``."""
        return len(self.files)

    def __getitem__(self, idx):
        """Load sample ``idx`` from disk and return ``(image, label)``."""
        path = os.path.join(self.root_dir, self.files[idx])
        sample = torch.load(path)
        return sample['image'], sample['label']


## Image Preprocessing Pipeline for Inference 

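This script assumes the final layer of your fine-tuned model produces 7 channels (the six macro classes plus the auxiliary `object` channel). Note that the code below builds torchvision's `deeplabv3_resnet50` (DeepLabV3), not a DeepLabV3+ architecture.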

In [None]:
import torch
from torchvision import transforms
from torchvision.models.segmentation import deeplabv3_resnet50
from PIL import Image
import matplotlib.pyplot as plt
import os

# === CONFIGURATION ===
# Number of output channels the model is built with.
NUM_CLASSES = 7
# Channel names in output order; used as subplot titles when visualizing.
CLASS_NAMES = ["road", "flat", "human", "vehicle", "construction", "background", "object"]
# Use the GPU when one is available, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "model_finetuned.pt"  # path to your saved model
IMAGE_PATH = "path_to_image.jpg"  # path to the image you want to infer
# Passed to transforms.Resize; presumably interpreted as (height, width) — confirm against torchvision docs.
IMAGE_SIZE = (512, 1024)  # resize your image as needed


# === MODEL SETUP ===
def load_model(model_path):
    """
    Build a DeepLabV3-ResNet50 with NUM_CLASSES outputs and restore its weights.

    Args:
        model_path: File containing the state dict to load.

    Returns:
        The network, moved to DEVICE and switched to evaluation mode.
    """
    network = deeplabv3_resnet50(pretrained=False, num_classes=NUM_CLASSES)

    # map_location lets a checkpoint be restored onto whichever device DEVICE names.
    state_dict = torch.load(model_path, map_location=DEVICE)
    network.load_state_dict(state_dict)

    # Module.to() and Module.eval() both hand back the module itself.
    return network.to(DEVICE).eval()


# === IMAGE PREPROCESSING ===
def preprocess_image(image_path, image_size=None):
    """
    Load an image from disk and turn it into a normalized, batched tensor.

    Args:
        image_path: Path of the image file to open.
        image_size: Optional target size handed to ``transforms.Resize``;
            when falsy the image keeps its original resolution.

    Returns:
        Tuple ``(image_tensor, image)``: the transformed tensor with a leading
        batch dimension of 1, and the RGB PIL image as loaded (not resized).
    """
    original = Image.open(image_path).convert('RGB')

    # Resizing is optional; tensor conversion and normalization always run.
    steps = [transforms.Resize(image_size)] if image_size else []
    steps += [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ]
    pipeline = transforms.Compose(steps)

    batched = pipeline(original).unsqueeze(0)  # Add batch dimension
    return batched, original


# === INFERENCE ===
def infer(model, image_tensor):
    """
    Run the model on one batched image and return binary per-channel masks.

    Args:
        model: Callable whose result is a mapping with an 'out' entry of logits.
        image_tensor: Input batch; it is moved to DEVICE before the forward pass.

    Returns:
        Float tensor on the CPU containing 0/1 values, with the leading batch
        dimension squeezed away (``[7, H, W]`` for a single 7-channel image).
    """
    with torch.no_grad():
        logits = model(image_tensor.to(DEVICE))['out']  # [1, 7, H, W]
        # Independent sigmoid per channel (multi-label), cut at 0.5.
        masks = (torch.sigmoid(logits) > 0.5).float()
    return masks.squeeze(0).cpu()  # [7, H, W]


# === VISUALIZATION ===
def visualize_prediction(prediction_tensor, original_image, class_names):
    """
    Show every predicted channel next to the original image in a 2x4 grid.

    Args:
        prediction_tensor: Per-channel masks, indexed by channel on axis 0.
        original_image: Image displayed in the last grid cell.
        class_names: Titles for the channel panels, indexed like the channels.
    """
    _, grid = plt.subplots(2, 4, figsize=(20, 10))
    panels = grid.flatten()

    # The first NUM_CLASSES panels each hold one channel.
    for channel in range(NUM_CLASSES):
        panel = panels[channel]
        panel.imshow(prediction_tensor[channel], cmap='gray')
        panel.set_title(class_names[channel])
        panel.axis('off')

    # The last panel is reserved for the input image.
    last_panel = panels[-1]
    last_panel.imshow(original_image)
    last_panel.set_title("Original Image")
    last_panel.axis('off')

    plt.tight_layout()
    plt.show()


# === MAIN ===
def main():
    """
    Run the inference demo: load the model, preprocess, predict and plot.

    Both MODEL_PATH and IMAGE_PATH must exist; otherwise an AssertionError
    with a descriptive message is raised before any work starts.
    """
    assert os.path.exists(MODEL_PATH), f"Model not found at {MODEL_PATH}"
    assert os.path.exists(IMAGE_PATH), f"Image not found at {IMAGE_PATH}"

    print("Loading model...")
    network = load_model(MODEL_PATH)

    print("Preprocessing image...")
    batch, source_image = preprocess_image(IMAGE_PATH, image_size=IMAGE_SIZE)

    print("Running inference...")
    masks = infer(network, batch)

    print("Visualizing result...")
    visualize_prediction(masks, source_image, CLASS_NAMES)


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    main()


## Grouped classes

In [None]:
import torch
import numpy as np

# Channel index of every macro class in the multi-label encoding.
MACRO_CLASSES = {
    "road": 0,
    "flat": 1,
    "human": 2,
    "vehicle": 3,
    "construction": 4,
    "background": 5,
    "object": 6,  # auxiliary objectness channel
}

# Original label ID -> (macro class name or None, is_object).
# None is used only for poles, traffic signs and traffic lights, which
# contribute to the 'object' channel alone.
CLASS_MAPPING = {
    # road / flat surfaces
    7: ("road", False),           # road
    8: ("flat", False),           # sidewalk
    # construction
    11: ("construction", False),  # building
    12: ("construction", False),  # wall
    13: ("construction", False),  # fence
    # object-only classes
    17: (None, True),             # pole
    19: (None, True),             # traffic sign
    20: (None, True),             # traffic light
    # nature / sky
    21: ("background", False),    # vegetation
    22: ("flat", False),          # terrain
    23: ("background", False),    # sky
    # humans
    24: ("human", True),          # person
    25: ("human", True),          # rider
    # vehicles
    26: ("vehicle", True),        # car
    27: ("vehicle", True),        # truck
    28: ("vehicle", True),        # bus
    31: ("vehicle", True),        # train
    32: ("vehicle", True),        # motorcycle
    33: ("vehicle", True),        # bicycle
}

# Original label ID -> macro channel index. IDs whose macro class is None map
# to None, meaning that only the 'object' channel (6) is set for them.
LABEL_TO_MACRO_IDX = {
    label_id: (MACRO_CLASSES[macro_name] if macro_name is not None else None)
    for label_id, (macro_name, _) in CLASS_MAPPING.items()
}


def convert_label_to_multilabel_one_hot(label: torch.Tensor) -> torch.Tensor:
    """
    Converts label [H, W] with Cityscapes original IDs into a multi-label one-hot encoding tensor [7, H, W].
    The last channel (index 6) corresponds to the 'object' auxiliary channel.

    A pixel can be active in two channels at once (its macro class and the
    'object' channel). IDs marked object-only in CLASS_MAPPING activate the
    'object' channel alone, and IDs absent from CLASS_MAPPING leave every
    channel at 0.

    Args:
        label: 2D integer tensor [H, W] of original label IDs.

    Returns:
        float32 tensor [len(MACRO_CLASSES), H, W] on the same device as ``label``.
    """
    height, width = label.shape
    # Allocate on the label's device: indexing a CPU tensor with a CUDA mask
    # fails, so a CPU-only buffer would break for labels living on the GPU.
    multilabel = torch.zeros(
        (len(MACRO_CLASSES), height, width),
        dtype=torch.float32,
        device=label.device,
    )
    object_idx = MACRO_CLASSES["object"]

    for original_id, (_, is_object) in CLASS_MAPPING.items():
        mask = label == original_id
        macro_idx = LABEL_TO_MACRO_IDX[original_id]

        # Macro channel (absent for the object-only IDs).
        if macro_idx is not None:
            multilabel[macro_idx, mask] = 1.0

        # Auxiliary objectness channel.
        if is_object:
            multilabel[object_idx, mask] = 1.0

    return multilabel


### Visualization of the labels

In [None]:
import matplotlib.pyplot as plt

# Channel names in the order used by the 7-channel multi-label encoding.
class_names_7 = ["road", "flat", "human", "vehicle", "construction", "background", "object"]

def visualize_one_hot_vertical(one_hot, class_names=None, max_classes=7):
    """
    Plot the channels of a one-hot/multi-label array as a vertical stack.

    Args:
        one_hot: Array-like indexed by channel on axis 0; each channel is
            drawn as a grayscale image.
        class_names: Optional titles, indexed like the channels. When None the
            panels are titled "Class <i>".
        max_classes: Upper bound on the number of channels that are drawn.
    """
    num_classes = min(one_hot.shape[0], max_classes)
    # squeeze=False makes subplots always return a 2-D array of Axes. With the
    # default, a single row yields a bare Axes object that cannot be indexed,
    # which broke single-channel inputs.
    _, axes = plt.subplots(
        num_classes, 1, figsize=(5, 3 * num_classes), squeeze=False
    )

    for i, ax in enumerate(axes[:, 0]):
        ax.imshow(one_hot[i], cmap='gray')
        title = f"Class {i}" if class_names is None else class_names[i]
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    plt.show()


### Usage example

In [None]:
# NOTE: `...` is a placeholder (Ellipsis); replace it with a real tensor before running.
label = ...  # your input tensor [H, W] with original Cityscapes label IDs

multilabel = convert_label_to_multilabel_one_hot(label)  # shape [7, H, W]

# Convert to numpy for visualization
multilabel_np = multilabel.numpy()

# One grayscale panel per channel, titled with the 7 channel names.
visualize_one_hot_vertical(multilabel_np, class_names_7)


## BCEWithLogitsLoss (Boundary-aware Binary Cross Entropy Loss)

The standard **BCEWithLogitsLoss** treats all pixels equally, which can be suboptimal when:

- The object boundaries are thin and imbalanced in area.

- You want to detect unknown objects that are often isolated or occluded, making boundaries key discriminators.

A **Boundary-Aware BCE Loss** adds extra weight to boundary pixels, improving:

- Edge delineation

- Segmentation of small or novel objects

- Generalization in open-set recognition

### Boundary-Aware Binary Cross-Entropy (BCE) Loss

Let $P(x) = \sigma(f(x))$ be the predicted probability at pixel $x$, where $f(x)$ is the raw logit output and $\sigma(\cdot)$ is the sigmoid activation. Let $Y(x) \in \{0, 1\}$ be the ground truth label, and $w(x)$ be a boundary-based weight.

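The **Boundary-Aware BCE Loss** is defined as follows (the implementation below averages the weighted terms over all elements instead of summing them, which only changes the scale):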

$$
\mathcal{L}_{\text{BCE}}^{\text{boundary}} = -\sum_{x} w(x) \left[ Y(x) \cdot \log(P(x)) + (1 - Y(x)) \cdot \log(1 - P(x)) \right]
$$

where:

- $w(x) > 1$ for pixels near object boundaries,  
- $w(x) = 1$ for all other pixels,  
- and $P(x) = \frac{1}{1 + e^{-f(x)}}$ is the sigmoid of the network output.



Implementation:

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class BoundaryAwareBCELoss(nn.Module):
    """
    BCE-with-logits loss that up-weights pixels close to mask boundaries.

    Boundaries are detected on the ground-truth masks with a 3x3 Sobel or
    Laplacian filter, optionally widened by a max-pool dilation, and the
    per-pixel BCE is multiplied by ``edge_weight`` there (1 elsewhere) before
    averaging over all elements.
    """

    def __init__(self, edge_weight=5.0, dilation=3, use_sobel=True):
        """
        Args:
            edge_weight (float): weight multiplier for boundary pixels.
            dilation (int): size of the square window used to widen the
                boundary band; 0 or 1 disables widening. Odd and even sizes
                are both supported.
            use_sobel (bool): whether to compute boundaries using Sobel filter
                (otherwise the Laplacian is used).

        Raises:
            ValueError: if ``dilation`` is negative.
        """
        super().__init__()
        if dilation < 0:
            raise ValueError(f"dilation must be non-negative, got {dilation}")
        self.edge_weight = edge_weight
        self.dilation = dilation
        self.use_sobel = use_sobel
        self.bce = nn.BCEWithLogitsLoss(reduction='none')

    def forward(self, logits, targets):
        """
        Args:
            logits: (B, 1, H, W) or (B, C, H, W) raw outputs from the model
            targets: (B, 1, H, W) or (B, C, H, W) ground truth binary masks
        Returns:
            Scalar boundary-aware BCE loss
        """
        assert logits.shape == targets.shape, "Shape mismatch between logits and targets"

        # The weights depend only on the targets, so they need no gradient.
        with torch.no_grad():
            weights = self.compute_boundary_weights(targets)

        return (self.bce(logits, targets) * weights).mean()

    def compute_boundary_weights(self, masks):
        """
        Computes a per-pixel weight map based on boundary proximity.
        Args:
            masks: (B, C, H, W) binary ground truth masks
        Returns:
            weights: (B, C, H, W) tensor of pixel-wise weights
        """
        edge = self.sobel_edges(masks) if self.use_sobel else self.laplacian_edges(masks)

        # Optional dilation to expand boundary width.
        size = self.dilation
        if size > 1:
            # Pad explicitly (asymmetrically for even sizes) so the pooled map
            # keeps exactly H x W. Symmetric padding of size // 2 would yield
            # (H + 1) x (W + 1) for even sizes and break the mask assignment.
            # Zero padding is safe here because the edge map is never negative.
            before, after = (size - 1) // 2, size // 2
            edge = F.pad(edge, (before, after, before, after))
            edge = F.max_pool2d(edge, kernel_size=size, stride=1)

        weights = torch.ones_like(masks)
        weights[edge > 0] = self.edge_weight
        return weights

    @staticmethod
    def _filter_channels(masks, kernel):
        """Apply one 3x3 kernel to every (batch, channel) plane with zero padding."""
        batch, channels, height, width = masks.shape
        # Fold channels into the batch axis so one conv call covers all planes.
        planes = masks.reshape(batch * channels, 1, height, width)
        response = F.conv2d(planes, kernel.view(1, 1, 3, 3), padding=1)
        return response.reshape(batch, channels, height, width)

    def sobel_edges(self, masks):
        """
        Computes edges using Sobel operator.

        Returns a 0/1 map shaped like ``masks`` that is 1 wherever the
        horizontal or vertical Sobel response is non-zero.
        """
        # Kernels follow the masks' dtype/device so non-float32 inputs work too.
        kernel_x = torch.tensor([[-1, 0, 1],
                                 [-2, 0, 2],
                                 [-1, 0, 1]], dtype=masks.dtype, device=masks.device)

        kernel_y = torch.tensor([[-1, -2, -1],
                                 [ 0,  0,  0],
                                 [ 1,  2,  1]], dtype=masks.dtype, device=masks.device)

        gx = self._filter_channels(masks, kernel_x)
        gy = self._filter_channels(masks, kernel_y)
        # The gradient magnitude is positive exactly when either component is
        # non-zero, so the square root is not needed for a binary edge map.
        return ((gx != 0) | (gy != 0)).to(masks.dtype)

    def laplacian_edges(self, masks):
        """
        Alternative: computes edges using Laplacian operator.

        Returns a 0/1 map shaped like ``masks`` that is 1 wherever the
        4-neighbour Laplacian response is non-zero.
        """
        kernel = torch.tensor([[0, 1, 0],
                               [1, -4, 1],
                               [0, 1, 0]], dtype=masks.dtype, device=masks.device)
        return (self._filter_channels(masks, kernel) != 0).to(masks.dtype)


### Example Usage

In [None]:
# Boundary pixels count five times as much as the remaining pixels.
loss_fn = BoundaryAwareBCELoss(edge_weight=5.0)

# Simulated logits and targets
logits = torch.randn((2, 1, 256, 256), requires_grad=True)  # raw outputs
targets = torch.randint(0, 2, (2, 1, 256, 256)).float()      # binary masks

# Scalar loss; backward() fills logits.grad.
loss = loss_fn(logits, targets)
loss.backward()
