In [1]:
import os
# Select the GPU memory allocator through the TF_GPU_ALLOCATOR environment variable.
# NOTE(review): presumably this must be set before TensorFlow initialises the GPU,
# which is why it lives in the very first cell — confirm.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [2]:
import numpy as np
from typing import Any, Iterator
from dataclasses import dataclass
from pathlib import Path
from abc import ABC, abstractmethod
import hashlib

In [3]:
from mobilenetv2ssd.core.config import load_config

In [4]:
# Relative paths of the four YAML configuration files (train, model, data, eval).
# They are relative, so they resolve against the kernel's working directory.
main_cfg_path = "configs/train/default.yaml"
model_cfg_path = "configs/model/mobilenetv2_ssd_voc.yaml"
data_cfg_path = "configs/data/voc_224.yaml"
eval_cfg_path = "configs/eval/default.yaml"

In [5]:
# Hand the four YAML paths to load_config.
# NOTE(review): presumably the files are merged into a single config object — confirm
# against mobilenetv2ssd.core.config.
config = load_config(main_cfg_path,model_cfg_path,data_cfg_path,eval_cfg_path)

In [17]:
@dataclass
class DetectionSample:
    """One image together with its ground-truth detection annotations.

    ``validate`` enforces the array contract noted next to each field and
    ``to_dict`` exposes the sample as a plain mapping.
    """

    image: np.ndarray   # expected [H, W, 3], float32
    boxes: np.ndarray   # expected [N, 4], float32
    labels: np.ndarray  # expected [N], int32, every entry >= 1 (0 is background)
    image_id: str
    path: str
    orig_size: tuple[int,int]  # (height, width)

    def validate(self):
        """Raise ``ValueError`` for the first violated shape/dtype/label rule.

        Checks run in a fixed order: image shape, image dtype, boxes shape,
        boxes dtype, labels rank, boxes/labels count, labels dtype, label
        range. Returns ``None`` when every check passes.
        """
        image, boxes, labels = self.image, self.boxes, self.labels

        # ---- image ----
        image_is_hw3 = image.ndim == 3 and image.shape[2] == 3
        if not image_is_hw3:
            raise ValueError(f"Image must be [H,W,3], got {image.shape}")
        if image.dtype != np.float32:
            raise ValueError(f"Image must be float32, got {image.dtype}")

        # ---- boxes ----
        # An empty box array only needs to be rank-2; its second dim is not checked.
        boxes_are_n4 = boxes.ndim == 2 and (len(boxes) == 0 or boxes.shape[1] == 4)
        if not boxes_are_n4:
            raise ValueError(f"Boxes must be [N,4], got {boxes.shape}")
        if boxes.dtype != np.float32:
            raise ValueError(f"Boxes must be float32, got {boxes.dtype}")

        # ---- labels ----
        if labels.ndim != 1:
            raise ValueError(f"Labels must be [N], got {labels.shape}")

        n_boxes, n_labels = len(boxes), len(labels)
        if n_boxes != n_labels:
            raise ValueError(f"Boxes ({n_boxes}) and labels ({n_labels}) count mismatch")

        if labels.dtype != np.int32:
            raise ValueError(f"Labels must be int32, got {labels.dtype}")

        if n_labels > 0 and (labels < 1).any():
            raise ValueError(f"Labels must be >= 1 (0 is background), got min={labels.min()}")

    def to_dict(self):
        """Return the sample as a dict; ``orig_size`` becomes an int32 array."""
        size_hw = np.array(self.orig_size, dtype=np.int32)
        return dict(
            image=self.image,
            boxes=self.boxes,
            labels=self.labels,
            image_id=self.image_id,
            path=self.path,
            orig_size=size_hw,
        )
        

In [18]:
class BaseDetectionDataset(ABC):
    def __init__(self,root: str | Path, split: str, classes_file: str | Path, use_difficult: bool = False, validate: bool = True):
        self.root = root
        self.split = split
        self.use_difficult = use_difficult
        self._validate = validate

        self._class_names = self._load_classes(classes_file)

        self._class_to_index = {name: i + 1 for i, name in enumerate(self._class_names)}
        self._index_to_class = {i + 1: name for i, name in enumerate(self._class_names)}
        self._index_to_class[0] = "background"

    def _load_classes(self,classes_file: str | Path):
        classes_file = Path(classes_file)

        # Checking if it exists
        if not classes_file.exists():
            raise FileNotFoundError(f"Classes file not found: {classes_file}")

        with open(classes_file, "r") as f:
            return [line.strip() for line in f if line.strip()]

    def class_names(self):
        return self._class_names

    def class_to_index(self):
        return self._class_to_index

    def class_to_index(self):
        return {name: i + 1 for i, name in enumerate(self.class_names)}

    def index_to_class(self):
        mapping = {i + 1: name for i, name in enumerate(self.class_names)}
        mapping[0] = "background"
        return mapping

    def index_to_class(self):
        return self._index_to_class

    def num_classes(self):
        return len(self._class_names) + 1

    @abstractmethod
    def __len__(self):
        raise NotImplementedError

    @abstractmethod
    def _load_sample(self, index: int):
        raise NotImplementedError

    def _clean_boxes(self, sample: DetectionSample):

        boxes = sample.boxes
        labels = sample.labels
        H,W = sample.image.shape[:2]

        # Checking for the boxes
        if len(boxes) == 0:
            return sample

        # Cleaning the boxes first
        boxes = boxes.copy()
        boxes[:, [0,2]] = np.clip(boxes[:, [0,2]], 0, W)
        boxes[:, [1,3]] = np.clip(boxes[:, [1,3]], 0, H)

        # Checking for the degenerate boxes
        widths = boxes[:, 2] - boxes[:, 0]
        heights = boxes[:, 3] - boxes[:, 1]
        valid_mask = (widths > 0) & (heights > 0)

        # Removing NaN Boxes
        finite_mask = np.all(np.isfinite(boxes), axis = 1)
        valid_mask = valid_mask & finite_mask

        return DetectionSample(
            image = sample.image,
            boxes = boxes[valid_mask],
            labels= labels[valid_mask],
            image_id=sample.image_id,
            path=sample.path,
            orig_size=sample.orig_size,
        )

    def __getitem__(self, index: int):
        if index < 0 or index >= len(self):
            raise IndexError(f"Index {index} out of range [0, {len(self)})")

        sample = self._load_sample(index)
        sample = self._clean_boxes(sample)

        if self._validate:
            sample.validate()

        return sample

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def generator(self):

        for index in range(len(self)):
            sample = self[index]
            yield sample.to_dict()

    def get_stats(self):
        total_boxes = 0
        class_counts = {name: 0 for name in self.class_names}
        
        for sample in self:
            total_boxes = total_boxes + len(sample.boxes)
            for label in sample.labels:
                class_name = class_name = self.index_to_class[label]
                class_counts[class_name] = class_counts + 1

        return {
            "num_samples": len(self),
            "total_boxes": total_boxes,
            "avg_boxes_per_image": total_boxes / len(self) if len(self) > 0 else 0,
            "class_distribution": class_counts,
        }

In [7]:
def _standardize_target(image, target : dict, index):
    """Coerce a raw detection target into a fixed tensor layout.

    ``target`` is updated in place and also returned. After the call:

    * ``boxes``: float32; a flat rank-1 input is reshaped to ``[-1, 4]``
      (so ``[]`` becomes ``[0, 4]`` and four numbers become ``[1, 4]``).
    * ``labels``: int32; a scalar input becomes shape ``[1]``.
    * ``image_id``: scalar string, defaulting to ``str(index)``. Non-string
      values (e.g. integer ids) are converted to their string form.
    * ``hash_signature``: scalar string, defaulting to ``''``.
    * ``orig_size``: int32 of shape ``[2]``, defaulting to the first two
      dimensions of ``image``.
    * ``path``: scalar string, defaulting to ``''``.

    Args:
        image: Image tensor; only its shape is read (for the ``orig_size`` default).
        target: Dict holding at least ``boxes`` and ``labels``.
        index: Value used to build the default ``image_id``.

    Raises:
        KeyError: If ``boxes`` or ``labels`` is missing from ``target``.
    """
    if not all(key in target for key in ('boxes', 'labels')):
        raise KeyError("target must contain 'boxes' and 'labels'")

    boxes = tf.convert_to_tensor(target['boxes'])
    labels = tf.convert_to_tensor(target['labels'])

    # A flat box list is regrouped into rows of four. reshape([-1, 4]) covers
    # the empty case as well, so no tf.cond on the element count is needed.
    if boxes.shape.rank == 1:
        boxes = tf.reshape(boxes, [-1, 4])

    if labels.shape.rank == 0:
        labels = tf.reshape(labels, [1])

    target['boxes'] = tf.cast(boxes, tf.float32)
    target['labels'] = tf.cast(labels, tf.int32)

    # ---- metadata: fill defaults, otherwise normalise dtype and shape ----
    if 'image_id' not in target:
        target['image_id'] = tf.constant(f'{index}', dtype=tf.string)
    else:
        image_id = tf.convert_to_tensor(target['image_id'])
        # Numeric ids cannot be converted to tf.string directly; format them instead.
        if image_id.dtype != tf.string:
            image_id = tf.strings.as_string(image_id)
        target['image_id'] = tf.reshape(image_id, [])

    if 'hash_signature' not in target:
        target['hash_signature'] = tf.constant('', dtype=tf.string)
    else:
        hash_signature = tf.convert_to_tensor(target['hash_signature'], dtype=tf.string)
        target['hash_signature'] = tf.reshape(hash_signature, [])

    if 'orig_size' not in target:
        target['orig_size'] = tf.shape(image)[0:2]
    else:
        orig_size = tf.convert_to_tensor(target['orig_size'], dtype=tf.int32)
        target['orig_size'] = tf.reshape(orig_size, [2])

    if 'path' not in target:
        target['path'] = tf.constant("", dtype=tf.string)
    else:
        path = tf.convert_to_tensor(target['path'], dtype=tf.string)
        target['path'] = tf.reshape(path, [])

    return target

In [8]:
# Fixture 1: a target that is already well-formed — two float32 boxes, int32 labels,
# and image_id / path metadata given as Python strings — for a 480x640 uint8 image.
image = tf.zeros([480, 640, 3], dtype=tf.uint8)

target = {
    "boxes": tf.constant([
        [ 50.0,  60.0, 200.0, 300.0],
        [320.0, 100.0, 500.0, 400.0],
    ], dtype=tf.float32),
    "labels": tf.constant([7, 15], dtype=tf.int32),
    "image_id": "2007_000027",
    "path": "/datasets/VOCdevkit/VOC2007/JPEGImages/2007_000027.jpg",
}
idx = 27

I0000 00:00:1769146741.415590   17931 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1769146741.752651   17931 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1769146741.752718   17931 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1769146741.755822   17931 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1769146741.755992   17931 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [9]:
# Standardize fixture 1; the returned dict is displayed as the cell output.
_standardize_target(image, target, idx)

{'boxes': <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[ 50.,  60., 200., 300.],
        [320., 100., 500., 400.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([ 7, 15], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=string, numpy=b'2007_000027'>,
 'path': <tf.Tensor: shape=(), dtype=string, numpy=b'/datasets/VOCdevkit/VOC2007/JPEGImages/2007_000027.jpg'>,
 'hash_signature': <tf.Tensor: shape=(), dtype=string, numpy=b''>,
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([480, 640], dtype=int32)>}

In [10]:
# Fixture 2: plain Python lists with integer values and no metadata keys, so the
# dtype casts and every metadata default of _standardize_target are exercised.
image = tf.zeros([375, 500, 3], dtype=tf.uint8)

target = {
    "boxes": [
        [48, 240, 195, 371],   # ints are fine as input
        [  8,  12, 352, 150],
    ],
    "labels": [1, 14],         # should become int32 tensor
}
idx = 3

In [11]:
# Standardize fixture 2; the returned dict is displayed as the cell output.
_standardize_target(image, target, idx)

{'boxes': <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[ 48., 240., 195., 371.],
        [  8.,  12., 352., 150.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([ 1, 14], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=string, numpy=b'3'>,
 'hash_signature': <tf.Tensor: shape=(), dtype=string, numpy=b''>,
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([375, 500], dtype=int32)>,
 'path': <tf.Tensor: shape=(), dtype=string, numpy=b''>}

In [12]:
# Fixture 3: a single flat box and a scalar label, to exercise the rank fix-ups
# (boxes -> [1, 4], labels -> [1]) in _standardize_target.
image = tf.zeros([224, 224, 3], dtype=tf.uint8)

target = {
    "boxes": [20, 30, 100, 180],  # WRONG SHAPE INPUT (should be [[...]])
    "labels": 5,                  # scalar label
}
idx = 99

In [13]:
# Standardize fixture 3; the returned dict is displayed as the cell output.
_standardize_target(image, target, idx)

{'boxes': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[ 20.,  30., 100., 180.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([5], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=string, numpy=b'99'>,
 'hash_signature': <tf.Tensor: shape=(), dtype=string, numpy=b''>,
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([224, 224], dtype=int32)>,
 'path': <tf.Tensor: shape=(), dtype=string, numpy=b''>}

In [14]:
def _sanitize_target(image, target):
    """Remove unusable boxes from ``target`` and clip the rest to the image bounds.

    Steps, in order:

    1. Cast ``boxes`` to float32 ``[-1, 4]`` and ``labels`` to int32 ``[-1]``.
    2. Drop every box containing NaN or +/-Inf (and its label).
    3. Clip ``[x1, y1, x2, y2]`` to ``[0, W]`` / ``[0, H]``, where ``(H, W)``
       comes from ``target['orig_size']`` or, when that key is absent, from
       the first two dimensions of ``image``.
    4. Drop every box that is degenerate after clipping (``x2 <= x1`` or
       ``y2 <= y1``) together with its label.

    ``target['boxes']`` and ``target['labels']`` are overwritten in place;
    the same dict is returned. Other keys are left untouched.
    """
    # Cast first: tf.math.is_finite only accepts floating-point input, so
    # integer boxes would otherwise fail before they could be cast.
    boxes = tf.reshape(tf.cast(target['boxes'], tf.float32), [-1, 4])
    labels = tf.reshape(tf.cast(target['labels'], tf.int32), [-1])

    finite_mask = tf.reduce_all(tf.math.is_finite(boxes), axis=-1)
    boxes = tf.boolean_mask(boxes, finite_mask)
    labels = tf.boolean_mask(labels, finite_mask)

    # Resolve the clipping bounds as (H, W).
    if 'orig_size' in target:
        orig_size = tf.reshape(tf.convert_to_tensor(target['orig_size']), [2])
    else:
        orig_size = tf.shape(image)[0:2]
    orig_size = tf.cast(orig_size, tf.float32)

    # Index rather than tuple-unpack: iterating a tensor is not supported
    # inside a traced graph (tf.function / tf.data map).
    H = orig_size[0]
    W = orig_size[1]

    # Upper bound per column (x1, y1, x2, y2); broadcasts across the N rows.
    upper_bound = tf.stack([W, H, W, H])
    boxes = tf.clip_by_value(boxes, 0.0, upper_bound)

    x1, y1, x2, y2 = tf.unstack(boxes, num=4, axis=-1)
    non_degenerate = tf.math.logical_and(x2 > x1, y2 > y1)

    target['boxes'] = tf.boolean_mask(boxes, non_degenerate)
    target['labels'] = tf.boolean_mask(labels, non_degenerate)

    return target

In [15]:
# Random 224x224x3 test image: drawn as int32 (tf.uint8 is not requested directly here)
# and then cast down. No seed is passed, so pixel values differ between runs unless a
# global seed is set elsewhere.
image = tf.random.uniform(shape=(224, 224, 3), minval=0, maxval=255, dtype=tf.int32)
image = tf.cast(image, tf.uint8)  # typical "raw" image dtype

In [16]:
# Sanitizer fixture: two valid boxes that lie inside the 224x224 orig_size, so they
# should pass through unchanged. Note image_id is an integer tensor and path a Python str.
target_clean = {
    "boxes": tf.constant([
        [10.0,  20.0,  60.0,  80.0],   # valid
        [120.0, 40.0, 200.0, 210.0],   # valid
    ], dtype=tf.float32),
    "labels": tf.constant([3, 15], dtype=tf.int32),
    "image_id": tf.constant(42),
    "path": "VOCdevkit/VOC2007/JPEGImages/000042.jpg",
    "orig_size": tf.constant([224, 224], dtype=tf.int32),
}

In [17]:
# Sanitize the clean fixture; both boxes are expected to survive untouched.
_sanitize_target(image, target_clean)

{'boxes': <tf.Tensor: shape=(2, 4), dtype=float32, numpy=
 array([[ 10.,  20.,  60.,  80.],
        [120.,  40., 200., 210.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([ 3, 15], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=int32, numpy=42>,
 'path': 'VOCdevkit/VOC2007/JPEGImages/000042.jpg',
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([224, 224], dtype=int32)>}

In [18]:
# Sanitizer fixture: one valid box followed by three degenerate ones (inverted x,
# inverted y, zero width). Only the first box and its label should survive.
target_degenerate = {
    "boxes": tf.constant([
        [10.0,  20.0,  60.0,  80.0],   # valid
        [50.0,  30.0,  40.0,  90.0],   # x2 < x1  -> invalid
        [12.0, 100.0,  40.0,  90.0],   # y2 < y1  -> invalid
        [70.0,  70.0,  70.0, 120.0],   # x2 == x1 -> zero width -> invalid
    ], dtype=tf.float32),
    "labels": tf.constant([1, 2, 3, 4], dtype=tf.int32),
    "image_id": tf.constant(7),
    "orig_size": tf.constant([224, 224], dtype=tf.int32),
}

In [19]:
# Sanitize the degenerate fixture; only box 0 / label 1 should remain.
_sanitize_target(image, target_degenerate)

{'boxes': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[10., 20., 60., 80.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=int32, numpy=7>,
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([224, 224], dtype=int32)>}

In [20]:
# Sanitizer fixture: boxes that spill outside the 224x224 bounds. They should be
# clipped to the bounds rather than dropped, so all four labels are kept.
target_oob = {
    "boxes": tf.constant([
        [-10.0,  10.0,  50.0,  60.0],  # x1 < 0
        [ 20.0, -15.0, 100.0,  40.0],  # y1 < 0
        [150.0, 150.0, 300.0, 260.0],  # x2, y2 > bounds
        [-20.0, -20.0, 500.0, 500.0],  # huge overflow
    ], dtype=tf.float32),
    "labels": tf.constant([5, 6, 7, 8], dtype=tf.int32),
    "image_id": tf.constant(99),
    "orig_size": tf.constant([224, 224], dtype=tf.int32),
}

In [21]:
# Sanitize the out-of-bounds fixture; coordinates should come back clipped to [0, 224].
_sanitize_target(image, target_oob)

{'boxes': <tf.Tensor: shape=(4, 4), dtype=float32, numpy=
 array([[  0.,  10.,  50.,  60.],
        [ 20.,   0., 100.,  40.],
        [150., 150., 224., 224.],
        [  0.,   0., 224., 224.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(4,), dtype=int32, numpy=array([5, 6, 7, 8], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=int32, numpy=99>,
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([224, 224], dtype=int32)>}

In [22]:
# Sanitizer fixture: one valid box plus rows containing NaN / Inf. The non-finite
# rows should be removed together with their labels.
target_nan_inf = {
    "boxes": tf.constant([
        [10.0, 20.0, 60.0, 80.0],            # valid
        [float("nan"), 5.0, 20.0, 25.0],     # NaN
        [10.0, float("inf"), 20.0, 25.0],    # Inf
    ], dtype=tf.float32),
    "labels": tf.constant([1, 2, 3], dtype=tf.int32),
    "image_id": tf.constant(555),
    "orig_size": tf.constant([224, 224], dtype=tf.int32),
}

In [23]:
# Sanitize the NaN/Inf fixture; only box 0 / label 1 should remain.
_sanitize_target(image, target_nan_inf)

{'boxes': <tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[10., 20., 60., 80.]], dtype=float32)>,
 'labels': <tf.Tensor: shape=(1,), dtype=int32, numpy=array([1], dtype=int32)>,
 'image_id': <tf.Tensor: shape=(), dtype=int32, numpy=555>,
 'orig_size': <tf.Tensor: shape=(2,), dtype=int32, numpy=array([224, 224], dtype=int32)>}

In [24]:
def _validate_target(image, target, num_classes=None):
    """Assert that ``image`` and ``target`` satisfy the detection data contract.

    Checked, in order:

    * ``boxes`` is rank-2 ``[N, 4]``, ``labels`` is rank-1 ``[N]``, same ``N``.
    * ``boxes`` are finite, ordered (``x1 <= x2``, ``y1 <= y2``) and inside
      ``[0, W]`` x ``[0, H]``, with ``H``/``W`` taken from the shape of ``image``.
    * ``labels`` are int32 and ``>= 1`` (0 is background); when
      ``num_classes`` is given they must also be ``<= num_classes - 1``.
    * ``image_id`` is a scalar and ``orig_size`` has shape ``[2]``.
    * ``image`` is ``[H, W, 3]`` with finite values.

    ``target['path']`` is not checked.

    Args:
        image: Image tensor.
        target: Dict with ``boxes``, ``labels``, ``image_id`` and ``orig_size``.
        num_classes: Optional total class count including background. ``None``
            (the default) skips the upper-bound label check.

    Returns:
        None. A failed ``tf.debugging`` assertion raises.
    """
    boxes = target['boxes']
    labels = target['labels']

    # ---- structure of boxes / labels ----
    tf.debugging.assert_equal(tf.rank(boxes), 2, "boxes must be rank-2: [N,4]")
    tf.debugging.assert_equal(tf.shape(boxes)[-1], 4, "boxes last dim must be 4")
    tf.debugging.assert_equal(tf.rank(labels), 1, "labels must be rank-1: [N]")
    tf.debugging.assert_equal(tf.shape(boxes)[0], tf.shape(labels)[0], "boxes and labels must have same N")

    tf.debugging.assert_equal(tf.reduce_all(tf.math.is_finite(boxes)), True, "boxes contain NaN/Inf")

    # ---- geometry ----
    # Cast once so the comparisons against the float32 H / W below type-check.
    x1, y1, x2, y2 = tf.split(tf.cast(boxes, tf.float32), num_or_size_splits = 4, axis = -1)

    tf.debugging.assert_less_equal(x1, x2, message= " x1 <= x2 condition violated")
    tf.debugging.assert_less_equal(y1, y2, message= " y1 <= y2 condition violated")

    # NOTE(review): bounds come from the image shape, not target['orig_size'];
    # confirm that is intended if the two can ever differ.
    image_shape = tf.shape(image)
    H = tf.cast(image_shape[0], tf.float32)
    W = tf.cast(image_shape[1], tf.float32)

    xs = tf.concat([x1, x2], axis = -1)
    ys = tf.concat([y1, y2], axis = -1)
    x_validity = tf.reduce_all(tf.math.logical_and(xs >= 0, xs <= W))
    y_validity = tf.reduce_all(tf.math.logical_and(ys >= 0, ys <= H))

    tf.debugging.assert_equal(x_validity, tf.constant(True, tf.bool), message = "Failed to validate x conditions")
    tf.debugging.assert_equal(y_validity, tf.constant(True, tf.bool), message = "Failed to validate y conditions")

    # ---- labels ----
    tf.debugging.assert_type(labels, tf.int32)
    tf.debugging.assert_greater_equal(tf.reduce_min(labels), 1, "labels must be >= 1 (0 is background)")
    if num_classes is not None:
        tf.debugging.assert_less_equal(tf.reduce_max(labels), num_classes - 1, "label id out of range")

    # ---- metadata ----
    tf.debugging.assert_equal(tf.rank(target["image_id"]), 0, f"image_id must be scalar, {target['image_id']}, got rank: {tf.rank(target['image_id'])}")
    tf.debugging.assert_equal(tf.rank(target["orig_size"]), 1, "orig_size must be rank-1")
    tf.debugging.assert_equal(tf.shape(target["orig_size"])[0], 2, "orig_size must be [2] (H,W)")

    # ---- image ----
    tf.debugging.assert_equal(tf.rank(image), 3, "image must be [H,W,3]")
    tf.debugging.assert_equal(tf.shape(image)[-1], 3, "image must have 3 channels")
    tf.debugging.assert_equal(tf.math.reduce_all(tf.math.is_finite(tf.cast(image, dtype=tf.float32))), tf.constant(True, dtype= tf.bool), message = "The image intensities are not finite")

In [25]:
# Validate the clean fixture against the 224x224 random image. _validate_target
# returns nothing, so an empty cell output means every assertion passed.
_validate_target(image,target_clean)