In [1]:
import os
# Set in the very first cell, i.e. before TensorFlow is imported in the next one.
# NOTE(review): presumably TensorFlow only honours this variable if it is set
# before the GPU runtime is initialised -- confirm against the TF docs.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [2]:
import tensorflow as tf
from typing import Any
from pathlib import Path
import hashlib
import json
import xml.etree.ElementTree as ET

2026-01-13 22:35:27.520371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2026-01-13 22:35:27.540239: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2026-01-13 22:35:27.545827: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2026-01-13 22:35:27.562080: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from mobilenetv2ssd.core.config import load_config

In [4]:
# Locations of the four YAML files (train, model, data, eval) that are handed
# to load_config in the next cell. The paths are relative, so presumably they
# resolve against the kernel's working directory -- confirm in load_config.
main_cfg_path = "configs/train/default.yaml"
model_cfg_path = "configs/model/mobilenetv2_ssd_voc.yaml"
data_cfg_path = "configs/data/voc_224.yaml"
eval_cfg_path = "configs/eval/default.yaml"

In [5]:
# Build the configuration object from the four YAML paths defined above.
# It is indexed like a dict in the following cells (e.g. config['data']).
config = load_config(main_cfg_path,model_cfg_path,data_cfg_path,eval_cfg_path)

In [6]:
# Inspect the 'data' section of the loaded config (the part VOCDataset reads).
config['data']

{'dataset_name': 'voc',
 'root': '/mnt/d/dev/MobileNetV2-SSD/datasets/VOCdevkit/VOC2012',
 'train_split': 'trainval',
 'val_split': 'val',
 'input_size': [224, 224],
 'num_workers': 4,
 'shuffle_buffer': 1000,
 'prefetch_batches': 2,
 'standardize': {'boxes_format_in': 'xyxy_pixels',
  'boxes_format_out': 'xyxy_pixels',
  'image_rgb': True,
  'to_float32': True,
  'scale': '0_1'},
 'preprocess': {'standardize_pipeline': ['to_float32', 'scale_01'],
  'pipeline': ['resize', 'sanitize_boxes', 'normalize'],
  'params': {'resize': {'enabled': True,
    'size': [300, 300],
    'mode': 'stretch',
    'interp': 'bilinear'},
   'sanitize_boxes': {'enabled': True,
    'clip': True,
    'min_size': 1,
    'min_size_mode': 'pixels'},
   'normalize': {'enabled': True,
    'mean': [0.485, 0.456, 0.406],
    'std': [0.229, 0.224, 0.225]}}},
 'augment': {'enabled': True,
  'output_box_norm': True,
  'pipeline': ['photometric_distort',
   'random_expand',
   'random_iou_crop',
   'random_flip'],
  'par

In [7]:
from datasets.base import BaseDetectionDataset

In [8]:
class VOCDataset(BaseDetectionDataset):
    """Pascal VOC style detection dataset read from a VOCdevkit directory tree.

    Under ``config['data']['root']`` the class expects ``JPEGImages/<id>.jpg``,
    ``Annotations/<id>.xml`` and ``ImageSets/Main/<split_name>.txt``.

    ``self._name_to_id`` (class name -> integer label) is used but not created
    here; it is presumably provided by ``BaseDetectionDataset`` -- confirm there.
    """

    def __init__(self, config: dict[str,Any], split: str, transform = None):
        """Resolve the dataset directories and read the image ids of one split.

        Args:
            config: Configuration dict. Reads ``config['data']['root']``,
                ``['train_split']``, ``['val_split']`` and the optional
                ``['use_difficult']`` flag (defaults to ``False``).
            split: ``"train"``, ``"trainval"`` or ``"train_val"`` select the
                configured train split; ``"val"`` or ``"validation"`` select
                the configured validation split.
            transform: Forwarded unchanged to ``BaseDetectionDataset``.

        Raises:
            ValueError: If ``split`` is not one of the names listed above.
        """
        super().__init__(config, split, transform)

        self._root = Path(config['data']['root'])
        self._train_split = config['data']['train_split']
        self._val_split = config['data']['val_split']
        self._use_difficult = bool(config["data"].get("use_difficult", False))

        # Directory layout below the dataset root
        self._jpeg_dir = self._root / "JPEGImages"
        self._annotation_dir = self._root / "Annotations"
        self._split_dir = self._root / "ImageSets" / "Main"

        # Map the caller-facing split alias onto the split name from the config
        if split in ("train", "trainval", "train_val"):
            split_name = self._train_split
        elif split in ("val", "validation"):
            split_name = self._val_split
        else:
            raise ValueError("Wrong Split Name Given")

        # One image id per line in ImageSets/Main/<split_name>.txt
        self._split_file = self._split_dir / f"{split_name}.txt"

        self._ids = self.read_ids(self._split_file)

    def __len__(self):
        """Return the number of image ids read from the split file."""
        return len(self._ids)

    def _create_hash_signature(self, attributes: dict[str,Any]):
        """Return the MD5 hex digest of ``attributes`` serialised as key-sorted JSON."""
        serialized = json.dumps(attributes, sort_keys=True).encode()
        return hashlib.md5(serialized).hexdigest()

    def _load_raw_sample(self, index: int):
        """Load one un-preprocessed sample: the decoded image plus its annotations.

        Args:
            index: Position of the sample in the split's id list.

        Returns:
            ``(image, target)``. ``image`` is the RGB picture as a ``tf.uint8``
            tensor. ``target`` is a dict with the keys ``boxes`` (list of
            ``[xmin, ymin, xmax, ymax]`` floats as written in the XML),
            ``labels`` (list of int class ids), ``path``, ``image_id``,
            ``hash_signature`` and ``orig_size`` (``tf.int32`` tensor holding
            ``[width, height]``).

        Raises:
            IndexError: If ``index`` is out of range.
            ValueError: If the annotated image size is zero, or an object has a
                class name that is missing from ``self._name_to_id``.
        """
        image_id = self.get_image_id(index)

        jpeg_path = str(self._jpeg_dir / f"{image_id}.jpg")
        xml_path = str(self._annotation_dir / f"{image_id}.xml")

        # Keep the raw pixel values here; preprocessing is applied later.
        image = tf.keras.utils.load_img(jpeg_path, color_mode="rgb")
        image = tf.keras.utils.img_to_array(image)
        image = tf.convert_to_tensor(image, dtype=tf.uint8)

        # Reading the XML annotations
        root = ET.parse(xml_path).getroot()

        size = root.find('size')
        if size is None:
            # No <size> element: fall back to the decoded image dimensions.
            height, width = int(image.shape[0]), int(image.shape[1])
        else:
            width = int(size.findtext("width") or 0)
            height = int(size.findtext("height") or 0)

        if width == 0 or height == 0:
            # Report both values: either of them can be the offending one.
            raise ValueError(
                f"Invalid image size (width={width}, height={height}) in {xml_path}"
            )

        boxes = []
        labels = []

        for annotation_obj in root.findall('object'):
            name = (annotation_obj.findtext("name", "") or "").strip()
            if not name:
                continue

            if name not in self._name_to_id:
                raise ValueError(f"Unknown class '{name}' in {xml_path}")

            # The difficult flag is only used for filtering; it is not returned.
            difficult = int(annotation_obj.findtext("difficult","0") or "0")
            if (not self._use_difficult) and difficult == 1:
                continue

            bbox = annotation_obj.find("bndbox")
            if bbox is None:
                continue

            # NOTE(review): a missing coordinate tag falls back to NaN and the
            # box is still emitted; presumably a later sanitising step removes
            # such boxes -- confirm.
            x1 = float(bbox.findtext("xmin", "nan"))
            y1 = float(bbox.findtext("ymin", "nan"))
            x2 = float(bbox.findtext("xmax", "nan"))
            y2 = float(bbox.findtext("ymax", "nan"))

            boxes.append([x1,y1,x2,y2])
            labels.append(int(self._name_to_id[name]))

        # Fingerprint of the annotation content and the image identity
        hash_signature_attributes = {
            'boxes' : boxes,
            'labels' : labels,
            'path': jpeg_path,
            'image_id': image_id,
            'width': width,
            'height': height
        }

        hash_signature = self._create_hash_signature(hash_signature_attributes)

        target = {
            'boxes' : boxes,
            'labels' : labels,
            'path': jpeg_path,
            'image_id': image_id,
            'hash_signature': hash_signature,
            'orig_size': tf.constant([width, height], dtype= tf.int32)
        }

        return image, target

    def get_image_id(self, index: int):
        """Return the image id stored at ``index``.

        Raises:
            IndexError: If ``index`` is negative or not smaller than ``len(self)``.
        """
        if index < 0 or index >= len(self._ids):
            raise IndexError("Index length is out of bounds")
        return self._ids[index]

    def read_ids(self, file_path: str | Path):
        """Read the image ids listed in a split file.

        Each non-blank line contributes its first whitespace-delimited token,
        so lines of the form ``<id> <flag>`` work whether the columns are
        separated by spaces or by tabs.

        Args:
            file_path: Path to the split ``.txt`` file.

        Returns:
            List of image id strings in file order.
        """
        with open(file_path, "r") as f:
            return [line.split()[0] for line in f if line.strip()]
        

In [9]:
# Instantiate the validation split ("val" maps to config['data']['val_split']).
data = VOCDataset(config, "val")

In [10]:
# Inspect the class-name -> label-id mapping. VOCDataset does not set it
# itself; presumably it is built by BaseDetectionDataset -- confirm there.
data._name_to_id

{'aeroplane': 1,
 'bicycle': 2,
 'bird': 3,
 'boat': 4,
 'bottle': 5,
 'bus': 6,
 'car': 7,
 'cat': 8,
 'chair': 9,
 'cow': 10,
 'diningtable': 11,
 'dog': 12,
 'horse': 13,
 'motorbike': 14,
 'person': 15,
 'pottedplant': 16,
 'sheep': 17,
 'sofa': 18,
 'train': 19,
 'tvmonitor': 20,
 'background': 0}

In [11]:
# Smoke-test the raw loader on one validation sample; it returns (image, target).
data._load_raw_sample(index = 1)

I0000 00:00:1768361730.173758   14531 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1768361730.266523   14531 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1768361730.266600   14531 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1768361730.268383   14531 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1768361730.268487   14531 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

(<tf.Tensor: shape=(333, 500, 3), dtype=uint8, numpy=
 array([[[123, 142, 148],
         [125, 144, 151],
         [129, 147, 159],
         ...,
         [110, 102, 126],
         [112, 103, 130],
         [115, 106, 133]],
 
        [[128, 147, 153],
         [130, 149, 156],
         [132, 150, 162],
         ...,
         [106, 107, 127],
         [106, 107, 128],
         [107, 108, 129]],
 
        [[133, 152, 159],
         [134, 153, 160],
         [136, 154, 164],
         ...,
         [ 96, 102, 118],
         [ 96, 102, 118],
         [ 95, 101, 117]],
 
        ...,
 
        [[ 96, 112, 138],
         [ 98, 114, 139],
         [107, 123, 146],
         ...,
         [ 12,  15,   8],
         [  6,   8,   3],
         [  3,   5,   0]],
 
        [[ 94, 110, 136],
         [ 95, 111, 136],
         [102, 118, 141],
         ...,
         [ 11,  13,   8],
         [  6,   8,   3],
         [  3,   5,   2]],
 
        [[104, 120, 146],
         [101, 117, 142],
         [102,

In [12]:
# Number of image ids read from the validation split file.
len(data)

5823

In [13]:
# Load a second validation sample to check the loader on a different image.
data._load_raw_sample(index = 10)

(<tf.Tensor: shape=(375, 500, 3), dtype=uint8, numpy=
 array([[[ 69, 119, 154],
         [ 66, 118, 155],
         [ 67, 121, 159],
         ...,
         [143, 196, 227],
         [144, 198, 226],
         [145, 199, 225]],
 
        [[ 70, 120, 157],
         [ 68, 120, 159],
         [ 67, 121, 159],
         ...,
         [145, 198, 229],
         [145, 198, 229],
         [145, 199, 227]],
 
        [[ 71, 120, 160],
         [ 70, 122, 162],
         [ 69, 121, 161],
         ...,
         [146, 199, 231],
         [146, 199, 230],
         [146, 199, 230]],
 
        ...,
 
        [[ 34,  40,  36],
         [ 33,  39,  35],
         [ 34,  40,  36],
         ...,
         [ 21,  30,  27],
         [ 22,  31,  28],
         [ 22,  31,  28]],
 
        [[ 41,  50,  45],
         [ 41,  50,  45],
         [ 41,  47,  43],
         ...,
         [ 24,  28,  27],
         [ 25,  29,  28],
         [ 25,  29,  28]],
 
        [[ 37,  43,  39],
         [ 39,  45,  41],
         [ 39,

In [14]:
def build_voc_dataset_config(config: dict[str, Any]):
    """Extract the dataset-related options from ``config['data']``.

    Only the keys listed below are copied; every key that is absent from
    ``config['data']`` is filled with the fallback shown next to it.

    Args:
        config: Configuration dict that must contain a ``'data'`` entry.

    Returns:
        A new dict with the keys ``dataset_name``, ``root``, ``train_split``,
        ``val_split``, ``input_size``, ``num_workers``, ``augment``,
        ``normalization`` and ``classes_file``, in that order.
    """
    data_section = config['data']

    # (key, fallback) pairs, listed in the order they appear in the result
    fallbacks = (
        ('dataset_name', 'voc'),
        ('root', ''),
        ('train_split', 'train'),
        ('val_split', 'val'),
        ('input_size', [300,300]),
        ('num_workers', 4),
        ('augment', {}),
        ('normalization', {}),
        ('classes_file', ''),
    )

    return {key: data_section.get(key, fallback) for key, fallback in fallbacks}

In [15]:
# Display the dataset options extracted from the loaded config.
build_voc_dataset_config(config)

{'dataset_name': 'voc',
 'root': '/mnt/d/dev/MobileNetV2-SSD/datasets/VOCdevkit/VOC2012',
 'train_split': 'trainval',
 'val_split': 'val',
 'input_size': [224, 224],
 'num_workers': 4,
 'augment': {'enabled': True,
  'output_box_norm': True,
  'pipeline': ['photometric_distort',
   'random_expand',
   'random_iou_crop',
   'random_flip'],
  'params': {'random_flip': {'enabled': True,
    'prob': 0.5,
    'direction': 'horizontal'},
   'random_iou_crop': {'enabled': False,
    'prob': 1.0,
    'min_iou_choices': [0.1, 0.3, 0.5, 0.7, 0.9, None],
    'min_scale': 0.3,
    'max_scale': 1.0,
    'max_attempts': 50,
    'fallback': 'original'},
   'random_expand': {'enabled': False,
    'prob': 0.5,
    'max_ratio': 4.0,
    'fill': 'mean',
    'value': [0.485, 0.456, 0.406]},
   'photometric_distort': {'enabled': False,
    'prob': 0.5,
    'brightness': 0.125,
    'contrast': [0.5, 1.5],
    'saturation': [0.5, 1.5],
    'hue': 0.05,
    'random_order': True}}},
 'normalization': {},
 'cla

In [16]:
def build_voc_dataset(config: dict[str, Any], split: str, transform: Any = None):
    """Construct a ``VOCDataset`` for the requested split.

    Args:
        config: Configuration dict passed through to ``VOCDataset``.
        split: Split name passed through to ``VOCDataset``.
        transform: Optional transform passed through to ``VOCDataset``.
            Defaults to ``None``, the same default ``VOCDataset.__init__`` uses.

    Returns:
        The constructed ``VOCDataset`` instance.
    """
    return VOCDataset(config, split = split, transform = transform)

In [17]:
# Build the training split through the factory function, without a transform.
dataset = build_voc_dataset(config, "train", transform = None)

In [18]:
# Fetch the first training sample via indexing. __getitem__ is not defined in
# VOCDataset; presumably it is inherited from BaseDetectionDataset -- confirm.
dataset[0]

(<tf.Tensor: shape=(375, 500, 3), dtype=uint8, numpy=
 array([[[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        ...,
 
        [[  0,   0,   0],
         [  1,   1,   1],
         [  2,   2,   2],
         ...,
         [  2,   2,   2],
         [  2,   2,   2],
         [  2,   2,   2]],
 
        [[  1,   1,   1],
         [  1,   1,   1],
         [  2,   2,   2],
         ...,
         [  2,   2,   2],
         [  2,   2,   2],
         [  2,   2,   2]],
 
        [[  1,   1,   1],
         [  2,   2,   2],
         [  2,

In [19]:
# Load the same training sample through the raw loader, for comparison with
# the dataset[0] result in the previous cell.
dataset._load_raw_sample(index = 0)

(<tf.Tensor: shape=(375, 500, 3), dtype=uint8, numpy=
 array([[[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        [[255, 255, 255],
         [255, 255, 255],
         [255, 255, 255],
         ...,
         [255, 255, 255],
         [255, 255, 255],
         [255, 255, 255]],
 
        ...,
 
        [[  0,   0,   0],
         [  1,   1,   1],
         [  2,   2,   2],
         ...,
         [  2,   2,   2],
         [  2,   2,   2],
         [  2,   2,   2]],
 
        [[  1,   1,   1],
         [  1,   1,   1],
         [  2,   2,   2],
         ...,
         [  2,   2,   2],
         [  2,   2,   2],
         [  2,   2,   2]],
 
        [[  1,   1,   1],
         [  2,   2,   2],
         [  2,