In [35]:
import os
# Select the 'cuda_malloc_async' allocator through the TF_GPU_ALLOCATOR
# environment variable. NOTE(review): presumably this has to be set before
# TensorFlow is first imported in this kernel -- confirm.
os.environ.update({'TF_GPU_ALLOCATOR': 'cuda_malloc_async'})

In [36]:
import numpy as np
from PIL import Image
from typing import Any
from pathlib import Path
import hashlib
import json
import xml.etree.ElementTree as ET

In [37]:
from mobilenetv2ssd.core.config import load_config

In [38]:
# Relative paths of the four YAML config files (train, model, data, eval).
# They are passed, in this order, to load_config() in the next cell.
main_cfg_path = "configs/train/default.yaml"
model_cfg_path = "configs/model/mobilenetv2_ssd_voc.yaml"
data_cfg_path = "configs/data/voc_224.yaml"
eval_cfg_path = "configs/eval/default.yaml"

In [39]:
# Load the train/model/data/eval config files into a single object.
# Later cells index the result like a nested mapping, e.g. config['data']['root'].
config = load_config(main_cfg_path,model_cfg_path,data_cfg_path,eval_cfg_path)

In [40]:
from datasets.base import BaseDetectionDataset, DetectionSample

In [41]:
class VOCDataset(BaseDetectionDataset):
    """Pascal VOC-style detection dataset read from a directory on disk.

    The dataset root must contain three sub-directories:

    * ``JPEGImages/``     -- one ``<image_id>.jpg`` per image
    * ``Annotations/``    -- one ``<image_id>.xml`` per image
    * ``ImageSets/Main/`` -- ``<split>.txt`` files listing the image ids

    The methods below read ``self.root``, ``self.split``,
    ``self.use_difficult`` and ``self.class_to_index``. None of them is
    assigned in this class, so they are presumably set up by
    ``BaseDetectionDataset.__init__`` (defined outside this notebook --
    confirm there).
    """

    def __init__(self, root: str | Path, split: str, classes_file: str | Path, use_difficult: bool = False):
        """Validate the directory layout and load the image ids of ``split``.

        Args:
            root: Dataset root directory.
            split: Split name; ``ImageSets/Main/<split>.txt`` must exist.
            classes_file: Path forwarded to the base class initializer.
            use_difficult: If False, objects flagged ``difficult`` in the
                annotations are skipped.

        Raises:
            FileNotFoundError: A required directory or the split file is missing.
            ValueError: The split file lists no image ids.
        """
        super().__init__(root, split, classes_file, use_difficult)

        self.jpeg_dir = self.root / "JPEGImages"
        self.annotation_dir = self.root / "Annotations"
        self.split_dir = self.root / "ImageSets" / "Main"

        # Fail early, with a precise message, before touching any sample.
        self._validate_paths()

        self.image_ids = self._load_image_ids()

        if len(self.image_ids) == 0:
            raise ValueError(f"No images found for split '{split}'")

    def _validate_paths(self) -> None:
        """Check that the three required sub-directories exist.

        Raises:
            FileNotFoundError: A required path is missing or is not a directory.
        """
        required = (
            ("JPEGImages", self.jpeg_dir),
            ("Annotations", self.annotation_dir),
            ("ImageSets/Main", self.split_dir),
        )
        for label, directory in required:
            # is_dir() rather than exists(): a regular file with the same name
            # must not be accepted as the directory.
            if not directory.is_dir():
                raise FileNotFoundError(f"{label} directory not found: {directory}")

    def _load_image_ids(self) -> list[str]:
        """Read the image ids listed in ``ImageSets/Main/<split>.txt``.

        Blank lines are ignored. Only the first whitespace-separated token of
        each line is kept, so any extra columns after the id are dropped.

        Raises:
            FileNotFoundError: The split file does not exist.
        """
        split_file = self.split_dir / f"{self.split}.txt"

        if not split_file.is_file():
            raise FileNotFoundError(f"Split file not found: {split_file}")

        with open(split_file, "r") as file:
            # A line with non-whitespace content always yields at least one token.
            return [line.split()[0] for line in file if line.strip()]

    def __len__(self) -> int:
        """Return the number of image ids in the split."""
        return len(self.image_ids)

    def _load_image(self, path: Path) -> np.ndarray:
        """Load an image as an RGB ``float32`` array.

        Pixel values are not rescaled: they keep the 0-255 range produced by
        the decoder.

        Raises:
            FileNotFoundError: ``path`` does not exist.
        """
        if not path.exists():
            raise FileNotFoundError(f"Image not found: {path}")

        # Image.open is lazy; the context manager guarantees the underlying
        # file handle is released even if decoding fails part-way.
        with Image.open(path) as image:
            return np.array(image.convert("RGB"), dtype=np.float32)

    def _parse_annotation(self, path: Path) -> tuple[np.ndarray, np.ndarray]:
        """Parse one VOC XML annotation into box and label arrays.

        An ``<object>`` is skipped when its class name is missing or unknown,
        when it is flagged difficult and ``use_difficult`` is False, when it
        has no ``<bndbox>``, when a coordinate cannot be parsed, or when the
        box is degenerate (``xmax <= xmin`` or ``ymax <= ymin``) or contains
        NaN.

        Returns:
            ``(boxes, labels)``: ``boxes`` is ``float32`` of shape ``(N, 4)``
            holding ``[xmin, ymin, xmax, ymax]`` exactly as written in the
            XML; ``labels`` is ``int32`` of shape ``(N,)``. Both are empty
            (``N == 0``) when the file is missing or no object survives.

        Raises:
            xml.etree.ElementTree.ParseError: The file is not well-formed XML.
        """
        if not path.exists():
            # Best effort: a missing annotation file is treated as "no objects".
            return np.zeros((0, 4), dtype=np.float32), np.zeros((0,), dtype=np.int32)

        root = ET.parse(path).getroot()

        boxes = []
        labels = []

        for obj in root.findall("object"):

            # Class name: must be present and known to the class list.
            name = (obj.findtext("name") or "").strip()
            if not name or name not in self.class_to_index:
                continue

            # Difficult flag: a missing, empty or malformed value counts as
            # "not difficult" instead of aborting the whole sample.
            try:
                difficult = int(obj.findtext("difficult") or "0")
            except ValueError:
                difficult = 0
            if difficult and not self.use_difficult:
                continue

            # Bounding box
            bbox = obj.find("bndbox")
            if bbox is None:
                continue

            try:
                x1 = float(bbox.findtext("xmin") or 0)
                y1 = float(bbox.findtext("ymin") or 0)
                x2 = float(bbox.findtext("xmax") or 0)
                y2 = float(bbox.findtext("ymax") or 0)
            except (ValueError, TypeError):
                continue

            # Written as a positive test so that NaN coordinates (for which
            # every comparison is False) are rejected along with degenerate boxes.
            if not (x2 > x1 and y2 > y1):
                continue

            boxes.append([x1, y1, x2, y2])
            labels.append(self.class_to_index[name])

        # reshape(-1, 4) keeps the (0, 4) shape when no object survived.
        return (
            np.array(boxes, dtype=np.float32).reshape(-1, 4),
            np.array(labels, dtype=np.int32),
        )

    def _load_sample(self, index: int) -> DetectionSample:
        """Load the image and annotation of the ``index``-th image id.

        Returns:
            A ``DetectionSample`` whose ``orig_size`` is the first two
            dimensions of the image array, i.e. ``(height, width)``.

        Raises:
            IndexError: ``index`` is outside the id list.
            FileNotFoundError: The image file is missing.
        """
        image_id = self.image_ids[index]

        # Image
        image_path = self.jpeg_dir / f"{image_id}.jpg"
        image = self._load_image(image_path)

        # Annotation (a missing XML file yields empty boxes/labels)
        annotation_path = self.annotation_dir / f"{image_id}.xml"
        boxes, labels = self._parse_annotation(annotation_path)

        return DetectionSample(
            image = image,
            boxes = boxes,
            labels = labels,
            image_id = image_id,
            path = str(image_path),
            orig_size = image.shape[:2]
        )

In [42]:
# Display the class-list path that is handed to VOCDataset as `classes_file` below.
config['data']['classes_file']

'/mnt/d/dev/MobileNetV2-SSD/datasets/VOCdevkit/labels/voc_labels.txt'

In [43]:
# Smoke test: construct the train split once to confirm the directory and
# split-file validation passes. The instance is not kept; the next cell binds
# an identical one to `data`.
VOCDataset(root = config['data']['root'], split = "train", classes_file = config['data']['classes_file'], use_difficult = False)

<__main__.VOCDataset at 0x79c38034e620>

In [44]:
# Train-split dataset used by the inspection cells below; objects flagged
# `difficult` are skipped (use_difficult = False).
data = VOCDataset(root = config['data']['root'], split = "train", classes_file = config['data']['classes_file'], use_difficult = False)

In [45]:
# List the class names known to the dataset. `class_names` is not assigned in
# VOCDataset itself -- presumably it is populated by the base class from
# `classes_file` (confirm in datasets.base).
data.class_names

['aeroplane',
 'bicycle',
 'bird',
 'boat',
 'bottle',
 'bus',
 'car',
 'cat',
 'chair',
 'cow',
 'diningtable',
 'dog',
 'horse',
 'motorbike',
 'person',
 'pottedplant',
 'sheep',
 'sofa',
 'train',
 'tvmonitor']

In [46]:
# Inspect one sample by calling the private loader directly.
# NOTE(review): the repr prints the image array along with the other fields,
# which makes the output long.
data._load_sample(index = 1)

DetectionSample(image=array([[[ 82., 108., 131.],
        [ 87., 113., 136.],
        [ 93., 120., 141.],
        ...,
        [160., 105.,  51.],
        [160., 105.,  51.],
        [158., 103.,  49.]],

       [[ 89., 115., 138.],
        [ 91., 117., 140.],
        [ 91., 118., 139.],
        ...,
        [159., 104.,  50.],
        [158., 103.,  49.],
        [155., 100.,  46.]],

       [[ 93., 119., 142.],
        [ 91., 117., 140.],
        [ 87., 114., 135.],
        ...,
        [158., 103.,  49.],
        [155., 100.,  46.],
        [152.,  97.,  43.]],

       ...,

       [[ 46.,  40.,  50.],
        [ 40.,  33.,  41.],
        [ 32.,  22.,  31.],
        ...,
        [ 43.,  12.,   7.],
        [ 46.,  15.,  10.],
        [ 49.,  18.,  13.]],

       [[ 53.,  47.,  59.],
        [ 48.,  42.,  52.],
        [ 50.,  43.,  51.],
        ...,
        [ 48.,  15.,  10.],
        [ 51.,  18.,  13.],
        [ 54.,  21.,  16.]],

       [[ 63.,  57.,  71.],
        [ 57.,  51.,  

In [47]:
# Number of image ids read from the train split file (VOCDataset.__len__).
len(data)

5717

In [48]:
# Inspect a second sample (index 10) through the private loader.
data._load_sample(index = 10)

DetectionSample(image=array([[[ 46.,  44.,  32.],
        [ 47.,  48.,  32.],
        [ 44.,  46.,  32.],
        ...,
        [ 14.,  23.,  30.],
        [ 12.,  20.,  22.],
        [ 15.,  23.,  25.]],

       [[ 39.,  39.,  29.],
        [ 22.,  26.,  11.],
        [ 13.,  24.,   7.],
        ...,
        [  6.,  11.,  17.],
        [  6.,  15.,  20.],
        [  9.,  23.,  32.]],

       [[ 10.,  17.,  10.],
        [ 19.,  27.,  14.],
        [ 23.,  31.,  18.],
        ...,
        [ 68.,  73.,  69.],
        [ 19.,  22.,  13.],
        [ 12.,  21.,  16.]],

       ...,

       [[106., 104., 109.],
        [111., 113., 110.],
        [108., 113., 106.],
        ...,
        [ 12.,   8.,   7.],
        [ 12.,  11.,   9.],
        [ 10.,   9.,   5.]],

       [[106., 106., 108.],
        [111., 113., 110.],
        [105., 112., 105.],
        ...,
        [ 12.,   9.,   2.],
        [ 13.,  10.,   3.],
        [ 11.,  11.,   3.]],

       [[109., 109., 111.],
        [107., 112., 1

In [49]:
def build_voc_dataset_config(
    root: str | Path,
    split: str,
    classes_file: str | Path,
    use_difficult: bool = False,
):
    """Build a ``VOCDataset`` from explicit arguments.

    Args:
        root: Dataset root directory.
        split: Name of the split to load.
        classes_file: Path to the class-list file.
        use_difficult: Forwarded unchanged to ``VOCDataset``.

    Returns:
        The newly constructed ``VOCDataset`` instance.
    """
    dataset_kwargs = {
        "root": root,
        "split": split,
        "classes_file": classes_file,
        "use_difficult": use_difficult,
    }
    return VOCDataset(**dataset_kwargs)

In [50]:
# Build the same train-split dataset through the factory function defined above.
dataset = build_voc_dataset_config(root = config['data']['root'], split = "train", classes_file = config['data']['classes_file'], use_difficult = False)

In [51]:
# Index the dataset through the public interface. `__getitem__` is not defined
# in VOCDataset, so it presumably comes from the base class; the visible part
# of the output matches `data._load_sample(index = 1)` above.
dataset[1]

DetectionSample(image=array([[[ 82., 108., 131.],
        [ 87., 113., 136.],
        [ 93., 120., 141.],
        ...,
        [160., 105.,  51.],
        [160., 105.,  51.],
        [158., 103.,  49.]],

       [[ 89., 115., 138.],
        [ 91., 117., 140.],
        [ 91., 118., 139.],
        ...,
        [159., 104.,  50.],
        [158., 103.,  49.],
        [155., 100.,  46.]],

       [[ 93., 119., 142.],
        [ 91., 117., 140.],
        [ 87., 114., 135.],
        ...,
        [158., 103.,  49.],
        [155., 100.,  46.],
        [152.,  97.,  43.]],

       ...,

       [[ 46.,  40.,  50.],
        [ 40.,  33.,  41.],
        [ 32.,  22.,  31.],
        ...,
        [ 43.,  12.,   7.],
        [ 46.,  15.,  10.],
        [ 49.,  18.,  13.]],

       [[ 53.,  47.,  59.],
        [ 48.,  42.,  52.],
        [ 50.,  43.,  51.],
        ...,
        [ 48.,  15.,  10.],
        [ 51.,  18.,  13.],
        [ 54.,  21.,  16.]],

       [[ 63.,  57.,  71.],
        [ 57.,  51.,  