In [1]:
import os
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [2]:
from typing import List, Dict, Any
import tensorflow as tf
import numpy as np

2025-12-09 17:06:53.896968: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-09 17:06:53.932655: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-09 17:06:53.941981: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-09 17:06:54.010795: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from abc import ABC, abstractmethod

## Implementing the helper functions

In [4]:
def box_iou_xyxy(box: np.ndarray, boxes: np.ndarray) -> np.ndarray:
    """
    Compute IoU between a single box and an array of boxes.
    
    Args:
        box:   shape (4,)  [x1, y1, x2, y2]
        boxes: shape (N,4) [x1, y1, x2, y2] for each box

    Returns:
        ious: shape (N,) IoU between `box` and each of `boxes`
    """
    box = np.asarray(box, dtype=np.float32)
    boxes = np.asarray(boxes, dtype=np.float32)

    if boxes.size == 0:
        return np.zeros((0,), dtype=np.float32)

    # Intersection coords
    x1 = np.maximum(box[0], boxes[:, 0])
    y1 = np.maximum(box[1], boxes[:, 1])
    x2 = np.minimum(box[2], boxes[:, 2])
    y2 = np.minimum(box[3], boxes[:, 3])

    inter_w = np.maximum(0.0, x2 - x1)
    inter_h = np.maximum(0.0, y2 - y1)
    inter = inter_w * inter_h

    # Areas
    area_box = (box[2] - box[0]) * (box[3] - box[1])
    area_boxes = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])

    # Union
    union = area_box + area_boxes - inter
    union = np.maximum(union, 1e-6)  # avoid division by zero

    return inter / union


## Implementing the Base Class For Metrics

In [5]:
class BaseMetric(ABC):
    def __init__(self, name:str):
        self._name = name
        
    @abstractmethod
    def update(self,preds: list[Dict[str,Any]], ground_truth: list[Dict[str,Any]]):
        pass
        
    @abstractmethod
    def compute(self):
        pass

    def reset(self):
        pass
        
    @property
    def name(self):
        return self._name

## Implmenting the Derived Classes

In [6]:
class VOCMAP(BaseMetric):
    def __init__(self, iou_thresh,num_classes: int,name: str):
        super().__init__(name = name)
        
        if isinstance(iou_thresh, (list, tuple)):
            self.iou_thresh = [float(t) for t in iou_thresh]
        else:
            self.iou_thresh = [float(iou_thresh)]
            
        self.num_classes = num_classes
        # Initialize the pred & gt lists
        self._preds = []
        self._ground_truth = []

    def reset(self):
        self._preds = []
        self._ground_truth = []

    def update(self,preds,ground_truth):
        for pred in preds:
            self._preds.append(
                (pred['image_id'], pred['boxes'], pred['scores'], pred['labels'])
            )

        for gt in ground_truth:
            self._ground_truth.append(
                (gt['image_id'], gt['boxes'], gt['labels'])
            )

    def compute(self):
        if len(self._ground_truth) == 0:
            results = {}
            for t in self.iou_thresh:
                results[f"mAP@{t}"] = 0.0
            return results

        combined = {}
        for t in self.iou_thresh:
            stats_t = self._compute_for_single_iou(t)
            combined.update(stats_t)

        return combined

    def _compute_for_single_iou(self, iou_thr):
    
        # GT Structures per class
        ground_truth_per_class = {c: {} for c in range(self.num_classes)}
        num_pos_per_class = {c: 0 for c in range(self.num_classes)}

        for image_id, gt_boxes, gt_labels in self._ground_truth:
            # Copying the boxes for calculations
            gt_boxes = np.asarray(gt_boxes,dtype = np.float32)
            gt_labels = np.asarray(gt_labels,dtype = np.int32)

            # Iterating over the boxes and their corresponding labels
            for gt_box, gt_label in zip(gt_boxes, gt_labels):
                class_num = int(gt_label)
                if class_num == 0:
                    # Background which is not needed
                    continue

                if image_id not in ground_truth_per_class[class_num]:
                    # Initial creation of the images records
                    ground_truth_per_class[class_num][image_id] = {
                        "boxes": [],
                        "detected": []
                    }

                ground_truth_per_class[class_num][image_id]["boxes"].append(gt_box)
                ground_truth_per_class[class_num][image_id]["detected"].append(False)
                num_pos_per_class[class_num] = num_pos_per_class[class_num] + 1

        # Making them into numpy arrays for easier reduction
        for class_num in range(self.num_classes):
            for image_id, data in ground_truth_per_class[class_num].items():
                data['boxes'] = np.asarray(data['boxes'], dtype=np.float32)
                data['detected'] = np.asarray(data["detected"], dtype=bool)

        # Calculating the predictions per class
        pred_per_class = {c: [] for c in range(self.num_classes)}

        for image_id, pred_box, pred_scores, pred_labels in self._preds:
            pred_box = np.asarray(pred_box,dtype = np.float32)
            pred_scores = np.asarray(pred_scores,dtype = np.float32)
            pred_labels = np.asarray(pred_labels,dtype = np.int32)

            # Iterating through all the predictions
            for bbox, score, label in zip(pred_box, pred_scores, pred_labels):
                class_num = int(label)
                if class_num == 0:
                    # Background which is not needed
                    continue

                pred_per_class[class_num].append({"image_id": image_id, "box": bbox, "score": float(score)})


        # Now calculating AP per class after creation of the data
        ap_per_class = {}

        for class_num in range(1, self.num_classes):
            preds_for_class = pred_per_class[class_num]
            num_pos = num_pos_per_class[class_num]

            if num_pos == 0:
                # There was no ground truth box for this class
                continue

            if len(preds_for_class) == 0:
                ap_per_class[class_num] = 0.0
                continue

            # Sorting the predictions by score
            preds_for_class.sort(key = lambda data: data['score'],reverse = True)

            TP = np.zeros(len(preds_for_class), dtype = np.float32)
            FP = np.zeros(len(preds_for_class), dtype = np.float32)

            for index, pred in enumerate(preds_for_class):
                image_id = pred['image_id']
                bbox = np.asarray(pred['box'], dtype = np.float32)

                if image_id not in ground_truth_per_class[class_num]:
                    FP[index] = 1.0
                    continue

                ground_truth_data = ground_truth_per_class[class_num][image_id]
                ground_truth_boxes = ground_truth_data['boxes']
                detected = ground_truth_data['detected']

                iou_matrix = box_iou_xyxy(bbox, ground_truth_boxes)
                max_iou_per_index = int(np.argmax(iou_matrix)) if iou_matrix.size > 0 else -1
                max_iou = iou_matrix[max_iou_per_index] if iou_matrix.size > 0 else 0.0

                if max_iou >= iou_thr and not detected[max_iou_per_index]:
                    TP[index] = 1.0
                    detected[max_iou_per_index] = True
                else:
                    FP[index] = 1.0

            TP_cum = np.cumsum(TP)
            FP_cum = np.cumsum(FP)

            # Calculating the recall and precision
            recall = TP_cum / float(num_pos)
            precision = TP_cum / np.maximum(TP_cum + FP_cum, 1e-6) # division by zero safe guard

            # calculate the AP
            ap = self._voc_ap(recall,precision)
            ap_per_class[class_num] = float(ap)

        valid_aps = [ap for c, ap in ap_per_class.items() if num_pos_per_class[c] > 0]

        if len(valid_aps) == 0.0:
            mAP = 0.0
        else:
            mAP = float(np.mean(valid_aps))

        results = {
            f"mAP@{iou_thr}": mAP,
        }

        for c, ap in ap_per_class.items():
            if num_pos_per_class[c] > 0:
                results[f"mAP@{iou_thr}/class_{c}"] = ap

        return results

    def _voc_ap(self, recall: np.ndarray, precision: np.ndarray):
        mrec = np.concatenate(([0.0], recall, [1.0]))
        mpre = np.concatenate(([0.0], precision, [0.0]))

        for i in range(mpre.size - 1, 0, -1):
            mpre[i - 1] = max(mpre[i - 1], mpre[i])

        idx = np.where(mrec[1:] != mrec[:-1])[0]

        ap = 0.0
        for i in idx:
            ap += (mrec[i + 1] - mrec[i]) * mpre[i + 1]

        return ap


In [7]:
class COCOMAP(BaseMetric):
    def __init__(self, iou_thresh: list[float] | None,num_classes: int,name: str):
        super().__init__(name = name)
        self.num_classes = num_classes

        if iou_thresh is None:
            iou_thresh = [0.5 + 0.05 * i for i in range(10)]

        if isinstance(iou_thresh, (list, tuple)):
            self.iou_thresh = [float(t) for t in iou_thresh]
        else:
            self.iou_thresh = [float(iou_thresh)]
        
        # Initialize the pred & gt lists
        self._preds = []
        self._ground_truth = []

    def reset(self):
        self._preds = []
        self._ground_truth = []

    def update(self,preds,ground_truth):
        for pred in preds:
            self._preds.append(
                (pred['image_id'], pred['boxes'], pred['scores'], pred['labels'])
            )

        for gt in ground_truth:
            self._ground_truth.append(
                (gt['image_id'], gt['boxes'], gt['labels'])
            )

    def compute(self):
        if len(self._ground_truth) == 0:
            # No GT at all: define all metrics as 0
            key = f"{self.name}/mAP@[{self.iou_thresh[0]:.2f}:{self.iou_thresh[-1]:.2f}]"
            return {key: 0.0}

        num_iou = len(self.iou_thresh)

        # GT Structures per class
        ground_truth_per_class = {c: {} for c in range(self.num_classes)}
        num_pos_per_class = {c: 0 for c in range(self.num_classes)}

        for image_id, gt_boxes, gt_labels in self._ground_truth:
            # Copying the boxes for calculations
            gt_boxes = np.asarray(gt_boxes,dtype = np.float32)
            gt_labels = np.asarray(gt_labels,dtype = np.int32)

            # Iterating over the boxes and their corresponding labels
            for gt_box, gt_label in zip(gt_boxes, gt_labels):
                class_num = int(gt_label)
                if class_num == 0:
                    # Background which is not needed
                    continue

                if image_id not in ground_truth_per_class[class_num]:
                    # Initial creation of the images records
                    ground_truth_per_class[class_num][image_id] = {
                        "boxes": [],
                    }

                ground_truth_per_class[class_num][image_id]["boxes"].append(gt_box)
                num_pos_per_class[class_num] = num_pos_per_class[class_num] + 1

        for class_num in range(self.num_classes):
            for image_id, data in ground_truth_per_class[class_num].items():
                data['boxes'] = np.asarray(data['boxes'], dtype=np.float32)
                
        # Calculating the predictions per class
        pred_per_class = {c: [] for c in range(self.num_classes)}

        for image_id, pred_box, pred_scores, pred_labels in self._preds:
            pred_box = np.asarray(pred_box,dtype = np.float32)
            pred_scores = np.asarray(pred_scores,dtype = np.float32)
            pred_labels = np.asarray(pred_labels,dtype = np.int32)

            # Iterating through all the predictions
            for bbox, score, label in zip(pred_box, pred_scores, pred_labels):
                class_num = int(label)
                if class_num == 0:
                    # Background which is not needed
                    continue

                pred_per_class[class_num].append({"image_id": image_id, "box": bbox, "score": float(score)})

        AP = np.full((self.num_classes, num_iou), np.nan, dtype=np.float32)

        for class_num in range(1, self.num_classes):
            preds_for_class = pred_per_class[class_num]
            num_pos = num_pos_per_class[class_num]

            if num_pos == 0:
                # There was no ground truth box for this class
                continue

            if len(preds_for_class) == 0:
                ap_per_class[class_num] = 0.0
                continue

            # Sorting the predictions by score
            preds_for_class.sort(key = lambda data: data['score'],reverse = True)

            for index, iou_thr in enumerate(self.iou_thresh):
                detected_flags = {}
                
                for image_id, data in ground_truth_per_class[class_num].items():
                    num_gt = data["boxes"].shape[0]
                    detected_flags[image_id] = np.zeros(num_gt, dtype=bool)

                TP = np.zeros(len(preds_for_class), dtype = np.float32)
                FP = np.zeros(len(preds_for_class), dtype = np.float32)

                for pred_index, pred in enumerate(preds_for_class):
                    image_id = pred['image_id']
                    bbox = np.asarray(pred['box'], dtype = np.float32)

                    if image_id not in ground_truth_per_class[class_num]:
                        FP[pred_index] = 1.0
                        continue

                    ground_truth_data = ground_truth_per_class[class_num][image_id]
                    ground_truth_boxes = ground_truth_data['boxes']
                    det_flags = detected_flags[image_id]

                    iou_matrix = box_iou_xyxy(bbox, ground_truth_boxes)
                    if iou_matrix.size == 0:
                        FP[pred_index] = 1.0
                        continue

                    max_iou_index = int(np.argmax(iou_matrix))
                    max_iou = float(iou_matrix[max_iou_index])

                    if max_iou >= iou_thr and not det_flags[max_iou_index]:
                        TP[pred_index] = 1.0
                        det_flags[max_iou_index] = True
                    else:
                        FP[pred_index] = 1.0

                TP_cum = np.cumsum(TP)
                FP_cum = np.cumsum(FP)

                recall = TP_cum / float(num_pos)
                precision = TP_cum / np.maximum(TP_cum + FP_cum, 1e-6) # division by zero safe guard

                AP[class_num,index] = self._coco_ap_101(recall, precision)
                
        valid_classes  = [c for c in range(1, self.num_classes) if num_pos_per_class[c] > 0]

        if len(valid_classes) == 0:
            mAP = 0.0
            ap50 = 0.0
            ap75 = 0.0
        else:
            AP_valid = AP[valid_classes, :]
            mAP = float(np.nanmean(AP_valid))
            
            ap50 = None
            ap75 = None

            if 0.5 in self.iou_thresh:
                t50_idx = self.iou_thresh.index(0.5)
                ap50 = float(np.nanmean(AP_valid[:, t50_idx]))
            if 0.75 in self.iou_thresh:
                t75_idx = self.iou_thresh.index(0.75)
                ap75 = float(np.nanmean(AP_valid[:, t75_idx]))

        key_main = f"{self.name}/mAP@[{self.iou_thresh[0]:.2f}:{self.iou_thresh[-1]:.2f}]"
        results: dict[str, float] = {key_main: mAP}

        if ap50 is not None:
            results[f"{self.name}/AP@0.50"] = ap50
        if ap75 is not None:
            results[f"{self.name}/AP@0.75"] = ap75

        return results

    def _coco_ap_101(self, recall: np.ndarray, precision: np.ndarray):
        
        if recall.size == 0:
            return 0.0

        rec = np.asarray(recall, dtype=np.float32)
        prec = np.asarray(precision, dtype=np.float32)

        recall_samples = np.linspace(0.0, 1.0, 101, dtype=np.float32)
        precisions_interp = np.zeros_like(recall_samples)

        for i, r in enumerate(recall_samples):
            # precision at recall >= r
            mask = rec >= r
            if np.any(mask):
                precisions_interp[i] = np.max(prec[mask])
            else:
                precisions_interp[i] = 0.0

        return float(np.mean(precisions_interp))

## Implemented The Manager

In [8]:
class MetricsManager:
    def __init__(self, metrics:list[BaseMetric], prefix: str | None = None):
        self.metrics = metrics
        self.prefix = prefix or ""

    def update(self, pred: dict[str,Any], ground_truth: dict[str,Any]):
        for metric in self.metrics:
            metric.update(pred,ground_truth)

    def compute(self):
        combined: dict[str,float] = {}
        for metric in self.metrics:
            stat = metric.compute()
            for key,value in stat.items():
                k = f"{self.prefix}{metric.name}/{key}"
                combined[k] = value
        return combined
        
    def reset(self):
        for metric in self.metrics:
            metric.reset()

# Implemented Factory Pattern Using Function

In [9]:
def build_metrics_config(config: dict):

    eval_cfg = config["eval"]
    metric_cfg = eval_cfg["metrics"]
    num_classes = config["model"]["num_classes"]
    
    metrics: list[BaseMetric] = []

    for metric_name, mc in metric_cfg.items():
        metric_type = mc.get("type", "voc_ap")
        if metric_type == "voc_ap":
            metrics.append(
                VOCMAP(
                    iou_thresh=mc.get("iou_thresholds", [0.5]),
                    num_classes=num_classes,
                    name=metric_name,  # use key from YAML
                )
            )

        elif metric_type == "coco_map":
            metrics.append(
                COCOMAP(
                    iou_thresh=mc.get("iou_thresholds",
                                          [0.5, 0.55, 0.6, 0.65,
                                           0.7, 0.75, 0.8, 0.85,
                                           0.9, 0.95]),
                    num_classes=num_classes,
                    name=metric_name,
                )
            )

        else:
            raise ValueError(f"Unknown metric type: {metric_type!r} for {metric_name!r}")

    return MetricsManager(metrics=metrics, prefix=f"{eval_cfg['dataset_split']}/")
    

In [10]:
from mobilenetv2ssd.core.config import load_config

In [11]:
main_cfg_path = "configs/train/default.yaml"
model_cfg_path = "configs/model/mobilenetv2_ssd_voc.yaml"
data_cfg_path = "configs/data/voc_224.yaml"
eval_cfg_path = "configs/eval/default.yaml"

## Tests

In [12]:
config = load_config(main_cfg_path,model_cfg_path,data_cfg_path,eval_cfg_path)

In [13]:
config['eval']

{'dataset_split': 'val',
 'nms': {'iou_threshold': 0.5,
  'score_threshold': 0.05,
  'max_detections_per_image': 100,
  'max_detections_per_class': 50},
 'metrics': {'voc_ap_50': {'type': 'voc_ap',
   'iou_thresholds': [0.5, 0.75],
   'use_07_metric': False},
  'coco_map': {'type': 'coco_map',
   'iou_thresholds': [0.5, 0.75],
   'use_07_metric': False}},
 'visualization': {'enabled': False,
  'max_images': 16,
  'output_dir': '/mnt/d/dev/MobileNetV2-SSD/eval_vis'}}

In [14]:
metrics_manager = build_metrics_config(config)

In [15]:
metrics_manager.compute()

{'val/voc_ap_50/mAP@0.5': 0.0,
 'val/voc_ap_50/mAP@0.75': 0.0,
 'val/coco_map/coco_map/mAP@[0.50:0.75]': 0.0}

In [16]:
gts = [
    {
        "image_id": "img1",
        "boxes":  [[0.0, 0.0, 100.0, 100.0]],  # one GT box
        "labels": [1],
    }
]

# One prediction exactly on top of GT
preds = [
    {
        "image_id": "img1",
        "boxes":  [[0.0, 0.0, 100.0, 100.0]],  # same as GT
        "scores": [0.9],
        "labels": [1],
    }
]

In [17]:
metrics_manager.update(preds,gts)

In [18]:
metrics_manager.compute()

{'val/voc_ap_50/mAP@0.5': 1.0,
 'val/voc_ap_50/mAP@0.5/class_1': 1.0,
 'val/voc_ap_50/mAP@0.75': 1.0,
 'val/voc_ap_50/mAP@0.75/class_1': 1.0,
 'val/coco_map/coco_map/mAP@[0.50:0.75]': 1.0,
 'val/coco_map/coco_map/AP@0.50': 1.0,
 'val/coco_map/coco_map/AP@0.75': 1.0}

In [19]:
gts = [
    {
        "image_id": "img1",
        "boxes":  [[0.0, 0.0, 100.0, 100.0]],
        "labels": [1],
    }
]

preds = [
    {
        "image_id": "img1",
        "boxes": [
            [200.0, 200.0, 300.0, 300.0],  # FP: no overlap with GT
            [0.0,   0.0,   100.0, 100.0],  # TP: exact match
        ],
        "scores": [0.9, 0.8],  # FP ranked above TP
        "labels": [1, 1],
    }
]


In [20]:
metrics_manager = build_metrics_config(config)
metrics_manager.update(preds,gts)
metrics_manager.compute()

{'val/voc_ap_50/mAP@0.5': 0.5,
 'val/voc_ap_50/mAP@0.5/class_1': 0.5,
 'val/voc_ap_50/mAP@0.75': 0.5,
 'val/voc_ap_50/mAP@0.75/class_1': 0.5,
 'val/coco_map/coco_map/mAP@[0.50:0.75]': 0.5,
 'val/coco_map/coco_map/AP@0.50': 0.5,
 'val/coco_map/coco_map/AP@0.75': 0.5}

In [21]:
gts = [
    # img1: cat + dog
    {
        "image_id": "img1",
        "boxes":  [
            [0.0,   0.0,   100.0, 100.0],   # cat
            [150.0, 150.0, 250.0, 250.0],   # dog
        ],
        "labels": [1, 2],
    },
    # img2: only dog
    {
        "image_id": "img2",
        "boxes":  [
            [50.0, 50.0, 150.0, 150.0],    # dog
        ],
        "labels": [2],
    },
]

preds = [
    # Predictions for img1
    {
        "image_id": "img1",
        "boxes": [
            [0.0,   0.0,   100.0, 100.0],   # good cat (TP)
            [160.0, 160.0, 260.0, 260.0],   # decent dog (TP, IoU > 0.5-ish)
            [300.0, 300.0, 400.0, 400.0],   # random FP (no GT)
        ],
        "scores": [0.9, 0.8, 0.3],
        "labels": [1,   2,   1],           # last box wrongly predicted as cat
    },
    # Predictions for img2
    {
        "image_id": "img2",
        "boxes": [
            [60.0, 60.0, 140.0, 140.0],     # decent dog (TP)
        ],
        "scores": [0.95],
        "labels": [2],
    },
]


In [22]:
metrics_manager = build_metrics_config(config)
metrics_manager.update(preds,gts)
metrics_manager.compute()

{'val/voc_ap_50/mAP@0.5': 1.0,
 'val/voc_ap_50/mAP@0.5/class_1': 1.0,
 'val/voc_ap_50/mAP@0.5/class_2': 1.0,
 'val/voc_ap_50/mAP@0.75': 0.5,
 'val/voc_ap_50/mAP@0.75/class_1': 1.0,
 'val/voc_ap_50/mAP@0.75/class_2': 0.0,
 'val/coco_map/coco_map/mAP@[0.50:0.75]': 0.75,
 'val/coco_map/coco_map/AP@0.50': 1.0,
 'val/coco_map/coco_map/AP@0.75': 0.5}

## Implemented The Adapter Pattern to Convert Prediction To Metric Input Format

In [23]:
def convert_predictions_to_metric_format(nmsed_boxes, nmsed_scores, nmsed_classes, image_id, gt_boxes_xyxy, gt_labels, gt_valid_mask):
    preds = []
    gt = []

    boxes = nmsed_boxes
    scores = nmsed_scores
    labels = nmsed_classes

    gt_boxes = gt_boxes_xyxy
    gt_labels = gt_labels
    gt_masks = gt_valid_mask

    image_ids = image_id

    B = tf.shape(boxes)[0]

    for i in range(B):
        img_id = image_ids[i]

        # Prediction boxes
        pred_box = boxes[i]
        pred_scores = scores[i]
        pred_labels = labels[i]

        preds.append({"image_id": img_id.numpy(),"boxes":   pred_box.numpy(), "scores":  pred_scores.numpy(),"labels":  pred_labels.numpy()})

        # Ground truth box
        gt_box = gt_boxes[i]
        gt_label = gt_labels[i]
        gt_mask = gt_masks[i]

        gt_box = tf.boolean_mask(gt_box,gt_mask)
        gt_label = tf.boolean_mask(gt_labels,gt_mask)

        gt.append({"image_id": img_id.numpy(), "boxes":   gt_box.numpy(), "labels":  gt_label.numpy()})


    return preds, gt
        
    

In [24]:
# NOTE: PREDS ARE IN XY COORDINATES
boxes = tf.constant([
        # image 0: one det
        [[10.0, 10.0, 50.0, 50.0],    # det0
         [0.0, 0.0, 0.0, 0.0]],       # padding
        # image 1: one det
        [[70.0, 70.0, 120.0, 120.0],  # det0 (far away from GT)
         [0.0, 0.0, 0.0, 0.0]],       # padding
    ], dtype=tf.float32)   

scores = tf.constant([
        [0.9, 0.0],   # image 0
        [0.8, 0.0],   # image 1
    ], dtype=tf.float32)

labels = tf.constant([
        [1, 0],       # image 0: first det is class 1, second padding
        [1, 0],       # image 1: first det is class 1, second padding
    ], dtype=tf.int32)

I0000 00:00:1765318017.918077   33985 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1765318018.004198   33985 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1765318018.004260   33985 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1765318018.006057   33985 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1765318018.006142   33985 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [25]:
image_id = tf.constant(["101", "102"], dtype=tf.string)

gt_boxes = tf.constant([
        # image 0: one GT box
        [[10.0, 10.0, 50.0, 50.0],
         [0.0, 0.0, 0.0, 0.0]],
        # image 1: one GT box
        [[10.0, 10.0, 50.0, 50.0],
         [0.0, 0.0, 0.0, 0.0]],
    ], dtype=tf.float32)

gt_labels =  tf.constant([
        [1, 0],   # image 0: one GT of class 1
        [1, 0],   # image 1
    ], dtype=tf.int32)

gt_mask =  tf.constant([
        [True, False],   # image 0: only first GT valid
        [True, False],   # image 1
    ], dtype=tf.bool)

In [26]:
convert_predictions_to_metric_format(boxes, scores, labels, image_id, gt_boxes, gt_labels, gt_mask)

([{'image_id': b'101',
   'boxes': array([[10., 10., 50., 50.],
          [ 0.,  0.,  0.,  0.]], dtype=float32),
   'scores': array([0.9, 0. ], dtype=float32),
   'labels': array([1, 0], dtype=int32)},
  {'image_id': b'102',
   'boxes': array([[ 70.,  70., 120., 120.],
          [  0.,   0.,   0.,   0.]], dtype=float32),
   'scores': array([0.8, 0. ], dtype=float32),
   'labels': array([1, 0], dtype=int32)}],
 [{'image_id': b'101',
   'boxes': array([[10., 10., 50., 50.]], dtype=float32),
   'labels': array([[1, 0]], dtype=int32)},
  {'image_id': b'102',
   'boxes': array([[10., 10., 50., 50.]], dtype=float32),
   'labels': array([[1, 0]], dtype=int32)}])