In [1]:
import pandas as pd
from typing import Iterable, Literal, overload
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2 as cv
import numpy as np
import math
import os

In [2]:
# Set in its own cell so the variable is already in the environment when
# TensorFlow is imported by the next cell.
# NOTE(review): presumably selects the asynchronous CUDA allocator to limit
# GPU memory fragmentation -- confirm against the TensorFlow GPU guide.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [3]:
import tensorflow as tf

2025-12-02 23:44:58.567802: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-02 23:44:58.589411: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-02 23:44:58.597576: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-02 23:44:58.614552: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Sanitize the ground truth boxes

In [4]:
def _sanitize_boxes_xyxy(boxes_normalized: tf.Tensor):
    """Reorder box corners and clamp every coordinate to the range [0, 1].

    Args:
        boxes_normalized: Tensor whose last axis holds the four values
            ``[x_min, y_min, x_max, y_max]``. The two x values (and the two
            y values) may arrive in either order.

    Returns:
        A tensor with the same last-axis layout in which, per box, the
        smaller x / y comes first and the larger x / y comes last, and all
        four values are clipped to ``[0, 1]``.
    """
    x_a, y_a, x_b, y_b = tf.split(boxes_normalized, num_or_size_splits=4, axis=-1)

    # Restore the min <= max relationship on each axis.
    ordered_coords = [
        tf.minimum(x_a, x_b),
        tf.minimum(y_a, y_b),
        tf.maximum(x_a, x_b),
        tf.maximum(y_a, y_b),
    ]

    # Clamp after ordering so every value ends up inside the unit square.
    clipped_coords = [tf.clip_by_value(coord, 0, 1) for coord in ordered_coords]

    return tf.concat(clipped_coords, axis=-1)

In [5]:
# Case 1: both axes have their min/max swapped (x: 0.8 > 0.2, y: 0.8 > 0.3),
# all values already inside [0, 1].
boxes_1 = tf.constant([0.8, 0.8, 0.2, 0.3])

I0000 00:00:1764737100.196358    1367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764737100.284421    1367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764737100.284486    1367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764737100.285686    1367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764737100.285749    1367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [6]:
# Expect the corners reordered to [0.2, 0.3, 0.8, 0.8] with no clipping.
_sanitize_boxes_xyxy(boxes_1)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.2, 0.3, 0.8, 0.8], dtype=float32)>

In [7]:
# Case 2: corners are correctly ordered but three values fall outside [0, 1].
boxes_2 = tf.constant([-0.1, 0.4, 1.2, 1.3])

In [8]:
# Expect out-of-range values clamped: [0.0, 0.4, 1.0, 1.0].
_sanitize_boxes_xyxy(boxes_2)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0. , 0.4, 1. , 1. ], dtype=float32)>

In [9]:
# Case 3: only the y pair is swapped (0.9 > 0.2); the x pair is already ordered.
boxes_3 = tf.constant([0.2, 0.9, 0.7, 0.2])

In [10]:
# Expect only the y values exchanged: [0.2, 0.2, 0.7, 0.9].
_sanitize_boxes_xyxy(boxes_3)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.2, 0.2, 0.7, 0.9], dtype=float32)>

## Encode Boxes

In [11]:
def xyxy_to_cxcywh_core(boxes : tf.Tensor):
    """Convert corner-format boxes to center-format boxes.

    Args:
        boxes: Tensor whose last axis is ``[x_min, y_min, x_max, y_max]``.

    Returns:
        Tensor with the same leading shape whose last axis is
        ``[cx, cy, w, h]``: the midpoint of each axis followed by the extent
        along each axis.

    Raises:
        Whatever ``tf.debugging.assert_equal`` raises when the last
        dimension of ``boxes`` is not 4.
    """
    # Shape check (not a dtype check): the last axis must carry 4 coordinates.
    tf.debugging.assert_equal(tf.shape(boxes)[-1], 4, message="boxes last dim must be 4")

    left, top, right, bottom = tf.split(boxes, num_or_size_splits=4, axis=-1)
    lower_corner = (left, top)
    upper_corner = (right, bottom)

    # Midpoint and extent along x, then along y.
    centers = [(hi + lo)/2 for lo, hi in zip(lower_corner, upper_corner)]
    extents = [hi - lo for lo, hi in zip(lower_corner, upper_corner)]

    return tf.concat(centers + extents, axis=-1)

In [12]:
def encode_boxes_core(gt_boxes_xyxy: tf.Tensor, priors_cxcywh: tf.Tensor, variance: tuple[float,float]):
    """Encode ground-truth boxes as regression offsets relative to priors.

    For every (ground truth, prior) pair the result is::

        tx = ((gt_cx - prior_cx) / prior_w) / variance[0]
        ty = ((gt_cy - prior_cy) / prior_h) / variance[0]
        tw = log(gt_w / prior_w) / variance[1]
        th = log(gt_h / prior_h) / variance[1]

    Args:
        gt_boxes_xyxy: Ground-truth boxes, last axis
            ``[x_min, y_min, x_max, y_max]``. A box whose four values are
            all exactly 0.0 is treated as padding.
        priors_cxcywh: Prior boxes, last axis ``[cx, cy, w, h]``. Combined
            with the ground truth through ordinary element-wise
            broadcasting.
        variance: ``(center_variance, size_variance)`` scaling factors.

    Returns:
        Tensor of offsets with last axis ``[tx, ty, tw, th]``. Rows that
        correspond to padded ground-truth boxes are all zeros.

    Note:
        Widths and heights of both inputs are clamped to at least 1e-8
        before dividing / taking the log. A ground-truth box with zero width
        or height that is *not* all-zero is not masked, so its size offset
        is ``log(1e-8 / prior_size) / variance[1]``.
    """
    # Ground truth arrives as corners; the offsets are defined on centers/sizes.
    g_cx, g_cy, g_w, g_h = tf.split(
        xyxy_to_cxcywh_core(gt_boxes_xyxy), num_or_size_splits=4, axis=-1
    )
    p_cx, p_cy, p_w, p_h = tf.split(priors_cxcywh, num_or_size_splits=4, axis=-1)

    center_scale = variance[0]
    size_scale = variance[1]

    # Floor every width/height so neither the division nor the log can
    # produce inf / -inf / nan for zero-sized boxes.
    floor = tf.constant(1e-8, dtype=gt_boxes_xyxy.dtype)
    g_w, g_h, p_w, p_h = (tf.maximum(floor, side) for side in (g_w, g_h, p_w, p_h))

    # Offsets in [tx, ty, tw, th] order.
    offsets = tf.concat(
        [
            ((g_cx - p_cx)/p_w) / center_scale,
            ((g_cy - p_cy)/p_h) / center_scale,
            (tf.math.log(g_w/p_w)) / size_scale,
            (tf.math.log(g_h/p_h)) / size_scale,
        ],
        axis=-1,
    )

    # An all-zero ground-truth row marks padding; keep the reduced axis so the
    # mask broadcasts across the four offset components.
    is_padding = tf.reduce_all(gt_boxes_xyxy == 0.0, axis=-1, keepdims=True)

    # Padded rows get zero offsets, all others keep the computed values.
    return tf.where(is_padding, tf.zeros_like(offsets), offsets)

In [13]:
# Case 1: the ground-truth box coincides with the prior
# (center (0.5, 0.5), size 0.2 x 0.2), so every offset should be ~0.
priors_cxcywh = tf.constant([0.5, 0.5, 0.2, 0.2], dtype = tf.float32)
gt_xyxy = tf.constant([0.4, 0.4, 0.6, 0.6], dtype = tf.float32)
variance = (0.1,0.2)

In [14]:
# Expect all four offsets to be zero up to float32 rounding.
encode_boxes_core(gt_xyxy,priors_cxcywh,variance)

<tf.Tensor: shape=(4,), dtype=float32, numpy=
array([0.000000e+00, 0.000000e+00, 5.960464e-07, 5.960464e-07],
      dtype=float32)>

In [15]:
# Case 2: same size as the prior but shifted +0.04 along x,
# so tx should be (0.04 / 0.2) / 0.1 = 2 and the rest ~0.
priors_cxcywh = tf.constant([0.5, 0.5, 0.2, 0.2], dtype = tf.float32)
gt_xyxy = tf.constant([0.44, 0.4, 0.64, 0.6], dtype = tf.float32)
variance = (0.1,0.2)

In [16]:
# Expect tx ~= 2 and ty, tw, th ~= 0 (float32 rounding aside).
encode_boxes_core(gt_xyxy,priors_cxcywh,variance)

<tf.Tensor: shape=(4,), dtype=float32, numpy=
array([ 1.9999981e+00,  0.0000000e+00, -2.9802322e-07,  5.9604639e-07],
      dtype=float32)>

In [17]:
# Case 3: same center as the prior but twice its width and height,
# so tw = th = ln(2) / 0.2 ~= 3.4657 and tx = ty = 0.
priors_cxcywh = tf.constant([0.5, 0.5, 0.2, 0.2], dtype = tf.float32)
gt_xyxy = tf.constant([0.3, 0.3, 0.7, 0.7], dtype = tf.float32)
variance = (0.1,0.2)

In [18]:
# Expect zero center offsets and positive size offsets of ~3.4657.
encode_boxes_core(gt_xyxy,priors_cxcywh,variance)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([0.       , 0.       , 3.4657357, 3.4657357], dtype=float32)>

In [19]:
# Case 4: same center as the prior but half its width and height,
# so tw = th = ln(0.5) / 0.2 ~= -3.4657 and tx = ty = 0.
priors_cxcywh = tf.constant([0.5, 0.5, 0.2, 0.2], dtype = tf.float32)
gt_xyxy = tf.constant([0.45, 0.45, 0.55, 0.55], dtype = tf.float32)
variance = (0.1,0.2)

In [20]:
# Expect zero center offsets and negative size offsets of ~-3.4657.
encode_boxes_core(gt_xyxy,priors_cxcywh,variance)

<tf.Tensor: shape=(4,), dtype=float32, numpy=array([ 0.       ,  0.       , -3.4657347, -3.4657347], dtype=float32)>

In [21]:
# Case 5: several ground-truth boxes (shape (4, 4)) encoded against one prior
# (shape (4,)), which broadcasts across the rows. The last row is an all-zero
# padding box and should come back as zero offsets.
priors_cxcywh = tf.constant([0.5, 0.5, 0.2, 0.2], dtype = tf.float32)
gt_xyxy =  tf.constant([
    [0.10, 0.10, 0.30, 0.30],  # GT 0  (class e.g. 3)
    [0.55, 0.55, 0.85, 0.85],  # GT 1  (class e.g. 2)
    [0.20, 0.50, 0.40, 0.80],  # GT 2  (class e.g. 5)
    [0.00, 0.00, 0.00, 0.00],  # padded
], dtype=tf.float32)
variance = (0.1,0.2)

In [22]:
# Expect one offset row per ground-truth box, with the padded (last) row all zeros.
encode_boxes_core(gt_xyxy,priors_cxcywh,variance)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[-1.5000000e+01, -1.5000000e+01,  5.9604639e-07,  5.9604639e-07],
       [ 1.0000002e+01,  1.0000002e+01,  2.0273256e+00,  2.0273256e+00],
       [-9.9999990e+00,  7.4999986e+00,  0.0000000e+00,  2.0273256e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],
      dtype=float32)>

## Encoding On A Batch Level

In [23]:
def encode_boxes_batch(matched_gt_xyxy: tf.Tensor,priors_cxcywh: tf.Tensor, variances: tuple[float,float]):
    """Encode a whole batch of matched ground-truth boxes against the priors.

    Args:
        matched_gt_xyxy: Tensor whose first axis is the batch and whose last
            axis is ``[x_min, y_min, x_max, y_max]``; per image it holds the
            ground-truth box matched to each prior. All-zero rows are
            treated as padding / background by ``encode_boxes_core``.
        priors_cxcywh: Prior boxes with last axis ``[cx, cy, w, h]``. Must be
            broadcastable against ``matched_gt_xyxy`` (e.g. one row per
            prior, shared by every image).
        variances: ``(center_variance, size_variance)`` forwarded to
            ``encode_boxes_core``.

    Returns:
        Tensor shaped like ``matched_gt_xyxy`` with last axis
        ``[tx, ty, tw, th]``; padded rows are all zeros.
    """
    # encode_boxes_core only splits/reduces along the last axis and otherwise
    # uses broadcasting element-wise ops, so the batch axis can be handled in a
    # single vectorized call. This replaces the previous per-image tf.map_fn,
    # which ran images one at a time and hard-coded a float32 output signature
    # (rejecting other float dtypes), and drops the unused batch-size local.
    return encode_boxes_core(matched_gt_xyxy, priors_cxcywh, variances)

In [24]:
# Batch-level fixture: 2 images x 3 priors x 4 coordinates of matched
# ground-truth boxes, plus the 3 priors (center format) shared by both images.
# All-zero rows stand for priors that were not matched to any ground truth.
matched_gt_xyxy_batch = tf.constant(
[
    # Image 0: (N=3 priors)
    [
        [0.1, 0.1, 0.3, 0.3],  # prior 0 matched to GT box A
        [0.6, 0.6, 0.9, 0.9],  # prior 1 matched to GT box B
        [0.0, 0.0, 0.0, 0.0],  # prior 2 background (padded)
    ],
    # Image 1:
    [
        [0.0, 0.0, 0.0, 0.0],  # prior 0 background
        [0.6, 0.6, 0.9, 0.9],  # prior 1 matched to GT box B
        [0.6, 0.6, 0.9, 0.9],  # prior 2 also matched to GT box B
    ],
],
dtype=tf.float32)
priors = tf.constant([
    [0.2, 0.2, 0.2, 0.2],  # prior 0
    [0.7, 0.7, 0.3, 0.3],  # prior 1
    [0.5, 0.5, 0.4, 0.4],  # prior 2
], dtype=tf.float32) 

In [25]:
# Expect a (2, 3, 4) tensor of offsets in which the background rows
# (image 0 / prior 2 and image 1 / prior 0) are all zeros.
encode_boxes_batch(matched_gt_xyxy_batch,priors,(0.1, 0.2))

<tf.Tensor: shape=(2, 3, 4), dtype=float32, numpy=
array([[[ 0.0000000e+00,  0.0000000e+00,  5.9604639e-07,  5.9604639e-07],
        [ 1.6666670e+00,  1.6666670e+00, -8.9406973e-07, -8.9406973e-07],
        [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],

       [[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 1.6666670e+00,  1.6666670e+00, -8.9406973e-07, -8.9406973e-07],
        [ 6.2500000e+00,  6.2500000e+00, -1.4384111e+00, -1.4384111e+00]]],
      dtype=float32)>