In [1]:
import pandas as pd
from typing import Iterable, Literal, overload
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2 as cv
import numpy as np
import math
import os

In [2]:
# Select the GPU allocator before the TensorFlow import in the next cell.
os.environ.update({'TF_GPU_ALLOCATOR': 'cuda_malloc_async'})

In [3]:
import tensorflow as tf

2025-12-01 20:19:09.114728: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-01 20:19:09.206220: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-01 20:19:09.221286: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-01 20:19:09.297126: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Localization Head

In [4]:
class LocalizationHead(tf.keras.Model):
    """Box-regression head applied to a list of feature maps.

    Each feature map is passed through, in order: an optional initial
    normalization (first feature map only), an optional 1x1 "squeeze"
    convolution, an optional intermediate convolution, and a prediction
    convolution with ``num_anchors * 4`` output channels. Every per-level
    prediction is reshaped to ``(B, H * W * num_anchors, 4)`` and the levels
    are concatenated along axis 1.

    Sub-layers are created lazily in :meth:`build`, once the channel count of
    every feature map is known.

    Args:
        name: Model name; also used as prefix for the generated layer names.
        num_anchors_per_location: Number of anchors per spatial location, one
            entry per feature map (indexed in the same order as the inputs).
        **kwargs: Optional configuration keys read by this class:
            ``head_type`` -- ``"conv3x3"`` (default) or ``"depthwise"``
            (``"dw"`` is accepted as an alias).
            ``initial_norm_strategy`` -- ``"BatchNorm"`` (default) or
            ``"Norm"``; any other value disables the initial normalization.
            ``squeeze_ratio`` -- channel multiplier of the squeeze conv;
            ``1.0`` (default) disables the squeeze stage.
            ``intermediate_conv`` -- output channels of the intermediate conv;
            ``1`` (default) disables the intermediate stage.
            Any other key is ignored.
    """

    # Accepted spellings for the depthwise-separable head.
    _DEPTHWISE_HEAD_TYPES = ("depthwise", "dw")

    def __init__(self,name: str, num_anchors_per_location: list[int], **kwargs):
        super().__init__(name=name)

        # Default mirrors the default head type of the classification head.
        self.head_type = kwargs.get('head_type', 'conv3x3')
        self.heads = []
        self.num_anchors_per_layer = num_anchors_per_location

        self.initial_norm = self.make_normalization(kwargs.get('initial_norm_strategy', "BatchNorm"))

        self.squeeze_heads = []
        self.squeeze_ratio = kwargs.get('squeeze_ratio',1.0)

        # A value of 1 is the sentinel for "no intermediate conv".
        self.intermediate_channels = kwargs.get('intermediate_conv',1)
        self.intermediate_heads = None if self.intermediate_channels == 1 else []

    def call(self,feature_maps,training = False):
        """Run every feature map through its head and concatenate the results.

        Args:
            feature_maps: Iterable of 4-D tensors, one per pyramid level.
            training: Forwarded to every sub-layer.

        Returns:
            Tensor of shape ``(B, sum_i(H_i * W_i * A_i), 4)``.
        """
        outputs = []
        for layer, feature_map in enumerate(feature_maps):
            num_anchors = self.num_anchors_per_layer[layer]
            x = feature_map

            # The initial normalization is applied to the first feature map only.
            if self.initial_norm is not None and layer == 0:
                x = self.initial_norm(x,training = training)

            # Squeeze layer (1x1 conv that scales the channel count).
            if self.squeeze_ratio != 1.0:
                x = self.squeeze_heads[layer](x,training = training)

            # Intermediate conv.
            if self.intermediate_heads is not None:
                x = self.intermediate_heads[layer](x, training=training)

            # Prediction conv: num_anchors * 4 channels per location.
            x = self.heads[layer](x,training = training)

            # Flatten (B, H, W, A * 4) -> (B, H * W * A, 4) using runtime dims.
            dims = tf.shape(x)
            x = tf.reshape(x, [dims[0], dims[1] * dims[2] * num_anchors, 4])

            outputs.append(x)

        return tf.concat(outputs,axis=1)

    def build(self,input_shape):
        """Create the per-level squeeze, intermediate and prediction layers.

        Args:
            input_shape: Sequence of feature-map shapes; the last entry of
                each shape is read as the channel count.

        Raises:
            ValueError: If more feature maps are supplied than anchor counts
                were configured, or if ``head_type`` is not supported.
        """
        # Guard against a second build() appending duplicate heads.
        if self.heads:
            return

        if len(input_shape) > len(self.num_anchors_per_layer):
            raise ValueError(
                f"{self.name}: got {len(input_shape)} feature maps but only "
                f"{len(self.num_anchors_per_layer)} anchor counts"
            )

        for layer,feature_map_shape in enumerate(input_shape):
            channel = int(feature_map_shape[-1])

            if self.squeeze_ratio != 1.0:
                # Never request a conv with zero filters for very small ratios.
                squeeze_out = max(1, int(channel * self.squeeze_ratio))
                self.squeeze_heads.append(self.make_squeeze_head(squeeze_out, index=layer))

            if self.intermediate_heads is not None:
                intermediate_head = self.make_head(self.head_type, self.intermediate_channels, layer,role="inter")
                self.intermediate_heads.append(intermediate_head)

            A_per_layer = self.num_anchors_per_layer[layer]

            pred_head = self.make_head(self.head_type, A_per_layer * 4, index = layer, role = "pred")
            self.heads.append(pred_head)

    def make_head(self,head_type: str, out_channels: int, index: int, role: str):
        """Build one convolutional head.

        Args:
            head_type: ``"conv3x3"`` for a plain 3x3 conv, or ``"depthwise"`` /
                ``"dw"`` for a depthwise 3x3 conv followed by a 1x1 conv.
            out_channels: Number of output filters.
            index: Feature-map index, used in the layer name.
            role: Short tag (e.g. ``"pred"``, ``"inter"``) used in the layer name.

        Returns:
            A Keras layer (or ``Sequential`` for the depthwise variant).

        Raises:
            ValueError: If ``head_type`` is not one of the supported values.
        """
        base = f"{self.name}_loc_{role}_{index}"
        if head_type == "conv3x3":
            return tf.keras.layers.Conv2D(filters=out_channels, kernel_size=3,padding="same",name=base)
        if head_type in self._DEPTHWISE_HEAD_TYPES:
            dw_name = f"{base}_dw"
            pw_name = f"{base}_pw"
            return tf.keras.Sequential([
                tf.keras.layers.DepthwiseConv2D(kernel_size = 3, padding="same",name=dw_name),
                tf.keras.layers.Conv2D(filters=out_channels, kernel_size=1,padding="same",name=pw_name)
            ],name=base)
        # Previously fell through and returned None, which only failed later at call time.
        raise ValueError(
            f"Unsupported head_type {head_type!r}; expected 'conv3x3', 'depthwise' or 'dw'"
        )

    def make_squeeze_head(self,out_channels: int,index: int):
        """Return the 1x1 conv used as the squeeze layer for feature map ``index``."""
        base = f"{self.name}_loc_squeeze_{index}"
        return tf.keras.layers.Conv2D(filters=out_channels, kernel_size=1,padding="same",name=base)

    def make_normalization(self, normalization_type):
        """Return the initial normalization layer, or ``None`` to disable it.

        ``"BatchNorm"`` and ``"Norm"`` select a layer; every other value
        (including ``None``) yields ``None``.
        """
        if normalization_type == "BatchNorm":
            return tf.keras.layers.BatchNormalization(name = "loc_initial_normalization")
        elif normalization_type == "Norm":
            return tf.keras.layers.Normalization(name = "initial_normalization")
        return None
        


In [5]:
# Three random feature maps; each pair below is (spatial size, channels).
num_anchors_per_location = [4, 6, 6]
B = 2

P0, P1, P2 = (
    tf.random.normal([B, size, size, channels])
    for size, channels in ((38, 256), (19, 512), (10, 512))
)

feature_maps = [P0, P1, P2]

I0000 00:00:1764638352.540003  107630 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764638352.942265  107630 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764638352.942332  107630 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764638352.946042  107630 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764638352.946129  107630 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [6]:
# Plain head: no squeeze stage, no intermediate conv.
loc_head = LocalizationHead(
    name="loc_head",
    num_anchors_per_location=num_anchors_per_location,
    head_type="conv3x3",
)

In [7]:
# First call on the head: its per-level layers are created here, then the forward pass runs.
pred_loc = loc_head(feature_maps, training=False)

2025-12-01 20:19:14.259700: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
W0000 00:00:1764638354.434906  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.506683  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.507510  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.511919  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.516779  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.522582  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.527801  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638354.531915  107630 gpu_t

In [8]:
# Runtime shape of the localization output: (batch, total anchors, 4).
tf.shape(pred_loc)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([   2, 8542,    4], dtype=int32)>

In [9]:
# Expected anchor count: sum over levels of height * width * anchors per location.
(H0, W0), (H1, W1), (H2, W2) = (38, 38), (19, 19), (10, 10)

A0, A1, A2 = num_anchors_per_location
N_expected = sum(
    height * width * anchors
    for height, width, anchors in ((H0, W0, A0), (H1, W1, A1), (H2, W2, A2))
)
print("N_expected:", N_expected)

N_expected: 8542


In [10]:
# Output must be (B, N_expected, 4): batch size, total anchors, box coordinates.
assert (
    pred_loc.shape[0] == B
    and pred_loc.shape[2] == 4
    and pred_loc.shape[1] == N_expected
)

In [11]:
# Variant with a 1x1 squeeze conv that halves the channel count of every level.
loc_head_squeezed = LocalizationHead(
    name="loc_head_squeezed",
    num_anchors_per_location=num_anchors_per_location,
    head_type="conv3x3",
    squeeze_ratio=0.5,
    in_channels=[256, 512, 512],
)

In [12]:
# The squeeze stage only changes channel width ahead of the prediction conv,
# so the output shape must match the plain head's output.
pred_loc_sq = loc_head_squeezed(feature_maps, training=False)
assert pred_loc_sq.shape == pred_loc.shape, "Assert Failed"

W0000 00:00:1764638355.070273  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.071101  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.072039  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.072957  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.073679  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.074442  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.075181  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.075774  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.076428  107630 gp

In [13]:
# Variant with a 128-channel intermediate conv in front of each prediction conv.
loc_head_intermediate = LocalizationHead(
    name="loc_head_intermediate",
    num_anchors_per_location=num_anchors_per_location,
    head_type="conv3x3",
    intermediate_conv=128,
)

In [14]:
# The intermediate conv must not change the final output shape either.
pred_loc_int = loc_head_intermediate(feature_maps, training=False)
assert pred_loc_int.shape == pred_loc.shape, "Assert Failed"

W0000 00:00:1764638355.290020  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.291040  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.292102  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.292907  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.293677  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.294480  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.295359  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.296075  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.296919  107630 gp

In [15]:
# Keras summary of the plain localization head.
loc_head.summary()

In [16]:
# Keras summary of the squeezed variant.
loc_head_squeezed.summary()

In [17]:
# Keras summary of the variant with intermediate convs.
loc_head_intermediate.summary()

In [18]:
# Tracked sub-layers: the initial normalization plus one prediction conv per feature map.
loc_head.layers

[<BatchNormalization name=loc_initial_normalization, built=True>,
 <Conv2D name=loc_head_loc_pred_0, built=True>,
 <Conv2D name=loc_head_loc_pred_1, built=True>,
 <Conv2D name=loc_head_loc_pred_2, built=True>]

In [19]:
# NOTE: this rebinds loc_head_squeezed to a brand-new instance that has not been
# called yet. Its squeeze and prediction layers are only created on the first
# call, so inspecting .layers right after this shows the normalization layer alone.
loc_head_squeezed = LocalizationHead(
    name="loc_head_squeezed",
    num_anchors_per_location=num_anchors_per_location,
    head_type="conv3x3",
    squeeze_ratio=0.5,
    in_channels=[256, 512, 512],
)

In [20]:
# This instance has not been called on any input yet, so only the layer created
# in __init__ (the initial normalization) is listed.
loc_head_squeezed.layers

[<BatchNormalization name=loc_initial_normalization, built=False>]

In [21]:
# Layers appear in creation order: per feature map, the intermediate conv then the prediction conv.
loc_head_intermediate.layers

[<BatchNormalization name=loc_initial_normalization, built=True>,
 <Conv2D name=loc_head_intermediate_loc_inter_0, built=True>,
 <Conv2D name=loc_head_intermediate_loc_pred_0, built=True>,
 <Conv2D name=loc_head_intermediate_loc_inter_1, built=True>,
 <Conv2D name=loc_head_intermediate_loc_pred_1, built=True>,
 <Conv2D name=loc_head_intermediate_loc_inter_2, built=True>,
 <Conv2D name=loc_head_intermediate_loc_pred_2, built=True>]

## Classification Head

In [22]:
class ClassificationHead(tf.keras.Layer):
    """Per-anchor classification head applied to a list of feature maps.

    Each feature map is passed through, in order: an optional initial
    normalization (first feature map only), an optional squeeze block, an
    optional intermediate block, and a prediction block with
    ``num_anchors * number_of_classes`` output channels. Every per-level
    prediction is reshaped to ``(B, H * W * num_anchors, number_of_classes)``
    and the levels are concatenated along axis 1. The returned values are the
    raw outputs of the prediction blocks; no activation is applied.

    Sub-layers are created lazily in :meth:`build`.

    Args:
        name: Layer name; also used as prefix for generated sub-layer names.
        num_anchors_per_location: Anchors per spatial location, one entry per
            feature map.
        number_of_classes: Number of class scores predicted per anchor.
        norm_cfg: ``"BatchNorm"`` or ``"Norm"``; any other value (including
            ``None``) disables the initial normalization.
        head_type: ``"conv3x3"`` or ``"dw"`` (``"depthwise"`` is accepted as
            an alias).
        use_sigmoid: Stored on the instance as ``self.use_sigmoid``. It is
            NOT applied to the output of :meth:`call`.
        **kwargs: Optional keys read by this class:
            ``squeeze_ratio`` -- channel multiplier of the squeeze block;
            ``1.0`` (default) disables it.
            ``intermediate_conv`` -- output channels of the intermediate
            block; ``1`` (default) disables it.
            Any other key is ignored.
    """

    # Accepted spellings for the depthwise-separable head.
    _DEPTHWISE_HEAD_TYPES = ("dw", "depthwise")

    def __init__(self,name: str, num_anchors_per_location: list[int], number_of_classes: int, norm_cfg: str = "BatchNorm", head_type: str = "conv3x3",use_sigmoid: bool = False, **kwargs):
        super().__init__(name=name)

        # Number of classes predicted per anchor
        self.number_of_classes = number_of_classes

        # Head type used for every block created by create_head
        self.head_type = head_type

        # Previously accepted and silently dropped; kept so callers can read it back.
        self.use_sigmoid = use_sigmoid

        # Anchors per spatial location, one entry per feature map
        self.num_anchors_per_location = num_anchors_per_location

        # Initial normalization strategy (None disables it)
        self.initial_norm = self.make_normalization(norm_cfg)

        # Squeeze ratio; 1.0 disables the squeeze stage
        self.squeeze_ratio = kwargs.get('squeeze_ratio',1.0)
        self.squeeze_blocks = []

        # Intermediate conv blocks; a channel count of 1 is the "disabled" sentinel
        self.intermediate_channels = kwargs.get('intermediate_conv',1)
        self.intermediate_blocks = None if self.intermediate_channels == 1 else []

        # Final prediction blocks, filled in build()
        self.final_heads = []

    def make_normalization(self, normalization_type):
        """Return the initial normalization layer, or ``None`` to disable it.

        NOTE(review): the BatchNormalization layer reuses the name
        "loc_initial_normalization" from the localization head; it is kept
        unchanged here so existing layer/weight names are not altered.
        """
        if normalization_type == "BatchNorm":
            return tf.keras.layers.BatchNormalization(name = "loc_initial_normalization")
        elif normalization_type == "Norm":
            return tf.keras.layers.Normalization(name = "initial_normalization")
        return None

    def call(self,feature_maps, training = False):
        """Run every feature map through its blocks and concatenate the results.

        Args:
            feature_maps: Iterable of 4-D tensors, one per pyramid level.
            training: Forwarded to every sub-layer.

        Returns:
            Tensor of shape ``(B, sum_i(H_i * W_i * A_i), number_of_classes)``.
        """
        outputs = []
        C = self.number_of_classes
        for layer, feature_map in enumerate(feature_maps):
            num_anchors = self.num_anchors_per_location[layer]
            x = feature_map

            # The initial normalization is applied to the first feature map only.
            if self.initial_norm is not None and layer == 0:
                x = self.initial_norm(x,training = training)

            # Squeeze block
            if self.squeeze_ratio != 1.0:
                x = self.squeeze_blocks[layer](x,training=training)

            # Intermediate block
            if self.intermediate_blocks is not None:
                x = self.intermediate_blocks[layer](x,training = training)

            # Final prediction: num_anchors * C channels per location
            x = self.final_heads[layer](x,training=training)

            # Flatten (B, H, W, A * C) -> (B, H * W * A, C) using runtime dims.
            dims = tf.shape(x)
            x = tf.reshape(x,[dims[0],dims[1] * dims[2] * num_anchors,C])
            outputs.append(x)

        return tf.concat(outputs,axis=1)

    def build(self,input_shape):
        """Create the per-level squeeze, intermediate and prediction blocks.

        Args:
            input_shape: Sequence of feature-map shapes; the last entry of
                each shape is read as the channel count.

        Raises:
            ValueError: If more feature maps are supplied than anchor counts
                were configured, or if ``head_type`` is not supported.
        """
        # Guard against a second build() appending duplicate blocks.
        if self.final_heads:
            return

        if len(input_shape) > len(self.num_anchors_per_location):
            raise ValueError(
                f"{self.name}: got {len(input_shape)} feature maps but only "
                f"{len(self.num_anchors_per_location)} anchor counts"
            )

        for layer,feature_map_shape in enumerate(input_shape):
            channel = int(feature_map_shape[-1])

            if self.squeeze_ratio != 1.0:
                # Never request a conv with zero filters for very small ratios.
                squeeze_out = max(1, int(channel * self.squeeze_ratio))
                squeeze = self.create_head(out_channels = squeeze_out, index = layer, role="squeeze")
                self.squeeze_blocks.append(squeeze)

            if self.intermediate_blocks is not None:
                intermediate_head = self.create_head(out_channels = self.intermediate_channels, index = layer, role="inter")
                self.intermediate_blocks.append(intermediate_head)

            A_per_layer = self.num_anchors_per_location[layer]

            # Prediction block: A_i * C output channels
            pred_head = self.create_head(out_channels = A_per_layer * self.number_of_classes, index = layer, role = "pred")
            self.final_heads.append(pred_head)

    def create_pred_heads(self,anchors_per_layer: list[int]):
        """Return one prediction block per entry, each with ``A_i * C`` output channels."""
        return [
            self.create_head(anchors * self.number_of_classes,layer_number,"pred")
            for layer_number, anchors in enumerate(anchors_per_layer)
        ]

    def create_intermediate_heads(self,anchors_per_layer: list[int]):
        """Return one intermediate block per entry of ``anchors_per_layer``.

        Only the length of ``anchors_per_layer`` is used; every block has
        ``self.intermediate_channels`` output channels. These blocks are named
        with the role tag "intermediate", whereas build() uses "inter".
        """
        return [
            self.create_head(self.intermediate_channels,layer_number,"intermediate")
            for layer_number in range(len(anchors_per_layer))
        ]

    def create_squeeze_heads(self,channels_per_layer: list[int]):
        """Return one squeeze block per entry, with ``int(channels * squeeze_ratio)`` output channels."""
        return [
            self.create_head(int(channels * self.squeeze_ratio),layer_number,"squeeze")
            for layer_number, channels in enumerate(channels_per_layer)
        ]

    def create_head(self, out_channels: int, index: int, role: str):
        """Build one block according to ``self.head_type``.

        Args:
            out_channels: Number of output filters.
            index: Feature-map index, used in the layer name.
            role: Short tag (e.g. ``"pred"``, ``"inter"``, ``"squeeze"``) used
                in the layer name.

        Returns:
            A 3x3 ``Conv2D`` for ``"conv3x3"``, or a ``Sequential`` of a 3x3
            depthwise conv followed by a 1x1 conv for ``"dw"`` / ``"depthwise"``.

        Raises:
            ValueError: If ``self.head_type`` is not one of the supported values.
        """
        base = f"{self.name}_cls_{role}_{index}"
        if self.head_type == "conv3x3":
            return tf.keras.layers.Conv2D(filters=out_channels, kernel_size=3,padding="same",name=base)
        if self.head_type in self._DEPTHWISE_HEAD_TYPES:
            dw_name = f"{base}_dw"
            pw_name = f"{base}_pw"
            return tf.keras.Sequential([
                tf.keras.layers.DepthwiseConv2D(kernel_size = 3, padding="same",name=dw_name),
                tf.keras.layers.Conv2D(filters=out_channels, kernel_size=1,padding="same",name=pw_name)
            ],name=base)
        # Previously fell through and returned None, which only failed later at call time.
        raise ValueError(
            f"Unsupported head_type {self.head_type!r}; expected 'conv3x3', 'dw' or 'depthwise'"
        )
        

In [23]:
# Small two-level configuration for a quick shape check of the classification head.
num_anchors_per_location = [3, 6]
number_of_classes = 4 

In [24]:
B = 2

# Each entry below is (height, width, channels); shapes are (B, H, W, C).
P0, P1 = (
    tf.random.normal((B, height, width, channels))
    for height, width, channels in ((2, 2, 8), (1, 1, 16))
)
feature_maps = [P0, P1]

In [25]:
# norm_cfg=None disables the initial normalization. use_sigmoid has no effect on
# the values returned by the head.
cls_head = ClassificationHead(
    name="cls_head",
    num_anchors_per_location=num_anchors_per_location,
    number_of_classes=number_of_classes,
    norm_cfg=None,
    head_type="conv3x3",
    use_sigmoid=False,
)

In [26]:
# First call: creates the per-level prediction convs, then runs the forward pass.
pred_logits = cls_head(feature_maps, training=False)

W0000 00:00:1764638355.702093  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.703297  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.706278  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.706896  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.707552  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.708774  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.709921  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.711069  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.712195  107630 gp

In [27]:
# Expected (B, 2*2*3 + 1*1*6, number_of_classes) = (2, 18, 4).
print(pred_logits.shape)

(2, 18, 4)


In [28]:
# Larger three-level configuration with 21 classes.
B = 1
num_anchors_per_location = [4, 6, 6]
number_of_classes = 21

In [29]:
# Each pair below is (spatial size, channels); shapes are (B, size, size, channels).
P0, P1, P2 = (
    tf.random.normal((B, size, size, channels))
    for size, channels in ((38, 512), (19, 1024), (10, 512))
)
feature_maps = [P0, P1, P2]

In [30]:
# Rebinds cls_head to a fresh instance for the three-level configuration
# (no initial normalization, plain 3x3 heads).
cls_head = ClassificationHead(
    name="cls_head",
    num_anchors_per_location=num_anchors_per_location,
    number_of_classes=number_of_classes,
    norm_cfg=None,
    head_type="conv3x3",
    use_sigmoid=False,
)

In [31]:
# Forward pass on the three-level feature maps.
pred_logits = cls_head(feature_maps, training=False)

W0000 00:00:1764638355.852163  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.853816  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.854955  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.856054  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.857261  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.858615  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.859703  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.861010  107630 gpu_timer.cc:114] Skipping the delay kernel, measurement accuracy will be reduced
W0000 00:00:1764638355.862561  107630 gp

In [32]:
# Expected (B, 38*38*4 + 19*19*6 + 10*10*6, number_of_classes) = (1, 8542, 21).
print(pred_logits.shape)

(1, 8542, 21)


In [33]:
# Exercise every optional stage at once: initial BatchNorm, depthwise heads,
# squeeze blocks and intermediate blocks.
cls_head = ClassificationHead(
    name="cls_head",
    num_anchors_per_location=[3, 6],
    number_of_classes=4,
    norm_cfg="BatchNorm",
    head_type="dw",
    use_sigmoid=True,
    squeeze_ratio=0.5,
    # in_channels is not read by ClassificationHead; channel counts are taken
    # from the input shapes when the layer is built.
    in_channels=[8, 16],
    # The class reads the key `intermediate_conv`; the previous spelling
    # `intermediate_channels` was silently ignored, so no intermediate block was built.
    intermediate_conv=32,
)

In [34]:
# Two small feature maps with 8 and 16 channels; training=True is forwarded to
# every sub-layer, including the initial BatchNormalization.
P0 = tf.random.normal((2, 2, 2, 8))
P1 = tf.random.normal((2, 1, 1, 16))
pred_logits = cls_head([P0, P1], training=True)

In [35]:
# Expected (2, 2*2*3 + 1*1*6, 4) = (2, 18, 4).
print(pred_logits.shape) 

(2, 18, 4)
