In [1]:
import pandas as pd
from typing import Iterable, Literal, overload
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import cv2 as cv
import numpy as np
import math
import os

In [2]:
# Set the TF_GPU_ALLOCATOR environment variable in its own cell, ahead of the
# cell that imports TensorFlow.
os.environ["TF_GPU_ALLOCATOR"] = "cuda_malloc_async"

In [3]:
import tensorflow as tf

2025-11-29 19:17:16.605163: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-11-29 19:17:16.648585: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-11-29 19:17:16.661830: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-11-29 19:17:16.729591: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Implementing the Building Blocks For Loss Calculations

### Localization Losses

#### Smooth L1 Loss

In [4]:
def smooth_l1_loss(predicted_values, target, beta, reduction = "sum"):
    """Smooth L1 loss, summed over the last axis and then optionally reduced.

    For each element, with ``d = predicted_values - target``::

        0.5 * d**2 / beta    if |d| < beta
        |d| - 0.5 * beta     otherwise

    Args:
        predicted_values: Tensor of predictions. The last axis is summed
            (the four box coordinates in this notebook's examples).
        target: Tensor that can be subtracted from ``predicted_values``.
        beta: Threshold between the quadratic and the linear regime. With
            ``beta == 0`` no element satisfies ``|d| < beta`` and the loss
            is plain L1.
        reduction: ``"sum"``, ``"max"`` or ``"mean"`` collapse the per-row
            losses to a scalar. Any other value (e.g. ``"none"``) returns
            the per-row losses unreduced.

    Returns:
        A scalar tensor for ``"sum"``/``"max"``/``"mean"``; otherwise a tensor
        with the last axis of the inputs summed away.
    """
    # Calculate the difference between the two values
    difference = predicted_values - target
    absolute_difference = tf.math.abs(difference)

    # Bring beta to the dtype of the data so Python ints/floats both work.
    beta = tf.cast(beta, difference.dtype)

    # Quadratic branch. divide_no_nan yields 0 (instead of inf/NaN) when
    # beta == 0, which keeps the unselected branch of tf.where finite; a
    # non-finite value there would turn the gradients into NaN even though
    # the forward result never uses it.
    quadratic = tf.math.divide_no_nan(0.5 * tf.square(difference), beta)

    # Linear branch.
    linear = absolute_difference - 0.5 * beta

    # Pick the branch element-wise in a single select.
    errors = tf.where(absolute_difference < beta, quadratic, linear)

    # Sum over the four coordinates
    errors = tf.reduce_sum(errors, axis=-1)

    # Reduction strategy
    if reduction == "sum":
        loss = tf.reduce_sum(errors)
    elif reduction == "max":
        loss = tf.reduce_max(errors)
    elif reduction == "mean":
        loss = tf.reduce_mean(errors)
    else:
        # Any other value (including "none") leaves the per-row losses as-is.
        loss = errors

    return loss

In [5]:
# Two boxes with four coordinates each: box 0 matches the target exactly,
# box 1 has errors on both sides of beta.
pred = tf.constant(
    [
        [0.0, 0.0, 0.0, 0.0],     # box 0
        [0.2, -0.4, 1.2, -2.0],   # box 1
    ],
    dtype=tf.float32,
)
# All-zero target with the same shape and dtype as the predictions.
target = tf.zeros_like(pred)
beta = 1.0

I0000 00:00:1764461839.480739   72728 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764461839.582107   72728 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764461839.582170   72728 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764461839.584047   72728 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1764461839.584102   72728 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [6]:
smooth_l1_loss(predicted_values=pred, target=target, beta=beta, reduction="sum")

<tf.Tensor: shape=(), dtype=float32, numpy=2.3>

In [7]:
smooth_l1_loss(predicted_values=pred, target=target, beta=beta, reduction="none")

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0. , 2.3], dtype=float32)>

#### L1 Loss

In [8]:
def l1_loss(predicted_values,target,reduction = "sum"):
    """L1 loss: absolute error summed over the last axis, then optionally reduced.

    Args:
        predicted_values: Tensor of predictions.
        target: Tensor that can be subtracted from ``predicted_values``.
        reduction: ``"sum"``, ``"max"`` or ``"mean"`` reduce the per-row
            losses to a scalar; any other value returns them unreduced.

    Returns:
        The reduced scalar, or the per-row losses when no reduction applies.
    """
    # Per-row loss: sum of |pred - target| across the last axis.
    per_row = tf.reduce_sum(tf.math.abs(predicted_values - target), axis=-1)

    if reduction == "sum":
        return tf.reduce_sum(per_row)
    if reduction == "max":
        return tf.reduce_max(per_row)
    if reduction == "mean":
        return tf.reduce_mean(per_row)

    # Fallback: hand back the per-row values untouched.
    return per_row

In [9]:
# Two boxes of four coordinates; the target is all zeros.
pred = tf.constant(
    [
        [0.0, 0.0, 0.0, 0.0],
        [0.2, -0.4, 1.2, -2.0],
    ],
    dtype=tf.float32,
)

target = tf.zeros(pred.shape, dtype=pred.dtype)

In [10]:
l1_loss(predicted_values=pred, target=target, reduction="sum")

<tf.Tensor: shape=(), dtype=float32, numpy=3.8000002>

In [11]:
l1_loss(predicted_values=pred, target=target, reduction="none")

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0.       , 3.8000002], dtype=float32)>

#### L2 Loss

In [12]:
def l2_loss(predicted_values, target, reduction= "sum"):
    """L2 loss: squared error summed over the last axis, then optionally reduced.

    Args:
        predicted_values: Tensor of predictions.
        target: Tensor that can be subtracted from ``predicted_values``.
        reduction: ``"sum"``, ``"max"`` or ``"mean"`` reduce the per-row
            losses to a scalar; any other value returns them unreduced.

    Returns:
        The reduced scalar, or the per-row losses when no reduction applies.
    """
    # Per-row loss: sum of (pred - target)^2 across the last axis.
    residual = predicted_values - target
    per_row = tf.reduce_sum(tf.square(residual), axis=-1)

    if reduction == "sum":
        return tf.reduce_sum(per_row)
    if reduction == "max":
        return tf.reduce_max(per_row)
    if reduction == "mean":
        return tf.reduce_mean(per_row)

    # Fallback: hand back the per-row values untouched.
    return per_row

In [13]:
# Two rows of four values, compared against an all-zero target.
pred = tf.constant(
    [
        [1.0, 2.0, 3.0, 4.0],
        [0.0, 1.0, 0.0, 1.0],
    ],
    dtype=tf.float32,
)
tgt = tf.zeros_like(pred)

In [14]:
l2_loss(predicted_values=pred, target=tgt, reduction="none")

<tf.Tensor: shape=(2,), dtype=float32, numpy=array([30.,  2.], dtype=float32)>

In [15]:
l2_loss(predicted_values=pred, target=tgt, reduction="sum")

<tf.Tensor: shape=(), dtype=float32, numpy=32.0>

In [16]:
l2_loss(predicted_values=pred, target=tgt, reduction="mean")

<tf.Tensor: shape=(), dtype=float32, numpy=16.0>

### Classification Losses

In [17]:
def softmax_cross_entropy_loss(logits: tf.Tensor, labels: tf.Tensor, reduction: str ="none"):
    """Softmax cross-entropy between rows of logits and integer class labels.

    Args:
        logits: Tensor of raw scores, cast to float32. It is indexed as
            ``[row, class]``, one row per example.
        labels: Integer class index per row, cast to int32. It is stacked
            with the row indices along axis 1, so it must be 1-D with one
            entry per row of ``logits``.
        reduction: ``"sum"`` returns the summed loss, ``"mean"`` the sum
            divided by the number of rows (0 when there are no rows); any
            other value returns the per-row losses.

    Returns:
        A scalar tensor for ``"sum"``/``"mean"``, otherwise one loss per row.
    """
    logits = tf.cast(logits, tf.float32)
    labels = tf.cast(labels, tf.int32)

    # Subtract the row-wise maximum so the exponentials cannot overflow.
    stable_logits = logits - tf.reduce_max(logits, axis=-1, keepdims=True)

    # log-softmax = shifted logits minus the log of the summed exponentials.
    log_normalizer = tf.math.log(
        tf.reduce_sum(tf.math.exp(stable_logits), axis=-1, keepdims=True)
    )
    log_softmax = stable_logits - log_normalizer

    # Pick, for every row, the log-probability of its labelled class.
    row_ids = tf.range(tf.shape(logits)[0])
    picked_log_prob = tf.gather_nd(log_softmax, tf.stack([row_ids, labels], axis=1))
    per_row_loss = -picked_log_prob

    if reduction == "sum":
        return tf.reduce_sum(per_row_loss)
    if reduction == "mean":
        row_count = tf.cast(tf.size(per_row_loss), tf.float32)
        # divide_no_nan returns 0 rather than NaN for an empty batch.
        return tf.math.divide_no_nan(tf.reduce_sum(per_row_loss), row_count)
    return per_row_loss

In [18]:
# Three examples over three classes, with one label per row.
logits = tf.constant(
    [
        [2.0, 0.5, -1.0],
        [0.0, 0.0, 0.0],
        [1.0, 2.0, 3.0],
    ]
)
labels = tf.constant([0, 1, 2], dtype=tf.int32)


In [19]:
softmax_cross_entropy_loss(logits=logits, labels=labels, reduction="sum")

<tf.Tensor: shape=(), dtype=float32, numpy=1.7475296>

In [20]:
softmax_cross_entropy_loss(logits=logits, labels=labels, reduction="mean")

<tf.Tensor: shape=(), dtype=float32, numpy=0.5825099>

In [21]:
# Five examples over three classes, with one label per row.
logits = tf.constant(
    [
        [2.0, 0.5, -1.2],
        [-1.0, 3.0, 0.2],
        [0.0, -0.2, 1.0],
        [4.5, 0.0, -2.0],
        [1.2, 2.2, -3.0],
    ]
)
labels = tf.constant([0, 1, 2, 0, 1], dtype=tf.int32)


In [22]:
softmax_cross_entropy_loss(logits=logits, labels=labels, reduction="sum")

<tf.Tensor: shape=(), dtype=float32, numpy=1.1524363>

In [23]:
softmax_cross_entropy_loss(logits=logits, labels=labels, reduction="mean")

<tf.Tensor: shape=(), dtype=float32, numpy=0.23048726>

In [24]:
softmax_cross_entropy_loss(logits=logits, labels=labels, reduction="none")

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([0.23419617, 0.07615124, 0.5122688 , 0.01253359, 0.31728652],
      dtype=float32)>

## Implementing the Multibox Loss

In [25]:
# Toy batch: one image with three anchors (a0, a1, a2) and three classes.

# Predicted box offsets, indexed [image, anchor, coordinate].
pred_loc = tf.constant(
    [
        [  # image 0
            [0.1, -0.1, 0.2, -0.2],   # anchor 0
            [0.0, 0.0, 0.0, 0.0],     # anchor 1
            [0.3, -0.3, 0.1, -0.1],   # anchor 2
        ]
    ]
)

# Predicted class scores, indexed [image, anchor, class].
pred_logits = tf.constant(
    [
        [
            [2.0, 0.5, -1.0],   # a0
            [3.0, -1.0, 0.0],   # a1
            [0.0, -0.5, 2.0],   # a2
        ]
    ]
)

# Target offsets per anchor.
tgt_loc = tf.constant(
    [
        [
            [0.0, 0.0, 0.0, 0.0],      # a0 matched to some GT
            [0.1, -0.1, 0.1, -0.1],    # a1 is background, these values ignored
            [0.25, -0.25, 0.0, 0.0],   # a2 matched to some GT
        ]
    ]
)

# Target class per anchor: a0 is class 1, a1 is bg, a2 is class 2.
tgt_labels = tf.constant([[1, 0, 2]], dtype=tf.int32)

# a0 & a2 are positives.
pos_mask = tf.constant([[True, False, True]])

# a1 is a mined negative.
neg_mask = tf.constant([[False, True, False]])

In [26]:
def multibox_loss(
    predicted_offsets: tf.Tensor,
    predicted_logits: tf.Tensor,
    target_offsets: tf.Tensor,
    target_labels: tf.Tensor,
    positive_mask: tf.Tensor,
    negative_mask: tf.Tensor,
    localization_weight: float,
    classification_weight: float,
    beta: float | None,
    cls_loss_type: str = "softmax_ce",
    loc_loss_type: str = "smooth_l1",
    normalize_denom: str = "num_pos",
    reduction: str = "sum",
):
    """Weighted sum of a localization loss and a classification loss.

    The localization loss is computed on anchors selected by
    ``positive_mask`` only; the classification loss is computed on anchors
    selected by ``positive_mask`` OR ``negative_mask``.

    Args:
        predicted_offsets: Predicted box offsets; reshaped to ``[-1, 4]``.
        predicted_logits: Predicted class scores; reshaped to ``[-1, C]``
            where ``C`` is the size of the last axis.
        target_offsets: Target box offsets; reshaped to ``[-1, 4]``.
        target_labels: Integer class label per anchor; flattened.
        positive_mask: Boolean mask of positive anchors. Its first axis is
            used as the batch size for ``normalize_denom="num_batch"``.
        negative_mask: Boolean mask of negative anchors.
        localization_weight: Multiplier on the localization term.
        classification_weight: Multiplier on the classification term.
        beta: Smooth-L1 threshold; ``None`` falls back to ``1.0``. Only used
            when ``loc_loss_type == "smooth_l1"``.
        cls_loss_type: Only ``"softmax_ce"`` is supported.
        loc_loss_type: ``"smooth_l1"``, ``"l1_loss"`` or ``"l2_loss"``.
        normalize_denom: Denominator applied to the raw losses:
            ``"num_neg"`` (both by negatives), ``"num_cls"`` (both by the
            number of anchors used for classification), ``"num_batch"``
            (localization by positives, classification by batch size); any
            other value (default ``"num_pos"``) divides both by positives.
        reduction: Forwarded unchanged to both component losses. The total
            is only guaranteed to be a scalar for ``"sum"`` and ``"mean"``.

    Returns:
        Dict with ``total_loss``, ``loc_loss``, ``cls_loss``, ``num_pos`` and
        ``num_negative``. The two counts are clamped to a minimum of 1.

    Raises:
        ValueError: If ``cls_loss_type`` or ``loc_loss_type`` is not one of
            the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound local
    # variable further down.
    if cls_loss_type != "softmax_ce":
        raise ValueError(
            f"Unsupported cls_loss_type {cls_loss_type!r}; expected 'softmax_ce'."
        )
    if loc_loss_type not in ("smooth_l1", "l1_loss", "l2_loss"):
        raise ValueError(
            f"Unsupported loc_loss_type {loc_loss_type!r}; "
            "expected 'smooth_l1', 'l1_loss' or 'l2_loss'."
        )

    # Anchors that take part in the classification loss
    classification_mask = tf.logical_or(positive_mask, negative_mask)

    # Anchor counts, clamped to at least 1 so they are safe as denominators.
    number_of_positives = tf.maximum(
        1, tf.reduce_sum(tf.cast(positive_mask, tf.int32))
    )
    number_of_negatives = tf.maximum(
        1, tf.reduce_sum(tf.cast(negative_mask, tf.int32))
    )
    # Count the classification anchors directly. Adding the two clamped
    # counts would over-count by one whenever either mask is empty.
    number_of_classifications = tf.maximum(
        1, tf.reduce_sum(tf.cast(classification_mask, tf.int32))
    )

    # Batch size, used by the "num_batch" normalization
    B = tf.shape(positive_mask)[0]

    # Flattening the masks
    positive_mask_flattened = tf.reshape(positive_mask, [-1])
    classification_mask_flattened = tf.reshape(classification_mask, [-1])

    # Flattened Offsets
    predicted_offsets_flattened = tf.reshape(predicted_offsets, [-1, 4])
    target_offsets_flattened = tf.reshape(target_offsets, [-1, 4])

    # Flattening the Logits
    C = tf.shape(predicted_logits)[-1]
    predicted_logits_flattened = tf.reshape(predicted_logits, [-1, C])

    # Flattening the Labels
    labels_flattened = tf.reshape(target_labels, [-1])

    # Offsets of the positive anchors (localization inputs)
    positive_offsets_flattened = tf.boolean_mask(
        predicted_offsets_flattened, positive_mask_flattened
    )
    positive_targets_flattened = tf.boolean_mask(
        target_offsets_flattened, positive_mask_flattened
    )

    # Logits and labels of the anchors used in the classification task
    selected_prediction_logits = tf.boolean_mask(
        predicted_logits_flattened, classification_mask_flattened
    )
    selected_prediction_targets = tf.boolean_mask(
        labels_flattened, classification_mask_flattened
    )

    # The classification loss looks at both the positive and negative anchors
    classification_raw = softmax_cross_entropy_loss(
        selected_prediction_logits, selected_prediction_targets, reduction=reduction
    )

    # The localization loss only looks at the positive anchors
    if loc_loss_type == "smooth_l1":
        smooth_beta = beta if beta is not None else 1.0
        localization_raw = smooth_l1_loss(
            positive_offsets_flattened,
            positive_targets_flattened,
            beta=smooth_beta,
            reduction=reduction,
        )
    elif loc_loss_type == "l1_loss":
        localization_raw = l1_loss(
            positive_offsets_flattened, positive_targets_flattened, reduction=reduction
        )
    else:  # "l2_loss" (validated above)
        localization_raw = l2_loss(
            positive_offsets_flattened, positive_targets_flattened, reduction=reduction
        )

    # Normalize the losses
    if normalize_denom == "num_neg":
        localization_denominator = number_of_negatives
        classification_denominator = number_of_negatives
    elif normalize_denom == "num_cls":
        localization_denominator = number_of_classifications
        classification_denominator = number_of_classifications
    elif normalize_denom == "num_batch":
        localization_denominator = number_of_positives
        classification_denominator = B
    else:
        # Default ("num_pos"): both terms are divided by the positive count.
        localization_denominator = number_of_positives
        classification_denominator = number_of_positives

    localization_loss = tf.math.divide(
        localization_raw, tf.cast(localization_denominator, dtype=tf.float32)
    )
    classification_loss = tf.math.divide(
        classification_raw, tf.cast(classification_denominator, dtype=tf.float32)
    )

    # Adding the losses using the weights
    total_loss = (localization_weight * localization_loss) + (
        classification_weight * classification_loss
    )

    return {
        'total_loss': total_loss,
        'loc_loss': localization_loss,
        'cls_loss': classification_loss,
        'num_pos': number_of_positives,
        'num_negative': number_of_negatives
    }

In [27]:
multibox_loss(
    predicted_offsets=pred_loc,
    predicted_logits=pred_logits,
    target_offsets=tgt_loc,
    target_labels=tgt_labels,
    positive_mask=pos_mask,
    negative_mask=neg_mask,
    localization_weight=1,
    classification_weight=1,
    beta=1,
    loc_loss_type="smooth_l1",
)

{'total_loss': <tf.Tensor: shape=(), dtype=float32, numpy=1.0332147>,
 'loc_loss': <tf.Tensor: shape=(), dtype=float32, numpy=0.03125>,
 'cls_loss': <tf.Tensor: shape=(), dtype=float32, numpy=1.0019647>,
 'num_pos': <tf.Tensor: shape=(), dtype=int32, numpy=2>,
 'num_negative': <tf.Tensor: shape=(), dtype=int32, numpy=1>}