In [1]:
import os
# Select the GPU allocator through the environment; this runs before the cell
# that imports TensorFlow.
# NOTE(review): presumably the variable is only read when TensorFlow initialises
# the GPU, so it has to stay ahead of the import — confirm.
os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [2]:
import tensorflow as tf
from typing import Any, Optional
from contextlib import contextmanager

2025-12-19 23:05:40.904226: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-19 23:05:40.925176: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-19 23:05:40.931229: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-19 23:05:40.946804: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from mobilenetv2ssd.core.config import load_config

In [4]:
main_cfg_path = "configs/train/default.yaml"
model_cfg_path = "configs/model/mobilenetv2_ssd_voc.yaml"
data_cfg_path = "configs/data/voc_224.yaml"
eval_cfg_path = "configs/eval/default.yaml"

In [5]:
config = load_config(main_cfg_path,model_cfg_path,data_cfg_path,eval_cfg_path)

In [6]:
config['train']['amp']

{'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}

In [7]:
class PrecisionConfig:
    """Answers whether a given operation tag is flagged as force-fp32."""

    def __init__(self, forced_precision: set[str]):
        """Remember the collection of force-fp32 tags.

        Args:
            forced_precision: Tags flagged as force-fp32. The object is stored
                by reference, not copied.
        """
        self._forced_precision_fields = forced_precision

    def is_force_fp32_enabled(self, tag: str):
        """Return True if ``tag`` is among the stored tags, otherwise False."""
        return tag in self._forced_precision_fields

In [8]:
class AMPContext:
    """Bundle the mixed-precision (AMP) options of a training run with its optimizer.

    The context reads the options ``enabled``, ``policy``, ``loss_scale``,
    ``clip_unscaled_grads`` and ``force_fp32`` from a config mapping, can
    install the matching global Keras dtype policy (``setup_policy``) and can
    wrap the optimizer in a ``LossScaleOptimizer`` (``wrap_optimizer``).

    Attributes:
        optimizer: The optimizer to train with. It is the optimizer given to
            the constructor until ``wrap_optimizer()`` replaces it.
    """

    def __init__(self, config: dict[str, Any], optimizer: "tf.keras.optimizers.Optimizer"):
        """Read the AMP options; no TensorFlow state is touched here.

        Args:
            config: Mapping with the keys ``'enabled'``, ``'policy'``,
                ``'loss_scale'``, ``'clip_unscaled_grads'`` and
                ``'force_fp32'``. All five keys are required.
            optimizer: The base (unwrapped) optimizer.

        Raises:
            KeyError: If one of the required keys is missing from ``config``.
        """
        self._enabled = config['enabled']
        self._policy = config['policy']
        self._loss_scale : str | float = config['loss_scale']
        self._clip_unscaled_grads = config['clip_unscaled_grads']
        self._force_fp32 = self._normalize_tags(config['force_fp32'])
        self._policy_set = False
        self._base_optimizer = optimizer
        self.optimizer = optimizer # Use this optimizer only to handle the mixed precision

    @staticmethod
    def _normalize_tags(tags) -> set[str]:
        """Turn the ``force_fp32`` option into a fresh set of tags.

        ``None`` becomes an empty set and a bare string is treated as a single
        tag; calling ``set()`` on a string would split it into characters.
        """
        if tags is None:
            return set()
        if isinstance(tags, str):
            return {tags}
        return set(tags)

    def setup_policy(self):
        """Install the global Keras dtype policy for this context.

        The configured policy is used when AMP is enabled, ``"float32"``
        otherwise. Only the first successful call on this instance sets the
        policy; later calls return without touching it. The policy name is
        validated on every call, including when AMP is disabled.

        Raises:
            ValueError: If the configured policy name is not one of
                ``'mixed_float16'``, ``'mixed_bfloat16'``, ``'float32'``.
        """
        # Guarding against different strings
        if self._policy not in {'mixed_float16', 'mixed_bfloat16', 'float32'}:
            raise ValueError(f"AMP policy name is not valid, error value: {self._policy}, allowed values: {['mixed_float16', 'mixed_bfloat16', 'float32']}")

        # Nothing to do when this context already installed its policy
        if self._policy_set:
            return

        # float32 is set explicitly when AMP is off, even though it is the default
        policy_name = self._policy if self._enabled else "float32"
        policy = tf.keras.mixed_precision.Policy(policy_name)
        tf.keras.mixed_precision.set_global_policy(policy)
        self._policy_set = True

    def wrap_optimizer(self):
        """Choose the optimizer to train with and store it in ``self.optimizer``.

        With AMP disabled the base optimizer is used unchanged. With AMP
        enabled the base optimizer is wrapped in a new ``LossScaleOptimizer``
        on every call; a numeric ``loss_scale`` is passed as ``initial_scale``.

        Returns:
            The optimizer now stored in ``self.optimizer``.

        Raises:
            ValueError: If AMP is enabled and ``loss_scale`` is neither
                ``'dynamic'`` nor a positive int/float (booleans are rejected).
        """
        # AMP is off, so hand back the base optimizer
        if not self._enabled:
            self.optimizer = self._base_optimizer
            return self.optimizer

        if self._loss_scale == "dynamic":
            # Wrapping the optimizer in the loss scale mode to allow for mixed precision
            self.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(self._base_optimizer)
            return self.optimizer

        # bool is a subclass of int, so `loss_scale: true` would otherwise be
        # silently accepted as a scale of 1.0.
        is_number = isinstance(self._loss_scale, (int, float)) and not isinstance(self._loss_scale, bool)
        if not (is_number and self._loss_scale > 0):
            raise ValueError("Loss Scale is invalid, needs to be 'dynamic' or a positive int or float")

        # TODO: Add support later by going through the documentation for fixed scaling
        # NOTE(review): only `initial_scale` is passed here; whether that keeps
        # the scale fixed depends on the Keras version — confirm.
        self.optimizer = tf.keras.mixed_precision.LossScaleOptimizer(self._base_optimizer, initial_scale = float(self._loss_scale))
        return self.optimizer

    def scale_loss(self, gradients):
        """Placeholder hook: returns ``gradients`` unchanged.

        Kept for later, in case scaling and clipping are taken over manually.
        Despite the name, the argument is the gradients, not the loss.
        """
        return gradients

    @contextmanager
    def autocast(self):
        """Context manager that currently does nothing besides yielding."""
        yield

    def state_metadata(self):
        """Return the AMP options as a dict.

        ``'force_fp32'`` is a copy of the internal tag set, so callers can
        modify the result without affecting this context.
        """
        return {
            'enabled': self._enabled,
            'policy': self._policy,
            'loss_scale': self._loss_scale,
            'clip_unscaled_grads': self._clip_unscaled_grads,
            'force_fp32': set(self._force_fp32)
        }

    def make_precision_config(self):
        """Build a ``PrecisionConfig`` from a copy of the force-fp32 tags."""
        return PrecisionConfig(set(self._force_fp32))

In [9]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999)

I0000 00:00:1766203543.242758    7367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1766203543.329184    7367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1766203543.329250    7367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1766203543.330802    7367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1766203543.330863    7367 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [10]:
config = {'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [11]:
amp.state_metadata()

{'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': {'box_encode_decode', 'iou', 'loss_reduction', 'nms'}}

In [12]:
amp.wrap_optimizer()

<keras.src.optimizers.loss_scale_optimizer.LossScaleOptimizer at 0x77dc5cdbd8d0>

In [13]:
with amp.autocast():
    print(amp.optimizer)

<keras.src.optimizers.loss_scale_optimizer.LossScaleOptimizer object at 0x77dc5cdbd8d0>


## Test

### Test 1

In [14]:
tf.keras.mixed_precision.set_global_policy("float32")

In [15]:
opt = tf.keras.optimizers.Adam(1e-3)

In [16]:
config = {'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [17]:
amp.setup_policy()

In [18]:
global_policy_name = tf.keras.mixed_precision.global_policy().name

In [19]:
assert global_policy_name == "mixed_float16"

In [20]:
config = {'enabled': False,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [21]:
amp.setup_policy()

In [22]:
global_policy_name = tf.keras.mixed_precision.global_policy().name

In [23]:
assert global_policy_name == "float32"

### Test 2

In [24]:
opt = tf.keras.optimizers.Adam(1e-3)

In [25]:
config = {'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [26]:
amp.setup_policy()

In [27]:
amp.wrap_optimizer()

<keras.src.optimizers.loss_scale_optimizer.LossScaleOptimizer at 0x77dc5cdbf6a0>

In [28]:
assert amp.optimizer is not opt

### Test 3

In [29]:
opt = tf.keras.optimizers.Adam(1e-3)

In [30]:
config = {'enabled': False,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [31]:
amp.setup_policy()

In [32]:
amp.wrap_optimizer()

<keras.src.optimizers.adam.Adam at 0x77dc5cdc8ee0>

In [33]:
assert amp.optimizer is opt

### Test 4

In [34]:
tf.keras.mixed_precision.set_global_policy("float32")

In [35]:
opt = tf.keras.optimizers.Adam(1e-3)

In [36]:
config = {'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [37]:
amp.setup_policy()

In [38]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(16,)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(4),
])

In [39]:
assert tf.keras.mixed_precision.global_policy().name == "mixed_float16"

In [40]:
# A layer's dtype policy comes from the global policy that was active when the
# layer was created, not from the available hardware, so the same check holds
# with or without a GPU. The previous non-GPU branch only asserted that a
# policy object exists, which can never fail.
first_layer_policy = model.layers[0].dtype_policy
assert first_layer_policy.compute_dtype == "float16"
assert first_layer_policy.variable_dtype == "float32"

### Test 5

In [41]:
tf.keras.mixed_precision.set_global_policy("float32")

In [42]:
opt = tf.keras.optimizers.Adam(1e-3)

In [43]:
config = {'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [44]:
amp.setup_policy()

In [45]:
amp.wrap_optimizer()

<keras.src.optimizers.loss_scale_optimizer.LossScaleOptimizer at 0x77dc5cdca860>

In [46]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(16,)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(4),
])

In [47]:
x = tf.random.normal([8, 16])
y = tf.random.normal([8, 4])

In [48]:
w0_before = tf.identity(model.trainable_variables[0])

In [49]:
# Forward pass and loss, recorded on the tape. Prediction and target are cast
# to float32 before the squared-error reduction.
# NOTE(review): the loss is not scaled before tape.gradient() is called on it
# in the following cell, yet the gradients are later applied through
# amp.optimizer, which is a LossScaleOptimizer here. Confirm whether that
# optimizer expects gradients of a scaled loss (it may divide them by the loss
# scale when applying) and, if so, scale the loss first.
with tf.GradientTape() as tape:
    with amp.autocast():
        y_pred = model(x, training=True)
        y_pred = tf.cast(y_pred, tf.float32)
        y_true = tf.cast(y, tf.float32)
        loss = tf.reduce_mean(tf.square(y_pred - y_true))

In [50]:
grads = tape.gradient(loss, model.trainable_variables)

In [51]:
none_count = sum(g is None for g in grads)

In [52]:
assert none_count == 0, f"Found {none_count} None gradients."

In [53]:
grads = amp.scale_loss(grads)

In [54]:
global_norm = tf.linalg.global_norm(grads)

In [55]:
assert tf.math.is_finite(global_norm), "Gradient norm is NaN/Inf."

In [56]:
assert float(global_norm.numpy()) > 0.0, "Gradient norm is zero (unexpected for random data)."

In [57]:
amp.optimizer.apply_gradients(zip(grads, model.trainable_variables))

<KerasVariable shape=(), dtype=int64, path=loss_scale_optimizer_2/iteration>

In [58]:
w0_after = model.trainable_variables[0]

In [59]:
delta = tf.reduce_sum(tf.abs(tf.cast(w0_after, tf.float32) - tf.cast(w0_before, tf.float32)))

In [60]:
assert float(delta.numpy()) > 0.0, "Weights did not change after apply_gradients()."

### Test 6

In [61]:
config = {'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}
amp = AMPContext(config, opt)

In [62]:
amp.setup_policy()

In [63]:
amp.wrap_optimizer()

<keras.src.optimizers.loss_scale_optimizer.LossScaleOptimizer at 0x77dc57f4d900>

In [64]:
precision_config = amp.make_precision_config()

In [65]:
precision_config._forced_precision_fields

{'box_encode_decode', 'iou', 'loss_reduction', 'nms'}

In [66]:
from mobilenetv2ssd.models.ssd.ops.encode_ops_tf import encode_boxes_core

In [67]:
priors_cxcywh = tf.constant([0.5, 0.5, 0.2, 0.2], dtype = tf.float32)
gt_xyxy =  tf.constant([
    [0.10, 0.10, 0.30, 0.30],  # GT 0  (class e.g. 3)
    [0.55, 0.55, 0.85, 0.85],  # GT 1  (class e.g. 2)
    [0.20, 0.50, 0.40, 0.80],  # GT 2  (class e.g. 5)
    [0.00, 0.00, 0.00, 0.00],  # padded
], dtype=tf.float32)
variance = (0.1,0.2)

In [68]:
encode_boxes_core(gt_xyxy,priors_cxcywh,variance, precision_config = precision_config)

<tf.Tensor: shape=(4, 4), dtype=float32, numpy=
array([[-1.5000000e+01, -1.5000000e+01,  5.9604639e-07,  5.9604639e-07],
       [ 1.0000002e+01,  1.0000002e+01,  2.0273256e+00,  2.0273256e+00],
       [-9.9999990e+00,  7.4999986e+00,  0.0000000e+00,  2.0273256e+00],
       [ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  0.0000000e+00]],
      dtype=float32)>

In [69]:
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(16,)),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(4),
])

In [70]:
x = tf.random.normal([8, 16])
y = tf.random.normal([8, 4])

In [71]:
with tf.GradientTape() as tape:
    with amp.autocast():
        y_pred = model(x, training=True)

In [72]:
tf.keras.mixed_precision.global_policy().name

'mixed_float16'

In [73]:
type(amp.optimizer)

keras.src.optimizers.loss_scale_optimizer.LossScaleOptimizer

In [74]:
print("y_pred dtype:", y_pred.dtype)
print("var dtype:", model.trainable_variables[0].dtype)

y_pred dtype: <dtype: 'float16'>
var dtype: float32


## Factory Pattern

In [77]:
config = load_config(main_cfg_path,model_cfg_path,data_cfg_path,eval_cfg_path)

In [82]:
def build_amp_config(config: dict[str, Any]):
    """Extract the AMP options from the full config, filling in defaults.

    Args:
        config: Full configuration mapping. It must contain a ``'train'``
            mapping; its optional ``'amp'`` entry holds the AMP options.

    Returns:
        A new dict with the keys ``'enabled'``, ``'policy'``, ``'loss_scale'``,
        ``'clip_unscaled_grads'`` and ``'force_fp32'``. Missing options fall
        back to ``False``, ``"float32"``, ``'dynamic'``, ``True`` and ``[]``.

    Raises:
        KeyError: If ``config`` has no ``'train'`` entry.
    """
    train_config = config['train']

    # The 'amp' entry may be present but empty (None), e.g. a key with no
    # value in the config file; treat that the same as a missing section.
    amp_opts = train_config.get('amp') or {}

    # Same for 'force_fp32': an explicit None means "no tags". The default is
    # a list, matching the type the option has when it is configured.
    force_fp32 = amp_opts.get('force_fp32')
    if force_fp32 is None:
        force_fp32 = []

    return {
        'enabled': amp_opts.get('enabled', False),
        'policy' : amp_opts.get('policy', "float32"),
        'loss_scale': amp_opts.get('loss_scale', 'dynamic'),
        'clip_unscaled_grads' : amp_opts.get('clip_unscaled_grads', True),
        'force_fp32': force_fp32,
    }

In [83]:
build_amp_config(config)

{'enabled': True,
 'policy': 'mixed_float16',
 'loss_scale': 'dynamic',
 'clip_unscaled_grads': True,
 'force_fp32': ['loss_reduction', 'box_encode_decode', 'iou', 'nms']}

In [84]:
def build_amp(config: dict[str, Any], optimizer: tf.keras.optimizers.Optimizer):
    """Create an ``AMPContext`` for ``optimizer`` from the full config.

    The AMP options are first extracted with ``build_amp_config(config)`` and
    then handed, together with ``optimizer``, to ``AMPContext``.
    """
    return AMPContext(build_amp_config(config), optimizer)

In [86]:
amp = build_amp(config, opt)