In [1]:
import torch

In [2]:
from tools.train import rt_detr_config

# Every tensor in this benchmark is created on (or moved to) this CUDA device.
device = torch.device("cuda")

# Build the detector from the project's RT-DETR config and place its weights on
# the benchmark device. The timed cells feed CUDA tensors into `model`, so the
# parameters must live on the same device; `.to(device)` is a no-op if the
# config already put them there.
# NOTE(review): assumes `rt_detr_config().model` is a `torch.nn.Module` — confirm.
model = rt_detr_config().model.to(device)

Load PResNet18 state_dict


In [11]:
from powerlines.sahi import multiscale_image_patches

# Random stand-in for a single frame: shape (1, 3, 3000, 4096), float32.
# Only the shape matters here — the tensor is used to count how many patch
# inputs the multiscale slicing produces for one frame of this size.
original_input = torch.randn(1, 3, 3000, 4096, dtype=torch.float32)

# Slice the frame with the project's multiscale patching helper.
# NOTE(review): exact tiling/overlap semantics of `step_size_fraction` and
# `predict_on_full_image` are defined in `powerlines.sahi` — confirm there.
patches = multiscale_image_patches(
    original_input,
    patch_sizes=[256, 512, 1024],
    step_size_fraction=0.8,
    predict_on_full_image=True,
)

# Number of entries in `patches.patches`, i.e. model inputs generated per frame.
num_patch_inputs = len(patches.patches)

In [12]:
# Number of patch inputs pushed through the model in one forward pass.
batch_size = 64
# True division: this is a (possibly fractional) count of batches needed to
# cover one frame's patches, used later as a scaling factor for timings.
num_batches_per_frame = num_patch_inputs / batch_size
print(f"Num patches per frame: {num_patch_inputs}")
print(f"Num batches per frame: {num_batches_per_frame:.3f}")

401

In [None]:
# Synthetic batch reused for every forward pass: (batch_size, 3, 640, 640),
# float32, created on the host and then copied to the benchmark device.
input = torch.randn(batch_size, 3, 640, 640, dtype=torch.float32).to(device)

# CUDA events bracketing each timed forward pass.
starter = torch.cuda.Event(enable_timing=True)
ender = torch.cuda.Event(enable_timing=True)

# Untimed forward passes executed before measuring.
warm_up_rounds = 50
# Timed forward passes: 1000 times the number of batches per frame, truncated
# to an integer.
repetitions = int(1000 * num_batches_per_frame)
# One elapsed-time slot per timed repetition, stored as a column vector.
timings = torch.zeros(repetitions, 1)

In [None]:
# Benchmark the forward pass under fp16 autocast.
#
# Both the warm-up and the timed loop run in eval mode with gradient tracking
# disabled, so the warm-up exercises the same code path that is measured and
# does not build autograd graphs for a large batch.
# NOTE(review): assumes `model` is a `torch.nn.Module` (for `.eval()`) — confirm.
model.eval()

with torch.no_grad(), torch.autocast(device_type="cuda", dtype=torch.float16, enabled=True):
    # GPU warm-up: untimed passes before any measurement is taken.
    for _ in range(warm_up_rounds):
        _ = model(input)

    # Drain all queued warm-up work so it cannot leak into the first sample.
    torch.cuda.synchronize()

    # Measure inference time, one sample per forward pass.
    for rep in range(repetitions):
        starter.record()
        _ = model(input)
        ender.record()

        # Kernel launches are asynchronous: wait for the GPU to finish before
        # reading the interval between the two events (in milliseconds).
        torch.cuda.synchronize()
        timings[rep] = starter.elapsed_time(ender)

In [None]:
# Compute inference time per frame.
#
# `timings` holds one elapsed time per batch (milliseconds, as reported by
# `torch.cuda.Event.elapsed_time`). Both statistics are scaled by the number of
# batches per frame so that the reported mean and spread share the same
# per-frame units; previously only the mean was scaled, which printed a
# per-frame mean next to a per-batch standard deviation.
mean_syn = timings.mean() * num_batches_per_frame
std_syn = timings.std() * num_batches_per_frame
print(f"Inference speed: {mean_syn:.3f} +- {std_syn:.3f}")