#Demo of AsymFormer: Evaluation and Inference Speed Test
Note: The entire testing process must be conducted on an Ubuntu operating system with support for PyCUDA and TensorRT. We recommend using PyTorch>=2.0 and CUDA>=12.0 to run the speed test. The inference speed reported in the paper was measured on an RTX 3090 platform with Ubuntu 20.04, CUDA 12.0, PyTorch 2.0.1, opencv-python 4.5.5.64, TensorRT 8.6.0 and pycuda 2022.2.2.

#Step.1: Import necessary packages and define the data transform functions.

In [None]:
"""Import necessary packages"""
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import torch
import torchvision
import time
from torch.utils.data import DataLoader
import datetime
import cv2
import NYUv2_dataloader as Data
from utils.utils import intersectionAndUnion, AverageMeter, accuracy, macc
import tensorrt as trt
import pycuda.driver as cuda

"""Set the image size in inference"""
# Target width and height (in pixels) used by the resize transform below.
image_w, image_h = 640, 480

"""Data Transform: Resize, ToTensor and Normalization"""
# transform
class scaleNorm(object):
    """Resize the image, depth map and label map of a sample to (image_w, image_h).

    The image is resampled bi-linearly; depth and label are resampled with
    nearest-neighbour so that no in-between depth values or class ids are
    produced by the resize.
    """

    def __call__(self, sample):
        """Return a new dict holding the resized 'image', 'depth' and 'label'.

        :param sample: dict with 'image', 'depth' and 'label' numpy arrays.
        :return: dict with the same three keys, each resized to the
            module-level ``image_w`` x ``image_h`` resolution.
        """
        image, depth, label = sample['image'], sample['depth'], sample['label']
        target_size = (image_w, image_h)  # cv2.resize takes dsize as (width, height)

        # NOTE(review): presumably cast so that cv2.resize accepts the label dtype - confirm.
        label = label.astype(np.int16)

        # The third positional parameter of cv2.resize is `dst`, not the
        # interpolation flag, so the mode has to be passed by keyword to take
        # effect; passed positionally, the default (bi-linear) would be used.
        # Bi-linear
        image = cv2.resize(image, target_size, interpolation=cv2.INTER_LINEAR)
        # Nearest-neighbor
        depth = cv2.resize(depth, target_size, interpolation=cv2.INTER_NEAREST)
        label = cv2.resize(label, target_size, interpolation=cv2.INTER_NEAREST)

        return {'image': image, 'depth': depth, 'label': label}


class ToTensor(object):
    """Turn the numpy arrays of a sample into float32 torch tensors."""

    def __call__(self, sample):
        """Return a new dict with 'image', 'depth' and 'label' as float tensors.

        The last axis of the image is moved to the front (HWC -> CHW for an
        image), the depth map gains a leading axis of length one, and the
        label is converted without any reshaping.
        """
        converted_arrays = {
            'image': np.transpose(sample['image'], (2, 0, 1)),
            'depth': np.expand_dims(sample['depth'], axis=0),
            'label': sample['label'],
        }
        return {key: torch.from_numpy(array).float()
                for key, array in converted_arrays.items()}


class Normalize(object):
    """Standardise the image and depth tensors of a sample, keeping the raw inputs."""

    def __call__(self, sample):
        """Normalise 'image' and 'depth' in ``sample`` and return the same dict.

        The image is first divided by 255 and then normalised per channel; the
        depth is normalised with a single mean/std pair. Clones of the incoming
        tensors are stored under 'origin_image' and 'origin_depth'.
        """
        raw_image, raw_depth = sample['image'], sample['depth']
        image_backup = raw_image.clone()
        depth_backup = raw_depth.clone()

        image_normalizer = torchvision.transforms.Normalize(
            mean=[0.4850042694973687, 0.41627756261047333, 0.3981809741523051],
            std=[0.26415541082494515, 0.2728415392982039, 0.2831175140191598])
        depth_normalizer = torchvision.transforms.Normalize(
            mean=[2.8424503515351494], std=[0.9932836506164299])

        normalized_image = image_normalizer(raw_image / 255)
        normalized_depth = depth_normalizer(raw_depth)

        # The sample dict is updated in place and handed back to the caller.
        sample['origin_image'] = image_backup
        sample['origin_depth'] = depth_backup
        sample['image'] = normalized_image
        sample['depth'] = normalized_depth
        return sample

#Step.2: Load the preprocessed TensorRT model.


In [None]:
# Read the serialized TensorRT model. In this case, the model is put in the same
# folder as this jupyter notebook. The `with` block closes the file handle as
# soon as the bytes have been read.
with open("AsymFormer.engine", "rb") as f:
    serialized_engine = f.read()

# Set up a TensorRT runtime with a warning-level logger.
runtime = trt.Runtime(trt.Logger(trt.Logger.WARNING))
# Load the TensorRT inference engine from the serialized bytes.
engine = runtime.deserialize_cuda_engine(serialized_engine)
if engine is None:
    # deserialize_cuda_engine signals failure by returning None; fail here with
    # a clear message instead of an AttributeError on the next line.
    raise RuntimeError(
        "Failed to deserialize 'AsymFormer.engine'; the engine must be built with "
        "the same TensorRT version and GPU architecture that are used to load it.")

# Create an execution context that corresponds to the TensorRT engine.
# The execution context is used later on to execute inference on the engine.
context = engine.create_execution_context()

# Set up I/O bindings
inputs = []
outputs = []
allocations = []

# Allocate device memory for every input and output binding.
# NOTE(review): cuda.mem_alloc needs an active CUDA context; no pycuda.autoinit
# import or explicit context creation is visible here - confirm one exists.
for i in range(engine.num_bindings):
    is_input = engine.binding_is_input(i)
    name = engine.get_binding_name(i)
    dtype = engine.get_binding_dtype(i)
    shape = engine.get_binding_shape(i)
    if is_input:
        batch_size = shape[0]

    if any(dim < 0 for dim in shape):
        # A dynamic (-1) dimension would yield a meaningless byte count below.
        raise ValueError(
            "Binding '{}' has a dynamic shape {}; a fixed-shape engine is required."
            .format(name, list(shape)))

    # Byte size of the binding = element size * number of elements.
    np_dtype = np.dtype(trt.nptype(dtype))
    size = np_dtype.itemsize
    for s in shape:
        size *= s

    allocation = cuda.mem_alloc(size)

    binding = {
        'index': i,
        'name': name,
        'dtype': np_dtype,
        'shape': list(shape),
        'allocation': allocation,
    }

    # `allocations` keeps the device pointers of all bindings in binding order.
    allocations.append(allocation)
    if is_input:
        inputs.append(binding)
    else:
        outputs.append(binding)

print(inputs)
print(outputs)
print(allocations)


#Step.3: Run a single-image inference to verify that the necessary packages have been imported correctly and the TensorRT model has been loaded.

In [None]:
"""Before we """
# CUDA stream that asy_infer uses for its memory copies and the engine execution.
stream = cuda.Stream()
# Host buffer for the result, created with the shape and dtype of the first output binding.
output = np.zeros(shape=outputs[0]['shape'], dtype=outputs[0]['dtype'])

def asy_infer(batch,output):
    """
    Run one inference pass of the TensorRT engine.

    Each array in ``batch`` is copied to the engine input binding at the same
    position, the engine is executed on the module-level CUDA stream, and the
    first output binding is copied back into ``output``. The function returns
    only after the stream has been synchronized.

    :param batch: Sequence of numpy arrays, one per engine input, in the order of
        the ``inputs`` bindings (the callers in this notebook pass [rgb, depth]).
    :param output: Pre-allocated numpy array that receives the first engine output.
        It is overwritten in place.
    :return: The same ``output`` array, now holding the inference result.
    :raises ValueError: If ``batch`` holds fewer arrays than the engine has inputs,
        or an array does not have as many elements as its binding.
    """
    if len(batch) < len(inputs):
        raise ValueError('Expected {} input arrays, got {}'.format(len(inputs), len(batch)))

    # Keep the host arrays referenced until the stream is synchronized, so the
    # source buffers of the asynchronous copies cannot be freed too early.
    host_inputs = []
    for binding, array in zip(inputs, batch):
        # Coerce to the dtype of the binding: the device buffer was sized for that
        # dtype, so copying e.g. float64 data would overrun the allocation.
        host_array = np.ascontiguousarray(array, dtype=binding['dtype'])
        expected_elements = int(np.prod(binding['shape']))
        if host_array.size != expected_elements:
            raise ValueError("Input '{}' expects {} elements (shape {}), got {}".format(
                binding['name'], expected_elements, binding['shape'], host_array.size))
        host_inputs.append(host_array)
        cuda.memcpy_htod_async(binding['allocation'], host_array, stream)

    # Execute the network on the same stream, right after the input copies.
    context.execute_async_v2(allocations,stream.handle,None)

    # Copy the first output back to the host and wait for all queued work.
    cuda.memcpy_dtoh_async(output, outputs[0]['allocation'],stream)
    stream.synchronize()
    return output

In [None]:
"""Create the validation dataloader"""
# Preprocessing pipeline: resize, convert to tensors, then normalize.
eval_transform = torchvision.transforms.Compose([scaleNorm(), ToTensor(), Normalize()])

# Dataset for evaluation, created with phase_train disabled.
val_data = Data.RGBD_Dataset(
    transform=eval_transform,
    phase_train=False,
    data_dir='../NYUv2_numpy',  # The file path of the NYUv2 dataset
    txt_name='test.txt',  # Data split. In evaluation, use the test.txt
)

# One sample per batch, in dataset order, loaded in the main process.
val_loader = DataLoader(val_data, batch_size=1, shuffle=False, num_workers=0, pin_memory=True)

In [None]:
# Fetch the first validation sample once: every `val_data[0]` lookup calls the
# dataset's __getitem__ again, so indexing per field would repeat that work.
first_sample = val_data[0]
img = first_sample['image'].numpy().astype(np.float32)
depth = first_sample['depth'].numpy().astype(np.float32)
label = first_sample['label'].numpy().astype(np.float32)
# Inputs in the order asy_infer copies them to the engine: image first, then depth.
batch = [img, depth]

In [None]:
"""Conduct a single image inference. """
# Make sure no earlier GPU work is pending before the measurement starts.
torch.cuda.synchronize()
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)

# Run the inference and record the time cost.
# The inference speed will vary if you run this code block multiple times.
# You will observe that the initial run is slower compared to subsequent runs.
# This is because the GPU is in power-saving mode when it is not under any workload.
# Step 4 demonstrates the use of warm-up inference to prompt the GPU to exit power-saving mode.
starter.record()
out = asy_infer(batch, output)
ender.record()
torch.cuda.synchronize()

# elapsed_time returns milliseconds, hence 1000 / ms gives frames per second.
curr_time = starter.elapsed_time(ender)
# The label previously contained a mis-encoded full-width colon; a plain ':' is printed now.
print('Frame Per Second (FPS):', 1000 / curr_time)

In [None]:
"""Show the visualization of the inference result"""
from matplotlib import pyplot as plt

# Take the index of the highest score along axis 1, then drop the leading axis
# of length one so that a 2-D map is left for plotting.
pred = np.argmax(out, axis=1)
pred = np.squeeze(pred, axis=0)
plt.imshow(pred)

#Step.4: Inference speed test, without evaluation.

In [None]:
torch.cuda.synchronize()
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
# One timing slot (in milliseconds) per batch of the validation loader.
timings = np.zeros((len(val_loader), 1))

# Create dummy inputs of rgb and depth at the inference resolution, to run the warm-up.
img = np.random.rand(1, 3, image_h, image_w).astype(np.float32)
depth = np.random.rand(1, 1, image_h, image_w).astype(np.float32)
batch = [img, depth]

with torch.no_grad():
    # Run a warm-up inference, giving the GPU a workload to exit power-saving mode.
    for _ in range(50):
        _ = asy_infer(batch, output)

    # Run the inference speed test.
    for batch_idx, sample in enumerate(val_loader):

        image = sample['image'].numpy().astype(np.float32)
        depth = sample['depth'].numpy().astype(np.float32)
        batch = [image, depth]

        # Run the inference and record the time cost.
        starter.record()
        pred = asy_infer(batch, output)
        ender.record()
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[batch_idx] = curr_time

# Output the inference speed of AsymFormer.
# The first two measurements are left out of the average. The mean is taken over
# the number of measurements actually summed, instead of a hard-coded 653 that
# neither matched the slice length nor adapted to other dataset sizes.
measured = timings[2:] if len(timings) > 2 else timings
mean_ms = measured.mean()
print('Average Inference speed (ms): ', mean_ms)
print('Frame Per Second (FPS):', 1000 / mean_ms)



#Step.5: Evaluation
This code will output both inference speed and quantitative evaluation result.

In [None]:
# Running statistics that are accumulated over the whole validation set.
acc_meter = AverageMeter()
intersection_meter = AverageMeter()
union_meter = AverageMeter()
a_meter = AverageMeter()
b_meter = AverageMeter()
acc_collect = []

torch.cuda.synchronize()
starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True)
# One timing slot (in milliseconds) per batch of the validation loader.
timings = np.zeros((len(val_loader), 1))

# Dummy rgb and depth inputs at the inference resolution for the warm-up.
img = np.random.rand(1, 3, image_h, image_w).astype(np.float32)
depth = np.random.rand(1, 1, image_h, image_w).astype(np.float32)
# The warm-up has to use the dummy inputs created above; without this assignment
# it would silently reuse whatever `batch` an earlier cell left behind.
batch = [img, depth]

with torch.no_grad():
    # Warm-up runs, so the timed runs below are not affected by the first slow launches.
    for _ in range(50):
        _ = asy_infer(batch, output)

    for batch_idx, sample in enumerate(val_loader):

        image = sample['image'].numpy().astype(np.float32)
        depth = sample['depth'].numpy().astype(np.float32)
        label = sample['label'].numpy()
        batch = [image, depth]

        # Time a single inference with CUDA events.
        starter.record()
        pred = asy_infer(batch, output)
        ender.record()
        torch.cuda.synchronize()
        curr_time = starter.elapsed_time(ender)
        timings[batch_idx] = curr_time

        # Index of the highest score along axis 1, shifted by one.
        # NOTE(review): presumably label 0 marks unlabeled pixels, so class ids start at 1 - confirm.
        out = pred.argmax(axis=1) + 1
        out = out.squeeze(axis=0)

        acc, pix = accuracy(out, label)
        acc_collect.append(acc)
        intersection, union = intersectionAndUnion(out, label, 40)
        acc_meter.update(acc, pix)
        a_m, b_m = macc(out, label, 40)
        intersection_meter.update(intersection)
        union_meter.update(union)
        a_meter.update(a_m)
        b_meter.update(b_m)
        print('[{}] iter {}, accuracy: {}'
              .format(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                      batch_idx, acc))


# Per-class IoU from the accumulated sums; the epsilon avoids a division by zero.
iou = intersection_meter.sum / (union_meter.sum + 1e-10)
for i, _iou in enumerate(iou):
    print('class [{}], IoU: {}'.format(i, _iou))

mAcc = (a_meter.average() / (b_meter.average() + 1e-10))
print(mAcc.mean())
print('[Eval Summary]:')
print('Mean IoU: {:.4}, Accuracy: {:.2f}%'
      .format(iou.mean(), acc_meter.average() * 100))

# The first two measurements are left out of the average. The mean is taken over
# the number of measurements actually summed, instead of a hard-coded 653 that
# neither matched the slice length nor adapted to other dataset sizes.
measured = timings[2:] if len(timings) > 2 else timings
mean_ms = measured.mean()
print('Average Inference speed (ms): ', mean_ms)
print('Frame Per Second (FPS):', 1000 / mean_ms)