# Palm Detector Tutorial
# Part 4 - Deploying the Palm Detector with Vitis-AI for DPU inference

## Goal

In this notebook we show how to prepare the BlazePalm model (a PyTorch port of the MediaPipe palm detector)
for DPU inference with Vitis-AI. We will cover:

* Creating the PyTorch model and loading pre-trained weights
* Inspecting the model with the Vitis AI Inspector
* Quantizing and evaluating the PyTorch model
* Compiling for DPU using the Vitis AI compiler


## References

* [BlazePalm](https://github.com/vidursatija/BlazePalm)
* [Vitis AI v3.5](https://github.com/Xilinx/Vitis-AI/tree/v3.5)
   * [vai_q_pytorch](https://docs.xilinx.com/r/en-US/ug1414-vitis-ai/Running-vai_q_pytorch)
   * [vai_c_xir](https://docs.xilinx.com/r/en-US/ug1414-vitis-ai/Compiling-for-DPU)
* [DPU-PYNQ](https://github.com/Xilinx/DPU-PYNQ)


## Version History
* Jan 4, 2024
    * Preliminary revision

In [1]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision

print(torch.__version__)

1.13.1


In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
if torch.cuda.device_count() == 0:
  print('No CUDA devices available..selecting CPU')
  device = torch.device('cpu')
else:
  print('You have', torch.cuda.device_count(), 'CUDA devices available')
  # List every visible GPU so the reader can see what is installed.
  for i in range(torch.cuda.device_count()):
    print(' Device', str(i), ': ', torch.cuda.get_device_name(i))
  print('Selecting device 0..')
  device = torch.device('cuda:0')

# load trained model
#model = CNN().to(device)
#model.load_state_dict(torch.load(os.path.join(float_model,'f_model.pth')))

No CUDA devices available..selecting CPU


In [3]:


# Taken from https://github.com/vidursatija/BlazeFace-CoreML/blob/master/ML/blazeface.py
class ResModule(nn.Module):
	"""Residual unit built from a 3x3 depthwise and a 1x1 pointwise convolution.

	The output is ``ReLU(convs(main_input) + shortcut)``.

	* ``stride == 2``: the main-path input is padded by two pixels on the
	  right and bottom before the (unpadded) strided depthwise convolution,
	  and the shortcut is 2x2 max-pooled so both paths halve the resolution.
	* any other stride: the depthwise convolution uses "same" padding and the
	  shortcut is the unmodified input.
	* ``out_channels > in_channels``: the shortcut is zero-padded along the
	  channel axis so it can be added to the convolution output.

	Arguments:
		in_channels: number of channels of the input tensor.
		out_channels: number of channels produced by the pointwise convolution.
		stride: stride of the depthwise convolution (default 1).
	"""

	def __init__(self, in_channels, out_channels, stride=1):
		super().__init__()
		self.stride = stride
		self.channel_pad = out_channels - in_channels
		# The depthwise kernel is always 3x3.
		depthwise_kernel = 3

		downsample = stride == 2
		if downsample:
			self.max_pool = nn.MaxPool2d(kernel_size=stride, stride=stride)
			# Padding variants tried by the author, with the reported compiler result:
			#   padding_mode="constant",  fill=0 -> [UNILOG][INFO] Total device subgraph number 10, DPU subgraph number 4
			#   padding_mode="symmetric", fill=0 -> [UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
			# NOTE(review): symmetric padding differs numerically from zero padding;
			# confirm it matches how the pre-trained weights were produced.
			self.pad = torchvision.transforms.Pad(padding=(0, 0, 2, 2), padding_mode="symmetric")
		depthwise_padding = 0 if downsample else (depthwise_kernel - 1) // 2

		depthwise = nn.Conv2d(in_channels, in_channels,
						kernel_size=depthwise_kernel, stride=stride,
						padding=depthwise_padding, groups=in_channels, bias=True)
		pointwise = nn.Conv2d(in_channels, out_channels,
						kernel_size=1, stride=1, padding=0, bias=True)
		self.convs = nn.Sequential(depthwise, pointwise)

		self.act = nn.ReLU(inplace=True)

	def forward(self, x):
		"""Apply the residual unit to ``x`` and return the activated sum."""
		if self.stride == 2:
			# Functional equivalents tried by the author:
			#   F.pad(x, (0, 2, 0, 2), "constant", 0) -> Total device subgraph number 10, DPU subgraph number 4
			#   F.pad(x, (0, 2, 0, 2), "reflect", 0)  -> Total device subgraph number 15, DPU subgraph number 5
			main_input = self.pad(x)
			shortcut = self.max_pool(x)
		else:
			main_input = shortcut = x

		if self.channel_pad > 0:
			# Runs whenever the module widens the channel count
			# (e.g. 32 -> 64): append zero channels to the shortcut.
			shortcut = F.pad(shortcut, (0, 0, 0, 0, 0, self.channel_pad), "constant", 0)

		return self.act(self.convs(main_input) + shortcut)


class ResBlock(nn.Module):
	"""Chain of seven stride-1 ``ResModule`` units that keep the channel count.

	Arguments:
		in_channels: channel count of both the input and the output.
	"""

	def __init__(self, in_channels):
		super().__init__()
		# Seven identical residual units applied back to back.
		self.f = nn.Sequential(
			*(ResModule(in_channels, in_channels) for _ in range(7))
		)

	def forward(self, x):
		"""Pass ``x`` through the seven residual units in order."""
		return self.f(x)


# From https://github.com/google/mediapipe/blob/master/mediapipe/models/palm_detection.tflite
class PalmDetector(nn.Module):
	def __init__(self):
		"""Create the backbone stages, the upsampling path and the output heads."""
		super().__init__()

		def pointwise_head(c_in, c_out):
			# 1x1 convolution used by every classification / regression head.
			return nn.Conv2d(in_channels=c_in, out_channels=c_out,
							kernel_size=1, stride=1, padding=0, bias=True)

		# Stem (pad + strided 3x3 conv) followed by the 32-, 64- and
		# 128-channel stages; each stride-2 ResModule halves the resolution.
		self.backbone1 = nn.Sequential(
			nn.ConstantPad2d((0, 1, 0, 1), value=0.0),
			nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=0, bias=True),
			nn.ReLU(inplace=True),

			ResBlock(32),
			ResModule(32, 64, stride=2),
			ResBlock(64),
			ResModule(64, 128, stride=2),
			ResBlock(128)
		)

		# Two further 256-channel stages, each at half the previous resolution.
		self.backbone2 = nn.Sequential(
			ResModule(128, 256, stride=2),
			ResBlock(256)
		)
		self.backbone3 = nn.Sequential(
			ResModule(256, 256, stride=2),
			ResBlock(256)
		)

		# Upsampling path: transposed conv doubles the resolution, the result is
		# summed with the matching backbone feature map and refined by a ResModule.
		self.upscale8to16 = nn.Sequential(
			nn.ConvTranspose2d(256, 256, kernel_size=2, stride=2, padding=0, bias=True),
			nn.ReLU(inplace=True)
		)
		self.scaled16add = ResModule(256, 256)

		self.upscale16to32 = nn.Sequential(
			nn.ConvTranspose2d(256, 128, kernel_size=2, stride=2, padding=0, bias=True),
			nn.ReLU(inplace=True),
		)
		self.scaled32add = ResModule(128, 128)

		# Score heads (one value per anchor) for the three feature-map sizes.
		self.class_32 = pointwise_head(128, 2)
		self.class_16 = pointwise_head(256, 2)
		self.class_8 = pointwise_head(256, 6)

		# Regression heads (18 values per anchor) for the three feature-map sizes.
		self.reg_32 = pointwise_head(128, 36)
		self.reg_16 = pointwise_head(256, 36)
		self.reg_8 = pointwise_head(256, 108)


	def forward(self, x):
		b1 = self.backbone1(x) # 32x32
		# print(b1.size())

		b2 = self.backbone2(b1) # 16x16
		# print(b2.size())

		b3 = self.backbone3(b2) # 8x8
		# print(b3.size())

		b2 = self.upscale8to16(b3) + b2 # 16x16
		b2 = self.scaled16add(b2) # 16x16
		# print(b2.size())

		b1 = self.upscale16to32(b2) + b1 # 32x32
		b1 = self.scaled32add(b1)
		# print(b1.size())

		c8 = self.class_8(b3).permute(0, 2, 3, 1).reshape(-1, 384, 1)
		c16 = self.class_16(b2).permute(0, 2, 3, 1).reshape(-1, 512, 1)
		c32 = self.class_32(b1).permute(0, 2, 3, 1).reshape(-1, 2048, 1)

		r8 = self.reg_8(b3).permute(0, 2, 3, 1).reshape(-1, 384, 18)
		r16 = self.reg_16(b2).permute(0, 2, 3, 1).reshape(-1, 512, 18)
		r32 = self.reg_32(b1).permute(0, 2, 3, 1).reshape(-1, 2048, 18)

		c = torch.cat([c32, c16, c8], dim=1)
		r = torch.cat([r32, r16, r8], dim=1) # needs to be anchored

		return c, r

	def load_weights(self, path):
	    self.load_state_dict(torch.load(path))
	    self.eval()        

	def load_anchors(self, path):
	    self.anchors = torch.tensor(np.load(path), dtype=torch.float32)
	    assert(self.anchors.ndimension() == 2)
	    assert(self.anchors.shape[0] == 2944)
	    assert(self.anchors.shape[1] == 4)

	def _preprocess(self, x):
	    """Converts the image pixels to the range [-1, 1]."""
	    return x.float() / 127.5 - 1.0

	def _tensors_to_detections(self, raw_box_tensor, raw_score_tensor, anchors):
	    detection_boxes = self._decode_boxes(raw_box_tensor, anchors)
	    
	    thresh = 100
	    raw_score_tensor = raw_score_tensor.clamp(-thresh, thresh)
	    detection_scores = raw_score_tensor.sigmoid().squeeze(dim=-1)
	    
	    # Note: we stripped off the last dimension from the scores tensor
	    # because there is only has one class. Now we can simply use a mask
	    # to filter out the boxes with too low confidence.
	    mask = detection_scores >= 0.7

	    # Because each image from the batch can have a different number of
	    # detections, process them one at a time using a loop.
	    output_detections = []
	    for i in range(raw_box_tensor.shape[0]):
	        boxes = detection_boxes[i, mask[i]]
	        scores = detection_scores[i, mask[i]].unsqueeze(dim=-1)
	        output_detections.append(torch.cat((boxes, scores), dim=-1))

	    return output_detections

	def predict_on_image(self, img):
	    """Makes a prediction on a single image.
	    Arguments:
	        img: a NumPy array of shape (H, W, 3) or a PyTorch tensor of
	             shape (3, H, W). The image's height and width should be 
	             128 pixels.
	    Returns:
	        A tensor with face detections.
	    """
	    if isinstance(img, np.ndarray):
	        img = torch.from_numpy(img).permute((2, 0, 1))

	    return self.predict_on_batch(img.unsqueeze(0))

	def predict_on_batch(self, x):
	    """Makes a prediction on a batch of images.
	    Arguments:
	        x: a NumPy array of shape (b, H, W, 3) or a PyTorch tensor of
	           shape (b, 3, H, W). The height and width should be 128 pixels.
	    Returns:
	        A list containing a tensor of face detections for each image in 
	        the batch. If no faces are found for an image, returns a tensor
	        of shape (0, 17).
	    Each face detection is a PyTorch tensor consisting of 17 numbers:
	        - ymin, xmin, ymax, xmax
	        - x,y-coordinates for the 6 keypoints
	        - confidence score
	    """
	    if isinstance(x, np.ndarray):
	        x = torch.from_numpy(x).permute((0, 3, 1, 2))

	    assert x.shape[1] == 3
	    assert x.shape[2] == 256
	    assert x.shape[3] == 256

	    # 1. Preprocess the images into tensors:
	    # x = x.to(self._device())
	    x = self._preprocess(x)

	    # 2. Run the neural network:
	    with torch.no_grad():
	        out = self.__call__(x)

	    # 3. Postprocess the raw predictions:
	    detections = self._tensors_to_detections(out[1], out[0], self.anchors)

	    # 4. Non-maximum suppression to remove overlapping detections:
	    filtered_detections = []
	    for i in range(len(detections)):
	        faces = self._weighted_non_max_suppression(detections[i])
	        if len(faces) > 0:
		        faces = torch.stack(faces)
		        filtered_detections.append(faces)

	    return filtered_detections

	def _decode_boxes(self, raw_boxes, anchors):
	    """Converts the predictions into actual coordinates using
	    the anchor boxes. Processes the entire batch at once.
	    """
	    boxes = torch.zeros_like(raw_boxes)

	    x_center = raw_boxes[..., 0] / 256 * anchors[:, 2] + anchors[:, 0]
	    y_center = raw_boxes[..., 1] / 256 * anchors[:, 3] + anchors[:, 1]

	    w = raw_boxes[..., 2] / 256 * anchors[:, 2] * 2.6
	    h = raw_boxes[..., 3] / 256 * anchors[:, 3] * 2.6

	    y_center = y_center - h / 5.2

	    boxes[..., 0] = x_center - w / 2.  # ymin
	    boxes[..., 1] = y_center - h / 2.  # xmin
	    boxes[..., 2] = x_center + w / 2.  # ymax
	    boxes[..., 3] = y_center + h / 2.  # xmax

	    for k in range(7):
	        offset = 4 + k*2
	        keypoint_x = raw_boxes[..., offset    ] / 256 * anchors[:, 2] + anchors[:, 0]
	        keypoint_y = raw_boxes[..., offset + 1] / 256 * anchors[:, 3] + anchors[:, 1]
	        boxes[..., offset    ] = keypoint_x
	        boxes[..., offset + 1] = keypoint_y

	    return boxes

	def _weighted_non_max_suppression(self, detections):
	    """The alternative NMS method as mentioned in the BlazeFace paper:
	    "We replace the suppression algorithm with a blending strategy that
	    estimates the regression parameters of a bounding box as a weighted
	    mean between the overlapping predictions."
	    The original MediaPipe code assigns the score of the most confident
	    detection to the weighted detection, but we take the average score
	    of the overlapping detections.
	    The input detections should be a Tensor of shape (count, 17).
	    Returns a list of PyTorch tensors, one for each detected face.
	    
	    This is based on the source code from:
	    mediapipe/calculators/util/non_max_suppression_calculator.cc
	    mediapipe/calculators/util/non_max_suppression_calculator.proto
	    """
	    if len(detections) == 0: return []

	    output_detections = []

	    # Sort the detections from highest to lowest score.
	    remaining = torch.argsort(detections[:, 18], descending=True)

	    while len(remaining) > 0:
	        detection = detections[remaining[0]]

	        # Compute the overlap between the first box and the other 
	        # remaining boxes. (Note that the other_boxes also include
	        # the first_box.)
	        first_box = detection[:4]
	        other_boxes = detections[remaining, :4]
	        ious = overlap_similarity(first_box, other_boxes)

	        # If two detections don't overlap enough, they are considered
	        # to be from different faces.
	        mask = ious >= 0.3
	        overlapping = remaining[mask]
	        remaining = remaining[~mask]

	        # Take an average of the coordinates from the overlapping
	        # detections, weighted by their confidence scores.
	        weighted_detection = detection.clone()
	        if len(overlapping) > 1:
	            coordinates = detections[overlapping, :18]
	            scores = detections[overlapping, 18:19]
	            total_score = scores.sum()
	            weighted = (coordinates * scores).sum(dim=0) / total_score
	            weighted_detection[:18] = weighted
	            weighted_detection[18] = total_score / len(overlapping)

	        output_detections.append(weighted_detection)

	    return output_detections



In [4]:
# Build the detector, then load the pre-trained weights and the anchor table.
# Both paths are relative to the notebook's working directory.
model = PalmDetector()
model.load_weights("../BlazePalm/ML/palmdetector.pth")
model.load_anchors('../BlazePalm/ML/anchors.npy')
# load_weights() already switches to eval mode; as the last expression of the
# cell this call also displays the module tree.
model.eval()


PalmDetector(
  (backbone1): Sequential(
    (0): ConstantPad2d(padding=(0, 1, 0, 1), value=0.0)
    (1): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2))
    (2): ReLU(inplace=True)
    (3): ResBlock(
      (f): Sequential(
        (0): ResModule(
          (convs): Sequential(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
            (1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          )
          (act): ReLU(inplace=True)
        )
        (1): ResModule(
          (convs): Sequential(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
            (1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          )
          (act): ReLU(inplace=True)
        )
        (2): ResModule(
          (convs): Sequential(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32)
            (1): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1))
          )

## 2. Inspection
In order to determine if the model is supported by Vitis-AI, we start with the Vitis-AI Inspector.

In [5]:
from pytorch_nndct.apis import Inspector


[0;32m[VAIQ_NOTE]: Loading NNDCT kernels...[0m


No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'


In [6]:
# Check the float model against the target DPU architecture.
inspector = Inspector("DPUCZDX8G_ISA1_B4096")
# NOTE(review): the inspector log reports that a first dimension > 1 is reset
# to 1 for pattern matching, so a batch of 1 would presumably be enough here;
# the value is kept in case later cells reuse `batchsize` / `rand_in`.
batchsize = 100
rand_in = torch.randn([batchsize, 3,256,256])
# Pass the dummy input as a real one-element tuple: "(rand_in)" is only the
# bare tensor wrapped in parentheses.
inspector.inspect(model, (rand_in,), device=device)


[0;32m[VAIQ_NOTE]: Inspector is on.[0m

[0;32m[VAIQ_NOTE]: =>Start to inspect model...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing PalmDetector...[0m

[0;32m[VAIQ_NOTE]: Start to trace and freeze model...[0m

[0;32m[VAIQ_NOTE]: The input model nndct_st_PalmDetector_ed is torch.nn.Module.[0m

[0;32m[VAIQ_NOTE]: Finish tracing.[0m

[0;32m[VAIQ_NOTE]: Processing ops...[0m


██████████████████████████████████████████████████| 218/218 [00:00<00:00, 1041.56it/s, OpInfo: name = return_0, type = Return]                             


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_FLOAT_OP]: The quantizer recognize new op `nndct_index` as a float operator by default.[0m






[0;32m[VAIQ_NOTE]: =>Doing weights equalization...[0m

[0;32m[VAIQ_NOTE]: =>Quantizable module is generated.(quantize_result/PalmDetector.py)[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone2]/ResModule[0]/ret.155) can't be fused[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[6]/ret.103) can't be fused[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone1]/ConstantPad2d[0]/ret.3) can't be fused[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[4]/ret.51) can't be fused[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_18:
node name:PalmDetector::PalmDetector/Sequential[upscale8to16]/ConvTranspose2d[0]/ret.255, op type:nndct_conv_transpose_2d, output shape: [100, 16, 16, 256]
node name:PalmDetector::PalmDetector/Sequential[upscale8to16]/ReLU[1]/18945, op type:nndct_relu, output shape: [100, 16, 16, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPE


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[6]/Sequential[convs]/Conv2d[1]/ret.107's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[6]/ret.103's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[6]/ret.47, op type:nndct_elemwise_add, output shape: [100, 128, 128, 32]
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[6]/ReLU[act]/17175, op type:nndct_relu, output shape: [100, 128, 128, 32]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of


[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node name:PalmDetector::PalmDetector/Sequential[backbone3]/ResModule[0]/ret.211, op type:nndct_elemwise_add, output shape: [100, 8, 8, 256]
node name:PalmDetector::PalmDetector/Sequential[backbone3]/ResModule[0]/ReLU[act]/18616, op type:nndct_relu, output shape: [100, 8, 8, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone3]/ResModule[0]/Sequential[convs]/Conv2d[1]/ret.209's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone3]/ResModule[0]/MaxPool2d[max_pool]/18571's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/Res

I20240105 09:03:09.011950  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.011976  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.011983  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:09.012048  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_conv_transpose_2d_nndct_relu_vo0NQbdsu5MYzayr, with op num: 9
I20240105 09:03:09.012053  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:09.025827  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
I20240105 09:03:09.025861  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:09.031064  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.031086  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.031090  


[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[2]/ret.23, op type:nndct_elemwise_add, output shape: [100, 128, 128, 32]
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[2]/ReLU[act]/16999, op type:nndct_relu, output shape: [100, 128, 128, 32]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[2]/Sequential[convs]/Conv2d[1]/ret.21's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[1]/ReLU[act]/16955's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[0]/Sequential[convs]/Conv2d[1]/ret.61's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[4]/ReLU[act]/17307's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[3]/ret.29, op type:nndct_elemwise_add, output shape: [100, 128, 128, 32]
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[3]/ReLU[act]/17043, op type:nndct_relu, output shape: [100, 128, 128, 32]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR


[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[0]/ret.11, op type:nndct_elemwise_add, output shape: [100, 128, 128, 32]
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[0]/ReLU[act]/16911, op type:nndct_relu, output shape: [100, 128, 128, 32]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[0]/Sequential[convs]/Conv2d[1]/ret.9's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ReLU[2]/16867's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for eltwise_fix_23:
node n

I20240105 09:03:09.226804  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.226809  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:09.226866  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_elemwise_add_nndct_relu_O58Xn6W0kAGQlS1x, with op num: 7
I20240105 09:03:09.226871  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:09.230829  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 4, DPU subgraph number 1
I20240105 09:03:09.230886  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:09.236680  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.236701  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.236704  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I2024010


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::input_0's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for reshape_fix_1:
node name:PalmDetector::PalmDetector/ret.303, op type:nndct_reshape, output shape: [100, 512, 18]

[0m

[0;32m[VAIQ_NOTE]: Find subgraph for reshape_fix_1:
node name:PalmDetector::PalmDetector/ret.285, op type:nndct_reshape, output shape: [100, 512, 1]

[0m

[0;32m[VAIQ_NOTE]: Find subgraph for reshape_fix_1:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[6]/Pad[pad]/17666_sink_transpose_1, op type:nndct_permute, output shape: [100, 64, 64, 64]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[6]/Pad[pad]/17666's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[V


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[2]/Sequential[convs]/Conv2d[1]/ret.73, op type:nndct_conv2d, output shape: [100, 64, 64, 64]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[2]/Sequential[convs]/Conv2d[0]/ret.71's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone3]/ResBlock[1]/Sequential[f]/ResModule[6]/Sequential[convs]/Conv2d[1]/ret.251, op type:nndct_conv2d, output shape: [100, 8, 8, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone3]/ResBlock[1]/Sequential[f]/ResModule[6]/Sequential[con

I20240105 09:03:09.436836  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.436857  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.436861  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:09.436930  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_pad_nd_AQ72Frv8ytn9DqIh, with op num: 4
I20240105 09:03:09.436934  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:09.438992  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 2, DPU subgraph number 0
I20240105 09:03:09.439009  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:09.446224  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.446239  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.446244  1532 compile_pass_mana


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[4]/Sequential[convs]/Conv2d[1]/ret.33, op type:nndct_conv2d, output shape: [100, 128, 128, 32]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[4]/Sequential[convs]/Conv2d[0]/ret.31's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Conv2d[class_32]/ret.287, op type:nndct_conv2d, output shape: [100, 32, 32, 2]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/ResModule[scaled32add]/ReLU[act]/19060's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone2]/ResModule[0]/Sequential[convs]/Conv2d[1]/ret.159, op type:nndct_conv2d, output shape: [100, 16, 16, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone2]/ResModule[0]/Sequential[convs]/Conv2d[0]/ret.157's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[7]/Sequential[f]/ResModule[6]/Sequential[convs]/Conv2d[1]/ret.149, op type:nndct_conv2d, output shape: [100, 32, 32, 128]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[7]/Sequential[f]/ResModule[6]/Sequential[convs]/Conv2d[0]/ret.147's  shape is 100 > 1 which

I20240105 09:03:09.640933  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.640954  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.640959  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:09.641024  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_conv2d_KgqY7GEOSe3b9ZCs, with op num: 8
I20240105 09:03:09.641028  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:09.647094  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
I20240105 09:03:09.647121  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:09.651388  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.651407  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.651409  1532 compile_pass_mana


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone2]/ResBlock[1]/Sequential[f]/ResModule[0]/Sequential[convs]/Conv2d[1]/ret.165, op type:nndct_conv2d, output shape: [100, 16, 16, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone2]/ResBlock[1]/Sequential[f]/ResModule[0]/Sequential[convs]/Conv2d[0]/ret.163's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[4]/Sequential[convs]/Conv2d[1]/ret.85, op type:nndct_conv2d, output shape: [100, 64, 64, 64]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[4]/Sequential[


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[3]/Sequential[f]/ResModule[5]/Sequential[convs]/Conv2d[0]/ret.37's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Conv2d[class_8]/ret.275, op type:nndct_conv2d, output shape: [100, 8, 8, 6]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone3]/ResBlock[1]/Sequential[f]/ResModule[6]/ReLU[act]/18924's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_20:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[5]/Sequential[convs]/Conv2d[1]/ret.91, op type:nndct_conv2d, output shape: [100, 64, 64, 64

I20240105 09:03:09.844753  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.844758  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:09.844841  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_conv2d_1JprxsRaKXM5SoCE, with op num: 8
I20240105 09:03:09.844847  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:09.849435  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
I20240105 09:03:09.849463  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:09.854197  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:09.854214  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:09.854218  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:09.854279


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone3]/ResBlock[1]/Sequential[f]/ResModule[5]/Sequential[convs]/Conv2d[0]/ret.243, op type:nndct_depthwise_conv2d, output shape: [100, 8, 8, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone3]/ResBlock[1]/Sequential[f]/ResModule[4]/ReLU[act]/18836's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/ResModule[scaled16add]/Sequential[convs]/Conv2d[0]/ret.259, op type:nndct_depthwise_conv2d, output shape: [100, 16, 16, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/ret.257's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: 


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[0]/Sequential[convs]/Conv2d[0]/ret.59, op type:nndct_depthwise_conv2d, output shape: [100, 64, 64, 64]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[4]/ReLU[act]/17307's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone2]/ResBlock[1]/Sequential[f]/ResModule[1]/Sequential[convs]/Conv2d[0]/ret.169, op type:nndct_depthwise_conv2d, output shape: [100, 16, 16, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone2]/ResBlock[1]/Sequential[f]/ResModule[0]/ReLU[act]/18231's  shape is 100 > 1 w

I20240105 09:03:10.049814  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:10.049819  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:10.049882  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_conv2d_Mkdexmo03Q7BU2bK, with op num: 8
I20240105 09:03:10.049886  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:10.054078  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
I20240105 09:03:10.054102  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:10.065196  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:10.065223  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:10.065228  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:10.065313


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[7]/Sequential[f]/ResModule[3]/ReLU[act]/17923's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone2]/ResBlock[1]/Sequential[f]/ResModule[0]/Sequential[convs]/Conv2d[0]/ret.163, op type:nndct_depthwise_conv2d, output shape: [100, 16, 16, 256]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone2]/ResModule[0]/ReLU[act]/18187's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone3]/ResBlock[1]/Sequential[f]/ResModule[4]/Sequential[convs]/Conv2d[0]/ret.237, op type:n


[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[4]/Sequential[convs]/Conv2d[0]/ret.53, op type:nndct_depthwise_conv2d, output shape: [100, 64, 64, 32]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[4]/Pad[pad]/ret.49_swim_transpose_0's  shape is 100 > 1 which will be set to 1 temporarily for pattern matching.[0m

[0;32m[VAIQ_NOTE]: Find subgraph for convlike_fix_21:
node name:PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[4]/Sequential[convs]/Conv2d[0]/ret.83, op type:nndct_depthwise_conv2d, output shape: [100, 64, 64, 64]

[0m

[0;33m[VAIQ_WARN][QUANTIZER_TORCH_INSPECTOR_PATTERN]: The First dimension of pattern data node PalmDetector::PalmDetector/Sequential[backbone1]/ResBlock[5]/Sequential[f]/ResModule[3]/ReLU[act]/17483's  shape is 100 > 1 which will b

I20240105 09:03:10.259683  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:10.259686  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03:10.259743  1532 compile_pass_manager.cpp:465] [UNILOG][INFO] Graph name: nndct_depthwise_conv2d_STZauU1QOnWbXtLi, with op num: 8
I20240105 09:03:10.259747  1532 compile_pass_manager.cpp:478] [UNILOG][INFO] Begin to compile...
I20240105 09:03:10.264166  1532 compile_pass_manager.cpp:489] [UNILOG][INFO] Total device subgraph number 3, DPU subgraph number 1
I20240105 09:03:10.264201  1532 compile_pass_manager.cpp:504] [UNILOG][INFO] Compile done.
I20240105 09:03:10.269677  1532 compile_pass_manager.cpp:352] [UNILOG][INFO] Compile mode: dpu
I20240105 09:03:10.269698  1532 compile_pass_manager.cpp:353] [UNILOG][INFO] Debug mode: null
I20240105 09:03:10.269702  1532 compile_pass_manager.cpp:357] [UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
I20240105 09:03


[0;32m[VAIQ_NOTE]: =>Finish inspecting.[0m


## 3. Quantization
In order to compile the trained model for deployment on a DPU platform, we must first quantize it. Here we will use the `torch_quantizer` API from the `pytorch_nndct` package (vai_q_pytorch) to convert the floating-point model into an INT8 quantized representation.

***Quantize - Calibration Phase***

**Quantize model**

By default `torch_quantizer` converts the weights, activations and inputs into 8-bit wide numbers. Different bit widths and configurations can be specified with the `bitwidth` argument or with a quantization configuration file (`quant_config_file`).

In [7]:
from pytorch_nndct.apis import torch_quantizer, dump_xmodel

In [8]:
# Settings for the calibration phase, read by the quantizer cells that follow.
quant_model = './quant_model'  # handed to torch_quantizer as output_dir
quant_mode = 'calib'           # quantizer mode used for this phase
batchsize = 100                # leading dimension of the random input tensors

In [9]:
# Flag carried over from the reference script ("force to merge BN with CONV
# for better quantization accuracy").
# NOTE(review): `optimize` is not passed to torch_quantizer in this cell.
optimize = 1

# Test mode always runs with a single-sample batch.
batchsize = 1 if quant_mode == 'test' else batchsize

# Random tensor used as the example input (batch, 3, 256, 256) for the quantizer.
rand_in = torch.randn(batchsize, 3, 256, 256)
quantizer = torch_quantizer(
    quant_mode,
    model,
    rand_in,
    output_dir=quant_model,
    device=device,
)

# Quantization-aware version of the model, used for the forward pass below.
quantized_model = quantizer.quant_model


[0;32m[VAIQ_NOTE]: OS and CPU information:
               system --- Linux
                 node --- albertabeef-HP-Z4-G4-Workstation
              release --- 5.14.0-1055-oem
              version --- #62-Ubuntu SMP Wed Nov 30 04:54:03 UTC 2022
              machine --- x86_64
            processor --- x86_64[0m

[0;32m[VAIQ_NOTE]: Tools version information:
                  GCC --- GCC 7.5.0
               python --- 3.8.6
              pytorch --- 1.13.1
        vai_q_pytorch --- 3.5.0+60df3f1+torch1.13.1[0m

[0;32m[VAIQ_NOTE]: Quant config file is empty, use default quant configuration[0m

[0;32m[VAIQ_NOTE]: Quantization calibration process start up...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing PalmDetector...[0m

[0;32m[VAIQ_NOTE]: Start to trace and freeze model...[0m

[0;32m[VAIQ_NOTE]: The input model nndct_st_PalmDetector_ed is torch.nn.Module.[0m

[0;32m[VAIQ_NOTE]: Finish tracing.[0m

[0;32m[VAIQ_NOTE]: Processing

██████████████████████████████████████████████████| 218/218 [00:00<00:00, 1022.24it/s, OpInfo: name = return_0, type = Return]                             


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_FLOAT_OP]: The quantizer recognize new op `nndct_index` as a float operator by default.[0m






[0;32m[VAIQ_NOTE]: =>Doing weights equalization...[0m

[0;32m[VAIQ_NOTE]: =>Quantizable module is generated.(./quant_model/PalmDetector.py)[0m

[0;32m[VAIQ_NOTE]: =>Get module with quantization.[0m


**Evaluate quantized model**

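In calibration mode, the quantized model must be run forward at least once before the quantization configuration can be exported. This is normally done by running an evaluation loop over a representative dataset, which also reports metrics such as loss and accuracy; in this notebook a single batch of random input is used instead. Since we are using 8-bit quantization we do not expect to lose much accuracy, if any.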

In [10]:
# reference : https://github.com/Xilinx/Vitis-AI/blob/master/src/vai_quantizer/vai_q_pytorch/example/resnet18_quant.py
# Run one forward pass through the quantized model in calibration mode.
# NOTE(review): the batch is random noise, not real images; a calibration set of
# representative palm images would normally be expected here - confirm.
quantized_model.eval()
quantized_model = quantized_model.to(device)

# The input must live on the same device as the model; a CPU tensor fed to a
# CUDA model fails as soon as a CUDA device is selected.
inputs = torch.randn([batchsize, 3, 256, 256]).to(device)
outputs = quantized_model(inputs)

  bnfp[1] = stats.mode(data)[0][0]


In [11]:
# Export the quantization config, only when running in calibration mode.
# The captured log reports the result at ./quant_model/quant_info.json.
if quant_mode == 'calib':
    quantizer.export_quant_config()



[0;32m[VAIQ_NOTE]: =>Exporting quant config.(./quant_model/quant_info.json)[0m


***Quantize - Test Phase***

**Save quantized model**

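Once we are happy with the performance of the quantized model, we re-create the quantizer in `test` mode with a batch size of 1, run one forward pass, and export the quantized model as TorchScript (`.pt`), ONNX (`.onnx`) and XIR (`.xmodel`) files using the quantizer's `export_*` methods.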

In [12]:
# Settings for the test (export) phase, read by the quantizer cells that follow.
quant_model = './quant_model'  # handed to torch_quantizer as output_dir
quant_mode = 'test'            # quantizer mode used for this phase
batchsize = 1                  # leading dimension of the random input tensors

In [13]:
# Re-create the quantizer with the test-phase settings (quant_mode, batchsize).
# Random tensor used as the example input (batch, 3, 256, 256) for the quantizer.
rand_in = torch.randn(batchsize, 3, 256, 256)
quantizer = torch_quantizer(
    quant_mode,
    model,
    rand_in,
    output_dir=quant_model,
    device=device,
)

# Quantized model that is run once and then exported in the cells below.
quantized_model = quantizer.quant_model


[0;32m[VAIQ_NOTE]: OS and CPU information:
               system --- Linux
                 node --- albertabeef-HP-Z4-G4-Workstation
              release --- 5.14.0-1055-oem
              version --- #62-Ubuntu SMP Wed Nov 30 04:54:03 UTC 2022
              machine --- x86_64
            processor --- x86_64[0m

[0;32m[VAIQ_NOTE]: Tools version information:
                  GCC --- GCC 7.5.0
               python --- 3.8.6
              pytorch --- 1.13.1
        vai_q_pytorch --- 3.5.0+60df3f1+torch1.13.1[0m

[0;32m[VAIQ_NOTE]: Quant config file is empty, use default quant configuration[0m

[0;32m[VAIQ_NOTE]: Quantization test process start up...[0m

[0;32m[VAIQ_NOTE]: =>Quant Module is in 'cpu'.[0m

[0;32m[VAIQ_NOTE]: =>Parsing PalmDetector...[0m

[0;32m[VAIQ_NOTE]: Start to trace and freeze model...[0m

[0;32m[VAIQ_NOTE]: The input model nndct_st_PalmDetector_ed is torch.nn.Module.[0m

[0;32m[VAIQ_NOTE]: Finish tracing.[0m

[0;32m[VAIQ_NOTE]: Processing ops...

██████████████████████████████████████████████████| 218/218 [00:00<00:00, 1092.79it/s, OpInfo: name = return_0, type = Return]                             


[0;33m[VAIQ_WARN][QUANTIZER_TORCH_FLOAT_OP]: The quantizer recognize new op `nndct_index` as a float operator by default.[0m






[0;32m[VAIQ_NOTE]: =>Doing weights equalization...[0m

[0;32m[VAIQ_NOTE]: =>Quantizable module is generated.(./quant_model/PalmDetector.py)[0m

[0;32m[VAIQ_NOTE]: =>Get module with quantization.[0m


In [14]:
# reference : https://github.com/Xilinx/Vitis-AI/blob/master/src/vai_quantizer/vai_q_pytorch/example/resnet18_quant.py
# Run one forward pass through the test-mode quantized model before the
# export cells below are executed.
quantized_model.eval()
quantized_model = quantized_model.to(device)

# The input must live on the same device as the model; a CPU tensor fed to a
# CUDA model fails as soon as a CUDA device is selected.
inputs = torch.randn([batchsize, 3, 256, 256]).to(device)
outputs = quantized_model(inputs)

In [15]:
# Export the quantized model as TorchScript; the captured log reports
# quantize_result/PalmDetector_int.pt as the generated file.
quantizer.export_torch_script()

  output = caller(*args, **kwargs)
  if not (list(self.node.out_tensors[0].shape[1:]) == list(input.size())[1:]):
  if inf.sum() > 0 or nan.sum() > 0:
  if (isinstance(tensor, torch.Tensor) and



[0;32m[VAIQ_NOTE]: PalmDetector_int.pt is generated.(quantize_result/PalmDetector_int.pt)[0m


PalmDetector(
  original_name=PalmDetector
  (module_0): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_1): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_2): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_3): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_4): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_5): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_6): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_7): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_8): deephi_Input(original_name=deephi_Input)
  (module_9): DeephiFuncModule(original_name=DeephiFuncModule)
  (module_10): deephi_Conv2d(original_name=deephi_Conv2d)
  (module_11): deephi_ReLU(original_name=deephi_ReLU)
  (module_12): deephi_Conv2d(original_name=deephi_Conv2d)
  (module_13): deephi_Conv2d(original_name=deephi_Conv2d)
  (module_14): deephi_Add(original_name=deephi_Add)
  (module_15): deephi_ReLU(original_name=deephi_ReLU)
 

In [16]:
# Export the quantized model as ONNX; the captured log reports
# quantize_result/PalmDetector_int.onnx as the generated file.
quantizer.export_onnx_model()

  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)



[0;32m[VAIQ_NOTE]: PalmDetector_int.onnx is generated.(quantize_result/PalmDetector_int.onnx)[0m


  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


In [17]:
# Export the quantized model as an xmodel; the captured log reports
# quantize_result/PalmDetector_int.xmodel, which is the file passed to
# vai_c_xir (-x) in the compilation cells. deploy_check is left disabled.
quantizer.export_xmodel(deploy_check=False)


[0;32m[VAIQ_NOTE]: =>Converting to xmodel ...[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone2]/ResModule[0]/ret.155) can't be fused[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[6]/ret.103) can't be fused[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone1]/ConstantPad2d[0]/ret.3) can't be fused[0m

[0;33m[VAIQ_WARN]: Pad(PalmDetector::PalmDetector/Sequential[backbone1]/ResModule[4]/ret.51) can't be fused[0m





[0;32m[VAIQ_NOTE]: =>Successfully convert 'PalmDetector' to xmodel.(quantize_result/PalmDetector_int.xmodel)[0m


## 4. Compilation

For this final step we use the Vitis AI compiler `vai_c_xir` and pass the quantized xmodel as a parameter. In this example we compile the DPU model for several DPU architectures (including the ZCU104 and KV260 boards); to target a different board you just have to point the compiler to the right `arch.json` file.

For example, for the ZCU104 you would pass

`--arch /opt/vitis_ai/compiler/arch/DPUCZDX8G/ZCU104/arch.json`

and for Ultra96, we can pass the custom arch.json in this repository

`--arch ./arch.json`

In [18]:
# vai_c_xir -x /PATH/TO/quantized.xmodel -a /PATH/TO/arch.json -o /OUTPUTPATH -n netname

In [19]:
!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/B4096/arch-zcu104.json \
    -o ./model_blazepalm/B4096 \
    -n palm_detector

!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/B3136/arch-kv260.json \
    -o ./model_blazepalm/B3136 \
    -n palm_detector

!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/B2304/arch-b2304-lr.json \
    -o ./model_blazepalm/B2304 \
    -n palm_detector

!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/B1152/arch-b1152-hr.json \
    -o ./model_blazepalm/B1152 \
    -n palm_detector

!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/B512/arch-b512-lr.json \
    -o ./model_blazepalm/B512 \
    -n palm_detector

!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/B128/arch-b128-lr.json \
    -o ./model_blazepalm/B128 \
    -n palm_detector



**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[m[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: null
[UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B4096
[UNILOG][INFO] Graph name: PalmDetector, with op num: 750
[UNILOG][INFO] Begin to compile...
[m[UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B4096/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B4096/palm_detector.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is c73a012007ce06b308c9b74b1ed4de4f, and has been saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B4096/md5sum.txt"
**************************************************
* VITIS_AI Compilation - Xilinx Inc.
*************************

[m[UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B2304/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B2304/palm_detector.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is 5b21b56fd829dd4576827baa14368957, and has been saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B2304/md5sum.txt"
**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[m[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: null
[UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B1152_0101000017010203
[UNILOG][INFO] Graph name: PalmDetector, with op num: 750
[UNILOG][INFO] Begin to compile...
[m[UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
[UNILOG][INFO] Compile 

[m[UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B512/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B512/palm_detector.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is ac14753e02d594e1ba65d1798e7bc5ce, and has been saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/B512/md5sum.txt"
**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[m[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: null
[UNILOG][INFO] Target architecture: DPUCZDX8G_ISA1_B128_0101000002010208
[UNILOG][INFO] Graph name: PalmDetector, with op num: 750
[UNILOG][INFO] Begin to compile...
[m[UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
[UNILOG][INFO] Compile done

In [20]:
!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/C20B14/arch-c20b14.json \
    -o ./model_blazepalm/C20B14 \
    -n palm_detector

!vai_c_xir \
    -x ./quantize_result/PalmDetector_int.xmodel \
    -a ./arch/C20B1/arch-c20b1.json \
    -o ./model_blazepalm/C20B1 \
    -n palm_detector

**************************************************
* VITIS_AI Compilation - Xilinx Inc.
**************************************************
[m[UNILOG][INFO] Compile mode: dpu
[UNILOG][INFO] Debug mode: null
[UNILOG][INFO] Target architecture: DPUCV2DX8G_ISA1_C20B14
[UNILOG][INFO] Graph name: PalmDetector, with op num: 750
[UNILOG][INFO] Begin to compile...
[m[UNILOG][INFO] Total device subgraph number 15, DPU subgraph number 5
[UNILOG][INFO] Compile done.
[UNILOG][INFO] The meta json is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/C20B14/meta.json"
[UNILOG][INFO] The compiled xmodel is saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/C20B14/palm_detector.xmodel"
[UNILOG][INFO] The compiled xmodel's md5sum is fa23bf1ee19e4fb451462188bc6f0426, and has been saved to "/workspace/blazepalm_tutorial/vitis_ai/./model_blazepalm/C20B14/md5sum.txt"
**************************************************
* VITIS_AI Compilation - Xilinx Inc.
********************

----

Copyright (C) 2024 Avnet, Inc

SPDX-License-Identifier: Apache-2.0

----

----