# About face detect demo
This demo shows how to build a face detection pipeline accelerated by TensorRT with the BMF framework. Once the face detection workflow is encapsulated in a BMF module, a small amount of code is enough to assemble it into a BMF Graph pipeline, which demonstrates how well the BMF framework works with AI deep learning frameworks such as Torch and TensorRT.

# Install

## 1. Install TensorRT
First, we need to install TensorRT's binary and python API

In [None]:
# Download and unpack the TensorRT 8.6.1.6 tarball (Linux x86_64, CUDA 12.0)
# into ./trt, install the Python wheels it ships, then export the paths.
!mkdir -p trt
%cd trt
!wget https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/8.6.1/tars/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-12.0.tar.gz
# These variables are only used to spell out the tarball name for tar below.
%env version=8.6.1.6
%env arch=x86_64
%env cuda=cuda-12.0
!tar -xzvf TensorRT-${version}.Linux.${arch}-gnu.${cuda}.tar.gz
!python3 -m pip install --upgrade pip
# The wheel file names below carry the cp310 tag, i.e. they target CPython 3.10.
%cd TensorRT-8.6.1.6/python
!python3 -m pip install tensorrt-8.6.1-cp310-none-linux_x86_64.whl
!python3 -m pip install tensorrt_lean-8.6.1-cp310-none-linux_x86_64.whl
!python3 -m pip install tensorrt_dispatch-8.6.1-cp310-none-linux_x86_64.whl
# `%cd -` returns to the previous directory (./trt) before entering the next one.
%cd -
%cd TensorRT-8.6.1.6/uff
!python3 -m pip install uff-0.6.9-py2.py3-none-any.whl
%cd -
%cd TensorRT-8.6.1.6/graphsurgeon
!python3 -m pip install graphsurgeon-0.4.6-py2.py3-none-any.whl
%cd -
%cd TensorRT-8.6.1.6/onnx_graphsurgeon
!python3 -m pip install onnx_graphsurgeon-0.3.12-py2.py3-none-any.whl
# Step up into TensorRT-8.6.1.6 and delete the wheel directories installed above.
%cd ..
!rm -rf python uff graphsurgeon onnx_graphsurgeon
%cd /content
# Point the linker, the loader and PATH at the unpacked TensorRT.
# NOTE: these assignments replace the variables outright with hard-coded
# values rooted at /content, and the TensorRT lib directory is listed twice
# in LD_LIBRARY_PATH.
%env LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/content/trt/TensorRT-8.6.1.6/lib
%env LD_LIBRARY_PATH=/content/trt/TensorRT-8.6.1.6/lib:/usr/lib64-nvidia:/content/trt/TensorRT-8.6.1.6/lib
%env PATH=/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/content/trt/TensorRT-8.6.1.6/bin

## Configure environment variables for TRT

In [None]:
# Same three assignments as at the end of the TensorRT install cell, repeated
# here so they can be re-applied without re-running the installation.
# The values are hard-coded for an unpacked tarball under /content/trt.
%env LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/content/trt/TensorRT-8.6.1.6/lib
%env LD_LIBRARY_PATH=/content/trt/TensorRT-8.6.1.6/lib:/usr/lib64-nvidia:/content/trt/TensorRT-8.6.1.6/lib
%env PATH=/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/content/trt/TensorRT-8.6.1.6/bin

## pip install BMF packages

In [None]:
!pip install BabitMF-GPU

## install wurlitzer

This package is installed to show the BMF C++ logs in the colab console, otherwise only python logs are printed. This step is not necessary if you're not in a Colab or iPython notebook environment.

In [None]:
!pip install wurlitzer
%load_ext wurlitzer

# 2. BMF processing pipeline programming
*   Download the Face detect model and use TRT to convert it to .engine
*   Implement BMF Module of face detection.
*  Display the results.

## Download the Face detect model and use TRT to convert it to .engine

In [None]:
# Fetch the model archive from the BMF release page and unpack it.
!wget https://github.com/BabitMF/bmf/releases/download/files/models.tar.gz
!tar -zvxf models.tar.gz
# Copy the ONNX face detection model into the working directory.
!cp models/version-RFB-640.onnx .
# Build a TensorRT engine from the ONNX model and save it as
# version-RFB-640.engine; --buildOnly stops after the build step.
!trtexec --onnx=version-RFB-640.onnx --buildOnly --saveEngine=version-RFB-640.engine

## Implement BMF Module of face detection.

Implement the face detection module. It takes the decoded frames output by the decoder, converts them to the RGB pixel format and feeds them to the detection model, then filters the results and draws the bounding boxes on each output frame.

In [None]:
%%writefile /content/trt_face_detect.py

import tensorrt as trt
import torch
import torch.nn.functional as F
import numpy as np
import sys
import time

if sys.version_info.major == 2:
    from Queue import Queue
else:
    from queue import Queue

import PIL
from PIL import Image

sys.path.append("../../")

from bmf import *
import bmf.hmp as mp

def NMS(bounding_boxes, confidence_score, threshold = 0.5):
    """Greedy non-maximum suppression over axis-aligned boxes.

    Args:
        bounding_boxes: sequence of ``[x1, y1, x2, y2]`` boxes.
        confidence_score: sequence of scores, one per box.
        threshold: a box is dropped when its IoU with an already kept,
            higher-scoring box is not below this value.

    Returns:
        ``(kept_boxes, kept_scores)``, both ordered from highest to lowest
        score and taken from the input sequences unchanged. Two empty
        lists are returned when there are no boxes.
    """
    if len(bounding_boxes) == 0:
        return [], []

    coords = np.array(bounding_boxes)
    left, top, right, bottom = (coords[:, col] for col in range(4))
    scores = np.array(confidence_score)

    # Edges are treated as inclusive, hence the "+ 1" in every extent.
    box_areas = (right - left + 1) * (bottom - top + 1)

    kept_boxes, kept_scores = [], []

    # Ascending by score, so the best remaining candidate is always last.
    remaining = np.argsort(scores)
    while remaining.size > 0:
        best = remaining[-1]
        others = remaining[:-1]

        kept_boxes.append(bounding_boxes[best])
        kept_scores.append(confidence_score[best])

        # Overlap rectangle between the chosen box and every other candidate.
        overlap_w = np.maximum(
            0.0,
            np.minimum(right[best], right[others]) -
            np.maximum(left[best], left[others]) + 1)
        overlap_h = np.maximum(
            0.0,
            np.minimum(bottom[best], bottom[others]) -
            np.maximum(top[best], top[others]) + 1)
        overlap = overlap_w * overlap_h

        iou = overlap / (box_areas[best] + box_areas[others] - overlap)

        # Only candidates that overlap the chosen box weakly stay in play.
        remaining = others[iou < threshold]

    return kept_boxes, kept_scores

class trt_face_detect(Module):
    """BMF module that runs a serialized TensorRT engine for face detection.

    Incoming video frames are converted to RGB, resized and normalized, fed
    to the engine, and the resulting boxes are score-filtered and passed
    through NMS. Depending on the ``label_to_frame`` option the module
    forwards either the input frames or copies with the boxes drawn on them.

    Options read by ``__init__``:
        model_path: path of the serialized engine file.
        label_to_frame: when equal to 1, boxes are drawn on the output
            frames; defaults to 0 (frames are passed through).
        input_shapes: dict mapping each engine input tensor name to its
            shape; ``pre_process`` reads index 2 as height and 3 as width.
    """

    def __init__(self, node=None, option=None):
        """Deserialize the engine and allocate the output buffers.

        Returns early, leaving the instance only partly set up, when
        ``option`` is None or the engine cannot be deserialized.
        """
        self.node_ = node
        self.option_ = option

        if option is None:
            Log.log(LogLevel.ERROR, "Option is none")
            return

        if "model_path" in option.keys():
            self.model_path_ = option["model_path"]

        # Default to "do not draw" so inference() can always read the flag,
        # even when the caller leaves "label_to_frame" out of the options.
        self.label_frame_flag_ = 0
        if "label_to_frame" in option.keys():
            self.label_frame_flag_ = option["label_to_frame"]

        if "input_shapes" in option.keys():
            self.input_shapes_ = option["input_shapes"]

        start_time = time.time()

        logger = trt.Logger(trt.Logger.ERROR)
        with open(self.model_path_, 'rb') as f:
            engine_buffer = f.read()
        self.engine_ = trt.Runtime(logger).deserialize_cuda_engine(
            engine_buffer)

        if self.engine_ is None:
            Log.log(LogLevel.ERROR, "Failed building engine!")
            return
        Log.log(LogLevel.INFO, "Succeeded building engine!")

        self.num_io_tensors_ = self.engine_.num_io_tensors
        self.tensor_names_ = [
            self.engine_.get_tensor_name(i)
            for i in range(self.num_io_tensors_)
        ]
        tensor_modes = [
            self.engine_.get_tensor_mode(name) for name in self.tensor_names_
        ]
        self.num_inputs_ = tensor_modes.count(trt.TensorIOMode.INPUT)
        assert self.num_inputs_ == len(
            self.input_shapes_.keys()
        ), "The number of input_shapes doesn't match the number of model's inputs."
        self.num_outputs_ = tensor_modes.count(trt.TensorIOMode.OUTPUT)

        self.context_ = self.engine_.create_execution_context()
        self.stream_ = mp.current_stream(mp.kCUDA)

        # The code below treats the first num_inputs_ tensor names as the
        # inputs and the remaining ones as the outputs.
        # Each input gets its own configured shape (indexing with [i], not
        # [0], so engines with more than one input are set up correctly).
        for i in range(self.num_inputs_):
            self.context_.set_input_shape(
                self.tensor_names_[i],
                self.input_shapes_[self.tensor_names_[i]])

        # Pre-allocate one CUDA buffer per output tensor, reused every call.
        self.output_dict_ = dict()
        for i in range(self.num_inputs_, self.num_io_tensors_):
            self.output_dict_[self.tensor_names_[i]] = mp.empty(
                self.context_.get_tensor_shape(self.tensor_names_[i]),
                device=mp.kCUDA,
                dtype=self.to_scalar_types(
                    self.engine_.get_tensor_dtype(self.tensor_names_[i])))

        self.frame_cache_ = Queue()
        # Number of frames consumed per inference() call.
        self.in_frame_num_ = 1
        self.out_frame_num_ = 1

        self.eof_received_ = False

        Log.log(LogLevel.ERROR, "Load model takes", (time.time() - start_time))

    def reset(self):
        """Clear the EOF flag and drop every frame still cached."""
        self.eof_received_ = False
        while not self.frame_cache_.empty():
            self.frame_cache_.get()

    def to_scalar_types(self, trt_dtype):
        """Map a TensorRT dtype to the matching ``bmf.hmp`` scalar type.

        Raises:
            KeyError: if ``trt_dtype`` is not one of the mapped types.
        """
        dtype_map = {
            trt.float32: mp.kFloat32,
            trt.float16: mp.kHalf,
            trt.int32: mp.kInt32,
            trt.int8: mp.kInt8,
            trt.uint8: mp.kUInt8,
        }
        return dtype_map[trt_dtype]

    def pre_process(self, torch_image_array):
        """Stack, resize and normalize frames into the engine input tensor.

        Args:
            torch_image_array: list of per-frame torch tensors; they are
                stacked and permuted with ``[0, 3, 1, 2]``, i.e. each one
                is expected to be height x width x channel.

        Returns:
            A float tensor resized to the configured input height/width
            and normalized as ``(x / 255 - 0.5) / 0.5``.
        """
        # Only the first configured input shape is used for resizing.
        input_shape = list(self.input_shapes_.values())[0]
        height = input_shape[2]
        width = input_shape[3]

        input_tensor = torch.stack(torch_image_array).float()
        input_tensor = torch.permute(input_tensor, [0, 3, 1, 2])
        input_tensor = F.interpolate(input_tensor,
                                     size=(height, width),
                                     mode='bilinear')

        torch_mean = torch.empty((1, 3, 1, 1), device="cuda").fill_(0.5)
        torch_std = torch.empty((1, 3, 1, 1), device="cuda").fill_(0.5)

        input_tensor = (input_tensor / 255 - torch_mean) / torch_std

        return input_tensor

    def post_process(self, input_pil_arrays, boxes, scores):
        """Turn raw engine outputs into per-image lists of pixel boxes.

        Args:
            input_pil_arrays: list of PIL images, one per batch entry; only
                their ``size`` is read, to scale the boxes.
            boxes: per-image arrays of ``[x1, y1, x2, y2]`` candidates;
                presumably normalized to [0, 1], since they are multiplied
                by the image size - verify against the model.
            scores: per-image arrays of per-candidate scores; column 1 is
                compared against 0.8 (presumably the face-class score).

        Returns:
            A list with one entry per image, each the list of boxes that
            survive the score filter and NMS.
        """
        output_list = []
        for image_id, image in enumerate(input_pil_arrays):
            # Collected per image, so candidates from one image never leak
            # into the NMS pass of another one in the same batch.
            boxes_data = []
            scores_data = []
            for index in range(len(boxes[image_id])):
                score = scores[image_id][index][1]
                if score > 0.8:
                    box = boxes[image_id][index]
                    x1 = int(box[0] * image.size[0])
                    y1 = int(box[1] * image.size[1])
                    x2 = int(box[2] * image.size[0])
                    y2 = int(box[3] * image.size[1])
                    boxes_data.append([x1, y1, x2, y2])
                    scores_data.append(score)

            nms_boxes, nms_scores = NMS(boxes_data, scores_data)
            output_list.append(nms_boxes)
        return output_list

    def label_frame(self, input_frames, pil_image_array, detect_result_list):
        """Draw the detected boxes and return the frames as YUV420P.

        Args:
            input_frames: source video frames; only ``pts`` and
                ``time_base`` are copied from them.
            pil_image_array: RGB PIL images matching ``input_frames``; the
                rectangles are drawn onto these images in place.
            detect_result_list: per-image lists of ``[x1, y1, x2, y2]``.

        Returns:
            A list of new video frames, one per input image.
        """
        from PIL import ImageDraw
        output_frame_list = []
        for index_frame in range(len(pil_image_array)):
            image = pil_image_array[index_frame]
            draw = ImageDraw.Draw(image)
            for index_box in range(len(detect_result_list[index_frame])):
                detect_result = detect_result_list[index_frame][index_box]
                draw.rectangle([
                    detect_result[0], detect_result[1], detect_result[2],
                    detect_result[3]
                ])
            del draw
            numpy_image = np.asarray(image)
            H420 = mp.PixelInfo(mp.kPF_YUV420P)
            rgb = mp.PixelInfo(mp.kPF_RGB24)

            frame = mp.Frame(mp.from_numpy(np.ascontiguousarray(numpy_image)),
                             rgb)
            out_frame = VideoFrame(frame).reformat(H420)

            # Keep the timing of the source frame on the redrawn frame.
            out_frame.pts = input_frames[index_frame].pts
            out_frame.time_base = input_frames[index_frame].time_base
            output_frame_list.append(out_frame)
        return output_frame_list

    def inference(self):
        """Run the engine on up to ``in_frame_num_`` cached frames.

        Returns:
            ``(frames, detect_result_list)``. ``frames`` are the labelled
            frames when ``label_to_frame`` is 1, otherwise the input frames
            (moved to the GPU if they arrived on the CPU). Both lists are
            empty when no frame is cached.
        """
        frame_num = min(self.frame_cache_.qsize(), self.in_frame_num_)
        if frame_num == 0:
            return [], []

        rgb = mp.PixelInfo(mp.kPF_RGB24)
        input_frames = []
        torch_image_array = []
        pil_image_array = []
        for _ in range(frame_num):
            vf = self.frame_cache_.get()
            if (vf.frame().device() == mp.Device('cpu')):
                vf = vf.cuda()
            input_frames.append(vf)

            torch_vf = torch.from_dlpack(vf.reformat(rgb).frame().plane(0))
            numpy_vf = torch_vf.cpu().numpy()
            torch_image_array.append(torch_vf)
            pil_image_array.append(PIL.Image.fromarray(numpy_vf))

        # contiguous() may return a new tensor. Keep it bound to a local so
        # its storage stays alive while the engine reads from the raw
        # pointer; taking data_ptr() of an unnamed temporary would hand the
        # engine memory that can be released right away.
        input_tensor = self.pre_process(torch_image_array).contiguous()
        input_address = int(input_tensor.data_ptr())

        for i in range(self.num_inputs_):
            self.context_.set_tensor_address(self.tensor_names_[i],
                                             input_address)

        for i in range(self.num_inputs_, self.num_io_tensors_):
            self.context_.set_tensor_address(
                self.tensor_names_[i],
                int(self.output_dict_[self.tensor_names_[i]].data_ptr()))

        self.context_.execute_async_v3(self.stream_.handle())

        # The engine is expected to expose outputs named "scores" and "boxes".
        scores = self.output_dict_["scores"].cpu().numpy()
        boxes = self.output_dict_["boxes"].cpu().numpy()

        detect_result_list = self.post_process(pil_image_array, boxes, scores)
        if self.label_frame_flag_ == 1:
            result_frames = self.label_frame(input_frames, pil_image_array,
                                             detect_result_list)
            return result_frames, detect_result_list

        return input_frames, detect_result_list

    def process(self, task):
        """Consume input packets, run detection and emit the results.

        Frames go to output 0. When a second output is connected, the box
        list of each frame is sent there with the same timestamp. After EOF
        has been received, an EOF packet is put on every output and the
        task is marked as done.
        """
        input_queue = task.get_inputs()[0]
        output_queue_0 = task.get_outputs()[0]
        output_queue_size = len(task.get_outputs())
        if output_queue_size >= 2:
            output_queue_1 = task.get_outputs()[1]

        while not input_queue.empty():
            pkt = input_queue.get()
            if pkt.timestamp == Timestamp.EOF:
                self.eof_received_ = True
            if pkt.is_(VideoFrame):
                self.frame_cache_.put(pkt.get(VideoFrame))

        # After EOF the loop also drains a cache holding fewer than
        # in_frame_num_ frames; the emptiness check below ends it.
        while self.frame_cache_.qsize(
        ) >= self.in_frame_num_ or self.eof_received_:
            out_frames, detect_result_list = self.inference()
            for idx, frame in enumerate(out_frames):
                pkt = Packet(frame)
                pkt.timestamp = frame.pts
                output_queue_0.put(pkt)

                if (output_queue_size >= 2):
                    pkt = Packet(detect_result_list[idx])
                    pkt.timestamp = frame.pts
                    output_queue_1.put(pkt)

            if self.frame_cache_.empty():
                break

        if self.eof_received_:
            for key in task.get_outputs():
                task.get_outputs()[key].put(Packet.generate_eof_packet())
                Log.log_node(LogLevel.DEBUG, self.node_, "output stream",
                             "done")
            task.timestamp = Timestamp.DONE

        return ProcessResult.OK


## Download a face video (made from Youtube Faces Database)

In [None]:
!wget https://github.com/BabitMF/bmf/releases/download/files/files.tar.gz
!tar -zvxf files.tar.gz
!cp files/face.mp4 .

In [None]:
%env BMF_LOG_LEVEL=INFO

## Build BMF Graph pipeline

In [None]:
%%writefile /content/trt.py

import sys
import torch
import numpy as np

import bmf
from bmf import Log, LogLevel


def main():
    """Decode face.mp4, run the trt_face_detect module, encode trt_out.mp4.

    The engine file named in the module options can be built with:
    trtexec --onnx=version-RFB-640.onnx --buildOnly --saveEngine=version-RFB-640.engine
    """
    decode_option = {
        "input_path": "./face.mp4",
        "video_params": {
            "hwaccel": "cuda",
        },
    }
    detect_option = {
        "model_path": "version-RFB-640.engine",
        "label_to_frame": 1,
        "input_shapes": {
            "input": [1, 3, 480, 640],
        },
    }
    encode_option = {
        "output_path": "./trt_out.mp4",
        "video_params": {
            "codec": "h264_nvenc",
            "bit_rate": 5000000,
        },
    }

    # Only the video stream of the decoder is routed through the detector.
    video_stream = bmf.graph().decode(decode_option)["video"]
    labelled_stream = video_stream.module("trt_face_detect",
                                          option=detect_option)
    # No audio stream is passed to the encoder (first argument is None).
    labelled_stream.encode(None, encode_option).run()


if __name__ == "__main__":
    main()


In [None]:
!python3 trt.py

## Display the processed video

In [None]:
from IPython.display import HTML
from base64 import b64encode

def show_video(video_path, video_width = 800):
    """Return an inline HTML video player for a local MP4 file.

    The whole file is read and embedded in the page as a base64 data URL.

    Args:
        video_path: path of the video file to embed.
        video_width: value of the ``width`` attribute of the video element.

    Returns:
        An ``IPython.display.HTML`` object holding the ``<video>`` element.
    """
    # Read-only binary mode is enough here, and the context manager closes
    # the handle instead of leaving it to the garbage collector.
    with open(video_path, "rb") as video_file:
        encoded_video = b64encode(video_file.read()).decode()

    video_url = f"data:video/mp4;base64,{encoded_video}"

    return HTML(f"""<video width={video_width} controls><source src="{video_url}"></video>""")

In [None]:
# show input video
show_video("./face.mp4")

In [None]:
# show output video
show_video("./trt_out.mp4")