# Import pose estimation model

## Define output format

Let's load the JSON file that describes the body format, a slightly modified version of the MS-COCO format (BODY18). This body format is used to create a topology tensor that associates Part Affinity Field (PAF) channels with their corresponding human body parts and connects them to generate a skeleton.

Reference: [TensorRT-Pose repository](https://github.com/NVIDIA-AI-IOT/trt_pose)

In [1]:
import os
# Point MPLCONFIGDIR at ./configs/ under the current working directory.
# NOTE(review): presumably set this early so it is in place before Matplotlib
# gets imported by any of the packages below — confirm.
os.environ['MPLCONFIGDIR'] = os.getcwd() + "/configs/" # Specify Matplotlib config folder
import numpy as np
import json
# Requires https://github.com/NVIDIA-AI-IOT/trt_pose
import trt_pose.coco
from trt_pose.draw_objects import DrawObjects
from trt_pose.parse_objects import ParseObjects

# Load the body-format description from the JSON file.
with open('human_pose.json', 'r') as f:
    human_pose = json.load(f)

# Build the topology from the body-format description.
topology = trt_pose.coco.coco_category_to_topology(human_pose)

# Both helpers are built from the topology; later cells call them as
# parse_objects(cmap, paf) and draw_objects(image, counts, objects, peaks).
parse_objects = ParseObjects(topology)
draw_objects = DrawObjects(topology)

## Import the pose estimation optimized model

Next, we'll load our PyTorch pose estimation model. It has been optimized using another Notebook and saved so that we do not need to perform optimization again. The optimization procedure is detailed in the TensorRT-Pose repository. Please note that TensorRT has device-specific optimizations, so you can only use an optimized model on the same platform.

Reference: [Torch2TRT repository](https://github.com/NVIDIA-AI-IOT/torch2trt)

In [2]:
import torch
from torch2trt import TRTModule

# Path of the saved, already-optimized pose-estimation model.
TRT_POSE_PATH = 'resnet18_baseline_att_224x224_trt_FP16_33MB.pth'

# Create an empty TRT module and restore the saved state into it.
# NOTE(review): torch.load deserializes with pickle — only load files from a
# trusted source.
model_estimation = TRTModule()
model_estimation.load_state_dict(torch.load(TRT_POSE_PATH))

<All keys matched successfully>

## Import the pose classification model

In [3]:
import tensorflow as tf
from tensorflow.python.saved_model import tag_constants

# Directory of the saved pose-classification model.
TRT_CLASSIFICATION_PATH = './Robust_BODY18_FP32'

# Load the SavedModel with the SERVING tag and keep a handle on its default
# serving signature, which is the callable used for inference below.
model_classification = tf.saved_model.load(TRT_CLASSIFICATION_PATH, tags=[tag_constants.SERVING])
infer_classification = model_classification.signatures['serving_default']

In [4]:
# Blank inference to load model: run the classifier once on random float32
# input of shape (1, 18, 2) so the first real frame does not pay the loading cost.
infer_classification(tf.constant(
    np.random.normal(size=(1, 18, 2)).astype(np.float32),
    dtype=tf.float32,
))

{'dense_20': <tf.Tensor: shape=(1, 20), dtype=float32, numpy=
 array([[4.0254830e-19, 1.2014065e-29, 7.3594776e-19, 5.5553655e-22,
         1.2282521e-17, 2.4289520e-25, 9.9999976e-01, 8.9502602e-27,
         2.0831374e-10, 3.7753817e-10, 2.3662060e-19, 2.4505236e-21,
         5.7214940e-08, 2.1032307e-07, 1.8473210e-26, 6.7222669e-14,
         2.5704211e-26, 1.0280534e-08, 1.4103428e-30, 3.8663297e-15]],
       dtype=float32)>}

In [5]:
# Read the class names from the 'labels' entry of the model's info file.
# The processing loop indexes this list with the argmax of the classifier output.
with open('Robust_BODY18_Info.json') as f:
    classificationLabels = json.load(f)['labels']
print("labels:", classificationLabels)

labels: ['Seated', 'Stand', 'Stand_RightArmRaised', 'Stand_LeftArmRaised', 'T', 'MilitarySalute', 'PushUp_Low', 'Squat', 'Plank', 'Yoga_Tree_left', 'Yoga_Tree_right', 'Yoga_UpwardSalute', 'Yoga_Warrior2_left', 'Yoga_Warrior2_right', 'Traffic_AllStop', 'Traffic_BackStop', 'Traffic_FrontStop', 'Traffic_BackFrontStop', 'Traffic_LeftTurn', 'Traffic_RightTurn']


# Define video-processing pipeline

## Access video feed

The whole video acquisition pipeline, including scaling and cropping, is done using the accelerated GStreamer for NVIDIA Tegra processors. See the console for details about the video acquisition pipeline. This pipeline is used by OpenCV to access images.

References: [User Guide](https://developer.download.nvidia.com/embedded/L4T/r32_Release_v1.0/Docs/Accelerated_GStreamer_User_Guide.pdf?UliDteoP_g5QqgRwKoNbj3abiW9TeMtEWNumYbfdqeWY6oSlJaPISqf04banob6ohwLwYKvWmMUjwI8EWpk3f8lpapB3XvQGRxGPej5eiHmM_QA-AHiAenmymLlFAs1QmtZHTwE4FL_o2GYBqCc1M8ggJJcgb5w6whYwSFe9sK7rp3avYyw), [Documentation](https://docs.nvidia.com/jetson/l4t/index.html#page/Tegra%20Linux%20Driver%20Package%20Development%20Guide/accelerated_gstreamer.html)

In [6]:
# Aspect ratio of the camera and side length (in pixels) of the square image
# produced by the pipeline.
camera_aspect_ratio = 16./9.
INPUT_IMG_SIZE = 224

# GStreamer pipeline description. The adjacent string literals are joined into
# a single string before .format() is applied, so the placeholders of every
# line are filled in.
# Capture is INPUT_IMG_SIZE high and INPUT_IMG_SIZE*aspect_ratio wide; the
# crop_left/crop_right values select the horizontally centered window of width
# INPUT_IMG_SIZE, yielding a square image.
# NOTE(review): flip_method=2 is presumably a 180-degree rotation in nvvidconv
# — confirm against the Accelerated GStreamer guide.
gstream_pipeline = (
    "nvarguscamerasrc ! "
    "video/x-raw(memory:NVMM), "
    "width=(int){capture_width:d}, height=(int){capture_height:d}, "
    "format=(string)NV12, framerate=(fraction){framerate:d}/1 ! "
    "nvvidconv top={crop_top:d} bottom={crop_bottom:d} left={crop_left:d} right={crop_right:d} flip-method={flip_method:d} ! "
    "video/x-raw, width=(int){display_width:d}, height=(int){display_height:d}, format=(string)BGRx ! "
    "videoconvert ! "
    "video/x-raw, format=(string)BGR ! appsink".format(
        capture_width = int(INPUT_IMG_SIZE*camera_aspect_ratio),
        capture_height = INPUT_IMG_SIZE,
        framerate = 60,
        crop_top = 0,
        crop_bottom = INPUT_IMG_SIZE,
        crop_left = int(INPUT_IMG_SIZE*(camera_aspect_ratio-1)/2),
        crop_right = int(INPUT_IMG_SIZE*(camera_aspect_ratio+1)/2),
        flip_method = 2,
        display_width = INPUT_IMG_SIZE,
        display_height = INPUT_IMG_SIZE,
    )
)
print(gstream_pipeline)

nvarguscamerasrc ! video/x-raw(memory:NVMM), width=(int)398, height=(int)224, format=(string)NV12, framerate=(fraction)60/1 ! nvvidconv top=0 bottom=224 left=87 right=311 flip-method=2 ! video/x-raw, width=(int)224, height=(int)224, format=(string)BGRx ! videoconvert ! video/x-raw, format=(string)BGR ! appsink


In [7]:
# bufferless VideoCapture
import cv2, queue, threading, time

class VideoCapture:
    """Bufferless wrapper around ``cv2.VideoCapture``.

    A daemon thread reads frames as soon as they are available and keeps only
    the most recent result in ``self.q``, so ``read()`` never hands back a
    stale, buffered frame.

    Args:
        stream: Source passed as the first argument to ``cv2.VideoCapture``.
        apiPreference: Backend passed as the second argument to
            ``cv2.VideoCapture``.
    """

    def __init__(self, stream, apiPreference):
        self.cap = cv2.VideoCapture(stream, apiPreference)
        self.q = queue.Queue()
        self._stop = threading.Event()
        self._thread = threading.Thread(target=self._reader)
        self._thread.daemon = True
        self._thread.start()

    def _reader(self):
        """Read frames until a read fails or a stop is requested.

        Only the latest ``(ret, frame)`` pair is kept in the queue. A failed
        read is published too, so the consumer sees the failure instead of
        blocking forever on an empty queue.
        """
        while not self._stop.is_set():
            ret, frame = self.cap.read()
            if not self.q.empty():
                try:
                    self.q.get_nowait()   # discard previous (unprocessed) frame
                except queue.Empty:
                    pass
            self.q.put((ret, frame))
            if not ret:
                break

    def read(self):
        """Return the most recent ``(ret, frame)`` pair.

        Blocks until a result is available. Once the reader thread has exited
        and nothing is left in the queue, returns ``(False, None)`` instead of
        waiting indefinitely.
        """
        while True:
            try:
                return self.q.get(timeout=0.1)
            except queue.Empty:
                if not self._thread.is_alive():
                    return False, None

    def release(self):
        """Stop the reader thread and release the underlying capture."""
        self._stop.set()
        # Give the reader a chance to leave cap.read() before the capture is
        # released underneath it; never join from the reader thread itself.
        if self._thread is not threading.current_thread():
            self._thread.join(timeout=1.0)
        return self.cap.release()

## Load image in the TensorRT pipeline

Next, let's define a function that will preprocess the image, which is originally in HWC/BGR8 format. It is formatted, normalized, and loaded into the CUDA processing pipeline.

In [8]:
import cv2
import torchvision.transforms as transforms
import PIL.Image

# Normalization values can be fine-tuned for your camera. Still, default values generally perform well.
# Per-channel mean and standard deviation, stored on the GPU so that the
# normalization in preprocess() runs on CUDA tensors.
# NOTE(review): these look like the usual ImageNet statistics — confirm.
mean = torch.Tensor([0.485, 0.456, 0.406]).cuda()
std = torch.Tensor([0.229, 0.224, 0.225]).cuda()
# Device that preprocessed images are moved to.
device = torch.device('cuda')

def preprocess(image):
    """Turn a camera frame into a normalized CUDA tensor for the pose model.

    Args:
        image: Frame in BGR channel order, as delivered by OpenCV.

    Returns:
        The frame converted to RGB, turned into a tensor on the module-level
        ``device``, normalized in place with the module-level ``mean`` and
        ``std`` (one value per channel), with a leading batch axis added.
    """
    # The module-level `device` is used as-is; there is no need to declare it
    # global and re-create it on every frame.
    rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    tensor = transforms.functional.to_tensor(PIL.Image.fromarray(rgb)).to(device)
    # Broadcast the per-channel statistics over the two spatial axes.
    tensor.sub_(mean[:, None, None]).div_(std[:, None, None])
    return tensor[None, ...]

## Pose estimation inference

In [9]:
def get_keypoints(counts, objects, peak, indexBody=0):
    """Extract the keypoint coordinates of one detected body.

    Args:
        counts: Number of detected bodies; ``counts[0]`` is read.
        objects: Per-body keypoint indices; ``objects[0][indexBody][j]`` is the
            index of the peak used for body part ``j``, or a negative value
            when that part was not detected.
        peak: Peak coordinates; ``peak[0][j][k]`` is read as a
            (height, width) pair for candidate ``k`` of body part ``j``.
        indexBody: Index of the body to extract.

    Returns:
        An array with one ``[x, y]`` row per body part, ``[None, None]`` for
        parts that were not detected, or ``None`` when ``indexBody`` does not
        refer to a detected body.
    """
    if indexBody >= int(counts[0]):
        return None

    human = objects[0][indexBody]
    kpoint = []
    for j in range(human.shape[0]):
        k = int(human[j])
        if k >= 0:
            # Read from the `peak` argument (not a global) without rebinding it.
            location = peak[0][j][k]   # location[1]: width, location[0]: height
            kpoint.append([float(location[1]), float(location[0])])
        else:
            kpoint.append([None, None])
    return np.array(kpoint)

def get_cmap_paf(image):
    """Run the pose-estimation model on one frame.

    The frame is passed through ``preprocess`` and ``model_estimation``; both
    outputs are detached and moved to the CPU before being returned as the
    pair ``(cmap, paf)``.
    """
    batch = preprocess(image)
    raw_cmap, raw_paf = model_estimation(batch)
    return raw_cmap.detach().cpu(), raw_paf.detach().cpu()

## Keypoints pre-processing

In [10]:
def getLengthLimb(data, keypoint1: int, keypoint2: int):
    """Return the distance between two keypoints of ``data``.

    ``data`` is indexed as ``data[keypoint, 0:2]``. When the first coordinate
    of either keypoint is ``None`` (keypoint missing), 0 is returned.
    """
    if data[keypoint1, 0] is None or data[keypoint2, 0] is None:
        return 0
    offset = data[keypoint1, 0:2] - data[keypoint2, 0:2]
    return np.linalg.norm([offset])

def preprocess_keypoints(keypoints:np.ndarray):
    """Center and scale the keypoints of one body for the classifier.

    The array is modified in place. Steps: compute the bounding box of the
    detected keypoints, move its center to the origin (the second coordinate
    is flipped), divide by a scale factor estimated from limb lengths, and
    replace missing keypoints with 0.

    Args:
        keypoints: Array of shape (18, 2) whose missing keypoints are marked
            with ``None``, or ``None`` itself.

    Returns:
        The processed keypoints as ``float32``, or ``None`` when the input is
        ``None``, when no reference limb could be measured, or when any scaled
        coordinate falls outside [-1, 1].
    """
    if keypoints is None:
        return None

    assert keypoints.shape == (18,2)

    # Bounding box of the keypoints that exist.
    left, right = float("inf"), 0.0
    top, bottom = float("inf"), 0.0
    for x, y in keypoints:
        if x is None:
            continue
        left, right = min(left, x), max(right, x)
        top, bottom = min(top, y), max(bottom, y)

    # Centering. The mask must stay an element-wise `!= None` comparison;
    # `is not None` would test the array object itself.
    detected = keypoints[:, 0] != None
    center_x = (left + right) / 2.
    center_y = (top + bottom) / 2.
    np.subtract(keypoints[:, 0], center_x, where=detected, out=keypoints[:, 0])
    np.subtract(center_y, keypoints[:, 1], where=detected, out=keypoints[:, 1])

    # Scaling: (keypoint a, keypoint b, divisor) for each reference limb.
    # NOTE(review): the divisors look like relative body proportions — confirm.
    reference_limbs = (
        (6, 12, 5.2),   # Torso right
        (5, 11, 5.2),   # Torso left
        (0, 17, 2.5),   # Neck
        (12, 14, 3.6),  # Right thigh
        (14, 16, 3.5),  # Right lower leg
        (11, 13, 3.6),  # Left thigh
        (13, 15, 3.5),  # Left lower leg
    )
    estimates = np.array(
        [
            getLengthLimb(keypoints, first, second) * (16.0 / divisor)
            for first, second, divisor in reference_limbs
        ]
    )

    # Only limbs that could be measured contribute to the scale.
    estimates = estimates[estimates > 0.0]
    if len(estimates) == 0:
        return None
    scale = np.mean(estimates)

    # Populate None keypoints with 0s
    keypoints[keypoints == None] = 0.0

    # Normalize
    np.divide(keypoints, scale, out=keypoints[:, 0:2])

    out_of_range = (keypoints > 1.0) | (keypoints < -1.0)
    if np.any(out_of_range):
        return None

    return keypoints.astype('float32')

# NOTE(review): this widget setup is repeated verbatim in the next code cell;
# one of the two copies is probably redundant — confirm.
import ipywidgets
from IPython.display import display
# Image widget declared with the 'jpeg' format.
image_w = ipywidgets.Image(format='jpeg')
display(image_w)

## Processing loop

- Read image
- Pre-process to Torch format
- Infer keypoints
- Draw skeleton on the input image
- Update in output window.

In [13]:
import ipywidgets
from IPython.display import display
# Image widget declared with the 'jpeg' format; show_image() assigns the
# encoded frame bytes to its `value`.
image_w = ipywidgets.Image(format='jpeg')
display(image_w)

def show_image(img, label:str=None, fps:float=None):
    """Annotate a frame and push it to the ``image_w`` widget as JPEG.

    Args:
        img: Image to annotate and display.
        label: Optional text drawn near the top-left corner; underscores are
            shown as spaces. Skipped when empty or ``None``.
        fps: Optional frame rate drawn near the bottom-left corner with two
            decimals. Skipped when ``None`` or 0.
    """
    font = cv2.FONT_HERSHEY_DUPLEX
    font_scale = .7
    white = (255,255,255)

    # Each text is drawn twice: a thick dark pass, then a thin white pass on top.
    if label:
        caption = label.replace('_', ' ')
        caption_origin = (10,25)
        img = cv2.putText(img, caption, caption_origin, font, font_scale, (0,), 2, cv2.LINE_AA)
        img = cv2.putText(img, caption, caption_origin, font, font_scale, white, 1, cv2.LINE_AA)

    if fps:
        fps_text = 'FPS: {:.2f}'.format(fps)
        fps_origin = (10,img.shape[0]-10)
        img = cv2.putText(img, fps_text, fps_origin, font, font_scale, (0,0,0), 2, cv2.LINE_AA)
        img = cv2.putText(img, fps_text, fps_origin, font, font_scale, white, 1, cv2.LINE_AA)

    encoded = cv2.imencode('.jpg', img)[1]
    image_w.value = bytes(encoded)

Image(value=b'', format='jpeg')

In [15]:
from collections import deque
import time

buffer_size = 5
# Rolling window holding the rate (1 / elapsed seconds) of the last
# `buffer_size` iterations; its mean is displayed as the FPS value.
processing_times = deque(buffer_size*[0.], buffer_size)

cap_CSI = VideoCapture(gstream_pipeline, cv2.CAP_GSTREAMER)

try:
    while True:
        start_time = time.time()

        # Get image
        ok, image = cap_CSI.read()
        if not ok:
            raise RuntimeError('Could not read image from camera')

        # TRT-Pose inference
        cmap, paf = get_cmap_paf(image) # Pose estimation inference
        counts, objects, peaks = parse_objects(cmap, paf) # Matching algorithm
        keypoints = get_keypoints(counts, objects, peaks) # BODY18 model formatting

        # Classification inference (skipped when the keypoints are unusable)
        label_pose = None
        keypoints = preprocess_keypoints(keypoints)
        if keypoints is not None:
            x = tf.constant(np.expand_dims(keypoints, axis=0), dtype=tf.float32)
            prediction = infer_classification(x)
            label_pose = classificationLabels[np.argmax(prediction['dense_20'][0])]

        # Display image locally
        draw_objects(image, counts, objects, peaks)
        show_image(image, label_pose, np.mean(processing_times))

        elapsed = time.time() - start_time
        if elapsed > 0:  # guard against a zero interval from a coarse clock
            processing_times.appendleft(1./elapsed)

except KeyboardInterrupt:
    # Interrupting the kernel is the normal way to stop the loop.
    pass
except RuntimeError as e:
    # Report the reason instead of silently swallowing it.
    print('Video processing error:', e)
finally:
    # Always release the camera, whatever ended the loop.
    cap_CSI.release()
    print('Video processing stopped')

Video processing stopped
