# Pose Detection and Face Blurring

In [27]:
import numpy as np
import cv2
import torch
from torch2trt import TRTModule
import torchvision.transforms as transforms
import PIL.Image
from trt_pose.draw_objects import DrawObjects
from trt_pose.parse_objects import ParseObjects


print("Loading Pose Detection Model")
OPTIMIZED_MODEL = 'resnet18_baseline_att_224x224_A_epoch_249_trt.pth'
model_trt = TRTModule()
model_trt.load_state_dict(torch.load(OPTIMIZED_MODEL))
print("Model Loaded")

print("Loading coco topology")
def get_topology():
    import trt_pose.coco
    import json
    with open('human_pose.json', 'r') as f:
        human_pose = json.load(f)

    return trt_pose.coco.coco_category_to_topology(human_pose)

topology = get_topology()

print("Initializing pose estimation pipeline")
#Preprocessing constants
WIDTH, HEIGHT = 224, 224
mean = torch.Tensor([0.485, 0.456, 0.406]).cuda()
std = torch.Tensor([0.229, 0.224, 0.225]).cuda()
device = torch.device('cuda')

#The actual pose estimation
parse_objects = ParseObjects(topology)
draw_objects = DrawObjects(topology)
def preprocess(image):
    global device
    device = torch.device('cuda')
    
    image = cv2.resize(image, (WIDTH, HEIGHT))
    image = PIL.Image.fromarray(image)
    image = transforms.functional.to_tensor(image).to(device)
    image.sub_(mean[:, None, None]).div_(std[:, None, None])
    return image[None, ...]

def draw_pose(input_img):
    W, H, _ = input_img.shape
    cmap, paf = model_trt(preprocess(input_img))
    cmap, paf = cmap.detach().cpu(), paf.detach().cpu()
    counts, objects, peaks = parse_objects(cmap, paf)
    
    draw_objects(input_img, counts, objects, peaks)
    
    return cv2.resize(input_img, (H, W))
print("All done!")

Loading Pose Detection Model
Model Loaded
Loading coco topology
Initializing pose estimation pipeline
All done!


In [80]:
def extract_coords(input_img, counts, objects, peaks, topology_idxs):
        
    coords = [(-1, -1)]*len(topology_idxs)
    height = input_img.shape[0]
    width = input_img.shape[1]
    for person in range(int(counts[0])):
        obj = objects[0][person]
        for i, point in enumerate(topology_idxs):
            if int(obj[point]) >= 0:
                peak = peaks[0][point][int(obj[point])]
                coords[i] = (round(float(peak[1]) * width), round(float(peak[0]) * height))
                
    return coords
                
def blur_chest(input_img, counts, objects, peaks):
    chest = extract_coords(input_img, counts, objects, peaks, [5, 6])
    if chest[0][0] != -1 and chest[1][0] != -1:
        delta_x = abs(chest[0][0] - chest[1][0])//2
        delta_y = abs(chest[0][1] - chest[1][1])//2
        x1 = min(chest[0][0], chest[1][0])
        x2 = max(chest[0][0], chest[1][0])
        y1 = min(chest[0][1], chest[1][1]) 
        y2 = max(chest[0][1], chest[1][1]) + delta_x

        input_img = blur_box(input_img, x1, y1, x2, y2)
        cv2.rectangle(input_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
    return input_img
    
    
def blur_hips(input_img, counts, objects, peaks):
    hips = extract_coords(input_img, counts, objects, peaks, [11, 12])
    if hips[0][0] != -1 and hips[1][0] != -1:
        delta_x = abs(hips[0][0] - hips[1][0])//2
        delta_y = abs(hips[0][1] - hips[1][1])//2
        x1 = min(hips[0][0], hips[1][0]) - delta_x
        x2 = max(hips[0][0], hips[1][0]) + delta_x
        y1 = min(hips[0][1], hips[1][1]) - delta_x//2
        y2 = max(hips[0][1], hips[1][1]) + delta_x//2

        input_img = blur_box(input_img, x1, y1, x2, y2)
        cv2.rectangle(input_img, (x1, y1), (x2, y2), (0, 0, 255), 2)
    return input_img
        
def draw_pose(input_img):
    W, H, _ = input_img.shape
    cmap, paf = model_trt(preprocess(input_img))
    cmap, paf = cmap.detach().cpu(), paf.detach().cpu()
    counts, objects, peaks = parse_objects(cmap, paf)
    
    draw_objects(input_img, counts, objects, peaks)
    input_img = blur_hips(input_img, counts, objects, peaks)
    input_img = blur_chest(input_img, counts, objects, peaks)

    return cv2.resize(input_img, (H, W))

In [74]:
from utils.mtcnn import TrtMtcnn
mtcnn = TrtMtcnn()
def blur_box(img, x1, y1, x2, y2):
    print(x1, y1, x2, y2, img.shape)
    x1 = max(x1, 0)
    y1 = max(y1, 0)
    x2 = min(x2, img.shape[1])
    y2 = min(y2, img.shape[0])
    print(x1, y1, x2, y2, img.shape)
    ROI = img[y1:y2, x1:x2]
    
    #Pixellate
    ROI = cv2.resize(ROI, (8, 8), interpolation=cv2.INTER_LINEAR)
    img[y1:y2, x1:x2] = cv2.resize(ROI, (x2 - x1, y2 - y1), interpolation=cv2.INTER_NEAREST)
    
    #Gaussian Blur
    #img[y1:y2, x1:x2] = cv2.GaussianBlur(ROI, (51, 51), 0)
    return img

def blur_faces(img, boxes):
    for bb in boxes:
        x1, y1, x2, y2 = int(bb[0]), int(bb[1]), int(bb[2]), int(bb[3])
        img = blur_box(img, x1, y1, x2, y2)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
        
    return img

In [81]:
#Camera streaming
import pyrealsense2 as rs

pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)

# Create an align object
align_to = rs.stream.color
align = rs.align(align_to)

# Start streaming
pipeline.start(config)

#Stuff for timing
import time
start_time = time.time()
frame_num = 0

try:
    while True:

        # Wait for a coherent pair of frames: depth and color
        frames = pipeline.wait_for_frames()
        # Align the depth frame to color frame
        aligned_frames = align.process(frames)

        # Get aligned frames
        depth_frame = aligned_frames.get_depth_frame() # aligned_depth_frame is a 640x480 depth image
        color_frame = aligned_frames.get_color_frame()

        if not depth_frame or not color_frame:
            continue

        # Convert images to numpy arrays
        depth_image = np.asanyarray(depth_frame.get_data())
        color_image = np.asanyarray(color_frame.get_data())
        
        #Detect faces
        boxes, _ = mtcnn.detect(color_image, minsize=40)
        #Do pose estimation
        color_image = draw_pose(color_image)
        #Blur faces
        color_image = blur_faces(color_image, boxes)

        #Show image somehow
        # Apply colormap on depth image (image must be converted to 8-bit per pixel first)
        depth_colormap = cv2.applyColorMap(cv2.convertScaleAbs(depth_image, alpha=0.03), cv2.COLORMAP_JET)

        # Stack both images horizontally
        images = np.hstack((color_image, depth_colormap))
        
        # Show images
        cv2.namedWindow('RealSense', cv2.WINDOW_AUTOSIZE)
        cv2.imshow('RealSense', images)
        cv2.waitKey(1)
        
        #Timing
        frame_num += 1
        if frame_num % 100 == 0:
            print("FPS: Frame", frame_num, 100/(time.time() - start_time))
            start_time = time.time()


finally:

    # Stop streaming
    pipeline.stop()


259 314 520 388 (480, 640, 3)
259 314 520 388 (480, 640, 3)
229 48 404 169 (480, 640, 3)
229 48 404 169 (480, 640, 3)
274 323 463 371 (480, 640, 3)
274 323 463 371 (480, 640, 3)
252 94 382 160 (480, 640, 3)
252 94 382 160 (480, 640, 3)
234 118 277 149 (480, 640, 3)
234 118 277 149 (480, 640, 3)
188 0 253 76 (480, 640, 3)
188 0 253 76 (480, 640, 3)
116 338 269 376 (480, 640, 3)
116 338 269 376 (480, 640, 3)
127 155 255 219 (480, 640, 3)
127 155 255 219 (480, 640, 3)
162 20 221 96 (480, 640, 3)
162 20 221 96 (480, 640, 3)
87 340 243 383 (480, 640, 3)
87 340 243 383 (480, 640, 3)
88 167 214 237 (480, 640, 3)
88 167 214 237 (480, 640, 3)
135 37 190 110 (480, 640, 3)
135 37 190 110 (480, 640, 3)
136 343 241 376 (480, 640, 3)
136 343 241 376 (480, 640, 3)
102 164 213 225 (480, 640, 3)
102 164 213 225 (480, 640, 3)
144 45 198 116 (480, 640, 3)
144 45 198 116 (480, 640, 3)
119 336 267 373 (480, 640, 3)
119 336 267 373 (480, 640, 3)
123 168 237 231 (480, 640, 3)
123 168 237 231 (480, 640, 3)
16

KeyboardInterrupt: 