In [1]:
import torch
import torchvision.transforms as transforms
import cv2
import pandas as pd
import numpy as np
import math


from rootnet import rootnet
from config import cfg
from root_utils.pose_utils import process_bbox
from dataset import generate_patch_image



In [67]:
# prepare input image
transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize(mean=cfg.pixel_mean, std=cfg.pixel_std)])
img_path = 'input/18.jpg'
original_img = cv2.imread(img_path)

# resize image
original_img = cv2.resize(original_img, (256,256), interpolation = cv2.INTER_AREA) 
original_img_height, original_img_width = original_img.shape[:2]

# YOLO

In [68]:
# Model
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
model.eval()

# Inference
results = model(original_img)

Using cache found in C:\Users\abdor/.cache\torch\hub\ultralytics_yolov5_master
YOLOv5  2021-9-26 torch 1.9.0 CUDA:0 (GeForce GTX 1050 Ti, 4096.0MB)

Fusing layers... 
Model Summary: 224 layers, 7266973 parameters, 0 gradients
Adding AutoShape... 


In [69]:
# Results
res = pd.DataFrame()
res = results.pandas().xyxy[0]
res

Unnamed: 0,xmin,ymin,xmax,ymax,confidence,class,name
0,36.299999,160.900009,62.25,183.699997,0.81543,62,tv
1,103.200005,157.100006,126.200005,219.199997,0.776367,0,person
2,0.075,220.0,39.975002,256.0,0.767578,56,chair
3,133.800003,152.300003,168.800003,207.800003,0.751465,0,person
4,76.5,155.600006,96.5,175.199997,0.736328,62,tv
5,173.600006,122.400002,194.400009,186.400009,0.686035,0,person
6,23.8375,194.800003,55.049999,249.600006,0.589355,56,chair
7,69.300003,204.100006,102.900002,245.600006,0.54834,56,chair
8,0.0,168.800003,12.23125,193.199997,0.499023,62,tv
9,124.300003,191.900009,155.699997,223.0,0.49292,56,chair


In [70]:
persons = res[res['name']=='person']
persons = persons.assign(width = abs(persons['xmax'] - persons['xmin']))
persons = persons.assign(hight = abs(persons['ymax'] - persons['ymin']))
rootnet_input = persons.drop(['xmax','ymax','confidence','class','name'], axis=1)
rects = rootnet_input.values.tolist()

# RootNet

In [71]:
bbox_list = rootnet_input.values.tolist()

In [72]:
# prepare bbox for each human
#bbox_list = rootnet_input.values.tolist()  # xmin, ymin, width, height
person_num = len(bbox_list)

In [114]:
# normalized camera intrinsics
focal = [53, 53] # x-axis, y-axis
princpt = [original_img_width/2, original_img_height/2] # x-axis, y-axis
print('focal length: (' + str(focal[0]) + ', ' + str(focal[1]) + ')')
print('principal points: (' + str(princpt[0]) + ', ' + str(princpt[1]) + ')')

focal length: (53, 53)
principal points: (128.0, 128.0)


In [115]:
model = rootnet.lood_model('./snapshot_%d.pth.tar' % int(18))

Load checkpoint from ./snapshot_18.pth.tar


In [116]:
result=[]
roots= [] 
# for cropped and resized human image, forward it to RootNet
for n in range(person_num):
    bbox = process_bbox(np.array(bbox_list[n]), original_img_width, original_img_height)
    img, img2bb_trans = generate_patch_image(original_img, bbox, False, 0.0) 
    img = transform(img).cuda()[None,:,:,:]
    k_value = np.array([math.sqrt(cfg.bbox_real[0]*cfg.bbox_real[1]*focal[0]*focal[1]/(bbox[2]*bbox[3]))]).astype(np.float32)
    k_value = torch.FloatTensor([k_value]).cuda()[None,:]

    # forward
    with torch.no_grad():
        root_3d = model(img, k_value) # x,y: pixel, z: root-relative depth (mm)
    img = img[0].cpu().numpy()
    root_3d = root_3d[0].cpu().numpy()
    result.append(np.array([root_3d[0],root_3d[1],root_3d[2]]))

In [117]:
result

[array([       31.5,       40.66,      1005.3], dtype=float32),
 array([     27.882,      39.178,      916.41], dtype=float32),
 array([     32.316,      39.033,      1117.1], dtype=float32)]

In [118]:
def calc_distance(p0,p1):
    squared_dist = np.sum((p0-p1)**2)
    dist = np.sqrt(squared_dist)
    return dist

In [119]:
distances = []
for e in result:
    distances.append([calc_distance(e,num) for num in result])

print(distances)

[[0.0, 88.946976, 111.87162], [88.946976, 0.0, 200.76688], [111.87162, 200.76688, 0.0]]


In [120]:
def distance_limit(mylist,dis):
    flag =False
    for i in mylist:
        if i>0:
            if i<= dis:
                flag = True
    return flag

distance_flags = list(map( lambda x: distance_limit(x,100)  ,distances))

In [121]:
distance_flags

[True, True, False]

In [122]:
#cv2.destroyAllWindows()
image = original_img.copy()
cv2.namedWindow("output2", cv2.WINDOW_NORMAL)    # Create window with freedom of dimensions
#  BGR
for i,flag in zip(range(len(rects)),distance_flags):
    print(flag)
    if flag:
        image = cv2.rectangle(image, (int(rects[i][0]), int(rects[i][1])), (int(rects[i][0]+rects[i][2]), int(rects[i][1]+rects[i][3])), (0, 0, 255), 1)
    else:
        image = cv2.rectangle(image, (int(rects[i][0]), int(rects[i][1])), (int(rects[i][0]+rects[i][2]), int(rects[i][1]+rects[i][3])), (0, 255, 0), 1)


    #ax.add_patch(rect)
cv2.imshow("output2",image)

cv2.waitKey()

True
True
False


-1