In [44]:
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights
from torchvision import transforms
import torch
import cv2
import numpy as np
import math
import warnings

warnings.filterwarnings("ignore")

In [45]:
# Use the model suggested in the assignment, since it works well.
# `weights=...DEFAULT` asks torchvision for the pretrained Keypoint R-CNN weights it marks as default.
model = keypointrcnn_resnet50_fpn(weights=KeypointRCNN_ResNet50_FPN_Weights.DEFAULT)
# Move the model to the GPU (needs a CUDA device). As the last expression of the cell,
# the returned module is what the notebook prints below.
model.to('cuda')

KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.

In [46]:
def draw_keypoints_and_limbs_for_one_person(img: np.ndarray, keypoints: list) -> np.ndarray:
    """Render one person's joints and limbs on a copy of an image.

    Args:
        img (np.ndarray): image to annotate; it is copied, the input is not modified
        keypoints (list): point coordinates indexed by keypoint number; every
            entry is converted to a tuple and handed to OpenCV. The limb table
            below references indices up to 16.

    Returns:
        np.ndarray: annotated copy of the image
    """
    joint_color = (0, 255, 0)
    bone_color = (0, 0, 255)
    # pairs of keypoint indices that are connected by a line
    skeleton = (
        (2, 0), (2, 4), (1, 0), (1, 3), (6, 8), (8, 10), (5, 7), (7, 9),
        (12, 14), (14, 16), (11, 13), (13, 15), (6, 5), (12, 11), (6, 12), (5, 11),
    )
    # work on a copy so the caller's image stays untouched
    canvas = img.copy()
    for joint in keypoints:
        # filled circle of radius 5 centred on the keypoint
        cv2.circle(canvas, tuple(joint), 5, joint_color, -1)
    for start_idx, end_idx in skeleton:
        start_point = tuple(keypoints[start_idx])
        end_point = tuple(keypoints[end_idx])
        cv2.line(canvas, start_point, end_point, bone_color, 2)
    return canvas

In [47]:
# Helpers that append / strip a trailing column of ones (homogeneous coordinates),
# so the affine transform can be estimated as one linear least-squares problem.
def pad(x: np.ndarray) -> np.ndarray:
    """Append a column of ones to a 2-D array (one row per point)."""
    return np.hstack([x, np.ones((x.shape[0], 1))])


def unpad(x: np.ndarray) -> np.ndarray:
    """Drop the last column of a 2-D array."""
    return x[:, :-1]


def affine_transform(points1: np.ndarray, points2: np.ndarray) -> np.ndarray:
    """Map points2 onto points1 with the best least-squares affine transform.

    Both point sets are extended with a column of ones, the matrix ``A`` that
    minimises ``||pad(points2) @ A - pad(points1)||`` is solved for, and
    ``points2`` is returned after applying ``A``.

    Args:
        points1 (np.ndarray): reference points, one row per point
        points2 (np.ndarray): points to align, same number of rows as points1

    Returns:
        np.ndarray: points2 after the fitted transform, same shape as points2

    Raises:
        numpy.linalg.LinAlgError: if the two inputs have incompatible row counts
    """
    # Accept any array-like input; padding promotes to float64 anyway.
    reference = np.asarray(points1, dtype=float)
    target = np.asarray(points2, dtype=float)
    Y = pad(reference)
    X = pad(target)
    # rcond=None selects NumPy's current default cut-off explicitly; omitting it
    # triggers a FutureWarning on NumPy < 2.
    A, _, _, _ = np.linalg.lstsq(X, Y, rcond=None)
    # Snap numerical noise to exact zeros.
    A[np.abs(A) < 1e-10] = 0
    # X already is pad(points2), so reuse it instead of padding a second time.
    return unpad(np.dot(X, A))


def get_similarity(points1: np.ndarray, points2: np.ndarray) -> float:
    """Mean cosine similarity between two poses after aligning the second to the first.

    ``points2`` is first mapped onto ``points1`` with ``affine_transform``;
    cosine similarity is then computed along dimension 1 (torch's default)
    and the resulting values are averaged.

    Args:
        points1 (np.ndarray): points of the first pose
        points2 (np.ndarray): points of the second pose

    Returns:
        float: averaged cosine similarity
    """
    aligned = affine_transform(points1, points2)
    reference_t = torch.Tensor(points1)
    aligned_t = torch.Tensor(aligned)
    per_point = torch.nn.functional.cosine_similarity(reference_t, aligned_t, dim=1)
    return per_point.mean().item()

In [48]:
def weight_distance(pose1: np.ndarray, pose2: np.ndarray, conf1: np.ndarray) -> float:
    """Confidence-weighted mean distance between two poses.

    ``pose2`` is aligned to ``pose1`` with ``affine_transform``; the Euclidean
    distance of every point pair is then weighted by the matching entry of
    ``conf1`` and the sum is normalised by the total confidence.

    Args:
        pose1 (np.ndarray): points of the first pose, one (x, y) row per keypoint
        pose2 (np.ndarray): points of the second pose, same layout as pose1
        conf1 (np.ndarray): per-keypoint confidences of the first pose, one
            value per row of pose1

    Returns:
        float: weighted distance

    Raises:
        ValueError: if conf1 does not hold exactly one value per keypoint
    """
    if len(conf1) != len(pose1):
        # A mismatch used to surface as a bare IndexError (too few values) or
        # silently skew the normaliser (too many values).
        raise ValueError(
            f"conf1 has {len(conf1)} values but pose1 has {len(pose1)} keypoints"
        )
    pose2 = affine_transform(pose1, pose2)
    # NOTE(review): a confidence sum of zero makes this normaliser infinite;
    # confirm whether callers can ever pass such scores.
    normaliser = 1 / np.sum(conf1)
    weighted_sum = 0
    for ref_pt, cand_pt, weight in zip(pose1, pose2, conf1):
        # math.hypot is never negative, so no abs() is needed.
        weighted_sum += weight * math.hypot(ref_pt[0] - cand_pt[0], ref_pt[1] - cand_pt[1])
    return float(normaliser * weighted_sum)

In [None]:
def compare_two_images(im0: np.ndarray, im1: np.ndarray, thres: float=0.8) -> [list, float, float]:
    """Detect a pose on each image and score how similar the two poses are.

    Uses the module-level ``model`` for keypoint detection. Only the first
    detection of each image is compared.

    Args:
        im0 (np.ndarray): first image
        im1 (np.ndarray): second image
        thres (float, optional): minimum detection score both first detections
            must reach for the poses to be compared. Defaults to 0.8.

    Returns:
        [list, float, float]: raw predictions of the keypoint model for both
        images, cosine similarity and weighted distance between the two poses.
        When an image has no detection, or a first detection scores below
        ``thres``, similarity is 0 and distance is 100.
    """
    to_tensor = transforms.ToTensor()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    batch = [to_tensor(im0.copy()).to(device), to_tensor(im1.copy()).to(device)]
    model.eval()
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        res = model(batch)

    # Fallback values reported when no sufficiently confident pose is available.
    sim = 0
    wd = 100
    scores0 = res[0]['scores']
    scores1 = res[1]['scores']
    # An image without detections yields empty tensors; indexing them would
    # raise IndexError, so check the length first.
    both_detected = len(scores0) > 0 and len(scores1) > 0
    if both_detected and scores0[0].item() >= thres and scores1[0].item() >= thres:
        # keep only the first two columns (x, y); the last column is dropped
        points0 = res[0]['keypoints'][0].detach().cpu().numpy()[:, :-1]
        points1 = res[1]['keypoints'][0].detach().cpu().numpy()[:, :-1]
        sim = get_similarity(points0, points1)
        conf0 = res[0]['keypoints_scores'][0].detach().cpu().numpy()
        wd = weight_distance(points0, points1, conf0)
    return res, sim, wd

In [51]:
def _threshold_color(value: float, good_limit: float, fair_limit: float, higher_is_better: bool) -> tuple:
    """Pick the overlay colour (green / yellow / red, BGR order) for a metric value."""
    if higher_is_better:
        is_good, is_fair = value > good_limit, value > fair_limit
    else:
        is_good, is_fair = value < good_limit, value < fair_limit
    if is_good:
        return (0, 255, 0)
    if is_fair:
        return (0, 255, 255)
    return (0, 0, 255)


def _first_person_points(prediction: dict):
    """Return integer (x, y) keypoints of the first detection, or None if nothing was detected."""
    keypoints = prediction['keypoints']
    if len(keypoints) == 0:
        return None
    return keypoints.detach().to('cpu')[0][:, :2].int().tolist()


def compare_two_videos(model: any, source0: str, source1: str, sim_low_thres: float=0.99,
                     sim_high_thres: float=0.999, wd_low_thres: float=10,
                     wd_high_thres: float=30, show_video: bool=False) -> None:
    """Compare two videos frame by frame and save a side-by-side result video.

    Every pair of frames is scored with ``compare_two_images``; the skeletons
    and both metrics are drawn on the frames, which are written next to each
    other into ``<name0>-<name1>.avi``. After the last frame, 30 frames
    showing the average similarity and weighted distance are appended.

    Args:
        model (any): keypoint prediction model; it is switched to eval mode
            here. Note that ``compare_two_images`` itself uses the
            module-level ``model``.
        source0 (str): source of the first ("Teacher") video
        source1 (str): source of the second video; its frames are resized to
            the frame size of the first video
        sim_low_thres (float, optional): similarity at or below this value is
            drawn in red. Defaults to 0.99.
        sim_high_thres (float, optional): similarity above this value is drawn
            in green; values in between are yellow. Defaults to 0.999.
        wd_low_thres (float, optional): weighted distance below this value is
            drawn in green. Defaults to 10.
        wd_high_thres (float, optional): weighted distance at or above this
            value is drawn in red; values in between are yellow. Defaults to 30.
        show_video (bool, optional): show the frames while processing; ESC
            stops early. Defaults to False.

    Raises:
        OSError: if one of the video sources cannot be opened
    """
    name0 = source0.split('/')[-1].split('.')[0]
    name1 = source1.split('/')[-1].split('.')[0]
    vcap0 = cv2.VideoCapture(source0)
    vcap1 = cv2.VideoCapture(source1)
    out = None
    try:
        if not vcap0.isOpened() or not vcap1.isOpened():
            raise OSError(f"Cannot open video source(s): {source0!r}, {source1!r}")
        size = (int(vcap0.get(cv2.CAP_PROP_FRAME_WIDTH)), int(vcap0.get(cv2.CAP_PROP_FRAME_HEIGHT)))
        # two frames side by side
        total_size = (size[0] * 2, size[1])
        out = cv2.VideoWriter(f'{name0}-{name1}.avi', cv2.VideoWriter_fourcc('M','J','P','G'), 24, total_size)
        model.eval()
        frames_done = 0
        sim_total = 0
        wd_total = 0
        while True:
            ret0, frame0 = vcap0.read()
            ret1, frame1 = vcap1.read()
            if not ret0 or not ret1:
                print("Frame is empty")
                break
            frame1 = cv2.resize(frame1, size)
            # OpenCV decodes frames as BGR while the torchvision detector
            # expects RGB input, so convert the copies given to the model.
            res, sim, wd = compare_two_images(cv2.cvtColor(frame0, cv2.COLOR_BGR2RGB),
                                              cv2.cvtColor(frame1, cv2.COLOR_BGR2RGB))
            sim_text = f"similarity: {round(sim, 6)}"
            wd_text =  f"WD: {round(wd, 2)}"
            # Draw a skeleton only when a person was actually detected.
            points0 = _first_person_points(res[0])
            if points0 is not None:
                frame0 = draw_keypoints_and_limbs_for_one_person(frame0, points0)
            points1 = _first_person_points(res[1])
            if points1 is not None:
                frame1 = draw_keypoints_and_limbs_for_one_person(frame1, points1)
            sim_color = _threshold_color(sim, sim_high_thres, sim_low_thres, higher_is_better=True)
            wd_color = _threshold_color(wd, wd_low_thres, wd_high_thres, higher_is_better=False)
            frame0 = cv2.putText(frame0, 'Teacher', (250, 930), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            frame1 = cv2.putText(frame1, sim_text, (30, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, sim_color, 2)
            frame1 = cv2.putText(frame1, wd_text, (200, 60), cv2.FONT_HERSHEY_SIMPLEX, 0.5, wd_color, 2)
            frame1 = cv2.putText(frame1, 'Studient', (250, 930), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
            frame = np.concatenate([frame0, frame1], axis=1)
            out.write(frame)
            sim_total += sim
            wd_total += wd
            frames_done += 1
            if show_video:
                cv2.imshow('VIDEO', frame)
                # Poll the keyboard only when a window exists; headless
                # OpenCV builds do not implement waitKey.
                if cv2.waitKey(1) == 27:
                    break
        if frames_done == 0:
            # Nothing to average; avoid dividing by zero.
            print("No frames were processed")
            return
        sim_total /= frames_done
        wd_total /= frames_done
        last_frame = np.zeros((total_size[1], total_size[0], 3), dtype=np.uint8)
        text = f'Your result is: similarity - {round(sim_total, 6)}, WD - {round(wd_total, 2)}'
        last_frame = cv2.putText(last_frame, text, (100, 480), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 4)
        for _ in range(30):
            out.write(last_frame)
    finally:
        # Always free the capture / writer handles, even on errors.
        vcap0.release()
        vcap1.release()
        if out is not None:
            out.release()
        if show_video:
            cv2.destroyAllWindows()

In [52]:
# Compare video 2 (first source) with video 6 using the default thresholds.
# compare_two_videos names its output after both inputs, so this run writes '2-6.avi'
# relative to the current working directory.
source0 = '../../myproject_data/2.mp4'
source1 = '../../myproject_data/6.mp4'
compare_two_videos(model, source0, source1)

Frame is empty


In [53]:
# Compare video 2 (first source) with video 4 using the default thresholds.
# compare_two_videos names its output after both inputs, so this run writes '2-4.avi'
# relative to the current working directory.
source0 = '../../myproject_data/2.mp4'
source1 = '../../myproject_data/4.mp4'
compare_two_videos(model, source0, source1)

Frame is empty
