In [2]:
import torch
import os
from models.experimental import attempt_load
import cv2
from utils.datasets import letterbox
from torchvision import transforms
import numpy as np
import time
from utils.general import non_max_suppression_kpt,strip_optimizer,xyxy2xywh
from utils.plots import output_to_keypoint, plot_skeleton_kpts,colors,plot_one_box_kpt
import logging
import pandas as pd
import itertools


In [3]:
device ='cuda' if torch.cuda.is_available() else 'cpu'
model_path = '../models/yolov7-w6-pose.pt'

In [4]:
def strip_model(device, model_path):
    x = torch.load(model_path, map_location=torch.device(device))

    if x.get('ema'):
        x['model'] = x['ema']  # replace model with ema
    for k in 'optimizer', 'training_results', 'wandb_id', 'ema', 'updates':  # keys
        x[k] = None
    x['epoch'] = -1
    if device!='cpu':
        x['model'].half()  # to FP16
    else:
        x['model'].float()
    for p in x['model'].parameters():
        p.requires_grad = False

    torch.save(x, model_path)
    mb = os.path.getsize(model_path) / 1E6  # filesize
    print(f"Optimizer stripped from {model_path},{(f' saved as {model_path},') if model_path else ''} {mb:.1f}MB")

In [5]:
strip_model(device, model_path)
model = attempt_load(model_path, map_location=device)

Optimizer stripped from ../models/yolov7-w6-pose.pt, saved as ../models/yolov7-w6-pose.pt, 161.1MB
Fusing layers... 


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [6]:
_ = model.eval()
names = model.module.names if hasattr(model, 'module') else model.names  # get class names
print(f"Things to identify: {names}")

Things to identify: ['person']


To open Video capture:
 - integer that represents a webcam
 - path to a video file

In [7]:
def load_source(source):
    if source.isnumeric() :    
        cap = cv2.VideoCapture(int(source))    #pass video to videocapture object
    else :
        cap = cv2.VideoCapture(source)    #pass video to videocapture object
    if cap.isOpened() == False:   #check if videocapture not opened
        print('Source not found. Check path')
    else:
        frame_width = int(cap.get(3))  #get video frame width
        frame_height = int(cap.get(4)) #get video frame height
    return cap

### Iterating over frames
For the process of retrieving sequences of landmarks, we will have a `sequence_length` which is the amount of frames taken into consideration for a single sequence of landmark. And also we will have a `separation` which is the amount of frames *ignored* between one set of landmarks and another inside one sequence of landmarks.

We will capture landmarks every `separation` and from those captured we will create `N` arrays of length `sequence_length` containing those landmarks 

### How will we identify the same person in every iteration
First, identify the object with more landmarks identified and store that set of landmarks in `base_landmarks`. Then considering the distance between `base_landmarks` and the other objects identified in the next objects identified

In [8]:
"""
Sequence row:
    Video Timestamp Set of landmarks 
"""


def landmarks_sequence_for_video(video_path, sequence_length=10, separation=6):
    """
        Args
        Returns
    """
    sequence_length = sequence_length
    separation = separation

    count = 0

    sequences = [[]]

    cap = load_source(video_path)
    frame_width = int(cap.get(3)) 
    frame_height = int(cap.get(4))
    
    base_landmarks = None
    
    start = time.time()
    current_frame = 0
    video_name = video_path.split('/')[-1]
    current_group = 1
    while(cap.isOpened): 
        ret, frame = cap.read()

        if not ret:        
            break

        count += 1

        orig_image = frame #store frame
        image = cv2.cvtColor(orig_image, cv2.COLOR_BGR2RGB) #convert frame to RGB
        image = letterbox(image, (frame_width), stride=64, auto=True)[0]
        image_ = image.copy()
        image = transforms.ToTensor()(image)
        image = torch.tensor(np.array([image.numpy()]))

        image = image.to(device)  #convert image data to device
        image = image.float() #convert image to float precision (cpu)      


        image = image.cpu().squeeze().numpy().transpose((1, 2, 0))
        
        # this frame size works with yolov7, since we don't want to touch their model, we just resize the frame.
        desired_width = 640
        desired_height = 512
        image = cv2.resize(image, (desired_width, desired_height), interpolation=cv2.INTER_LINEAR)
        image = image[:desired_height, :desired_width]

        # Convert the cropped image back to a torch.Tensor
        image = torch.from_numpy(image.transpose((2, 0, 1))).unsqueeze(0).cuda()  

        with torch.no_grad():  #get predictions
            output_data, _ = model(image)

        output_data = non_max_suppression_kpt(output_data,   #Apply non max suppression
                                        0.25, # Conf. Threshold.
                                        0.65, # IoU Threshold.
                                        nc=model.yaml['nc'], # Number of classes.
                                        nkpt=model.yaml['nkpt'], # Number of keypoints.
                                        kpt_label=True)

        output = output_to_keypoint(output_data)

        im0 = image[0].permute(1, 2, 0) * 255 # Change format [b, c, h, w] to [h, w, c] for displaying the image.
        im0 = im0.cpu().numpy().astype(np.uint8)

        im0 = cv2.cvtColor(im0, cv2.COLOR_RGB2BGR) #reshape image format to (BGR)
        gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh

        for i, pose in enumerate(output_data):  # detections per image   
            if len(output_data):  #check if no pose
                for c in pose[:, 5].unique(): # Print results
                    n = (pose[:, 5] == c).sum()  # detections per class

                for det_index, (*xyxy, conf, cls) in enumerate(reversed(pose[:,:6])): #loop over poses for drawing on frame
                    c = int(cls)  # integer class
                    kpts = pose[det_index, 6:]
                    label = None # if opt.hide_labels else (names[c] if opt.hide_conf else f'{names[c]} {conf:.2f}')
                    plot_one_box_kpt(xyxy, im0, label=label, color=colors(c, True), 
                                    line_thickness=3,kpt_label=True, kpts=kpts, steps=3, 
                                    orig_shape=im0.shape[:2])


        if count == separation:
            # cv2.imshow("YOLOv7 Pose Estimation Demo", im0)

            if len(sequences[-1]) >= sequence_length:
                sequences += [[]] # init new empty sequence
                current_group += 1
            else:
                # TODO: make sure that the landmarks stored are the desired ones
                # Use position difference.
                if len(output):
                    sequences[-1] += [[video_name, current_group, current_frame, output[0]]]

            count = 0
            

        # Press Q on keyboard to  exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break
        current_frame += 1

    cap.release()
    print(f"\tfinished after {round(time.time() - start)}s")
    return sequences

In [12]:
# landmark retrieval
videos = ['../assets/dataset_videos/' + v for v in os.listdir('../assets/dataset_videos') if v.endswith('.mp4')]
dataset = []
print(videos)

['../assets/dataset_videos/video56.mp4', '../assets/dataset_videos/video30.mp4', '../assets/dataset_videos/video36.mp4', '../assets/dataset_videos/video24.mp4', '../assets/dataset_videos/video37.mp4', '../assets/dataset_videos/video47.mp4', '../assets/dataset_videos/video35.mp4', '../assets/dataset_videos/video31.mp4', '../assets/dataset_videos/video32.mp4', '../assets/dataset_videos/video21.mp4', '../assets/dataset_videos/video26.mp4', '../assets/dataset_videos/video40.mp4', '../assets/dataset_videos/video54.mp4', '../assets/dataset_videos/video20.mp4', '../assets/dataset_videos/video53.mp4', '../assets/dataset_videos/video55.mp4', '../assets/dataset_videos/video50.mp4', '../assets/dataset_videos/video44.mp4', '../assets/dataset_videos/video25.mp4', '../assets/dataset_videos/video41.mp4', '../assets/dataset_videos/video51.mp4', '../assets/dataset_videos/video19.mp4', '../assets/dataset_videos/video38.mp4', '../assets/dataset_videos/video34.mp4', '../assets/dataset_videos/video48.mp4',

In [13]:

for i, v in enumerate(videos):
    print(f"Processing video #{i} at '{v}'")
    sequences = landmarks_sequence_for_video(v)
    dataset += [sequences]


Processing video #0 at '../dataset_videos/video3.mp4'
	finished after 19s
Processing video #1 at '../dataset_videos/video11.mp4'
	finished after 16s
Processing video #2 at '../dataset_videos/video5.mp4'
	finished after 46s
Processing video #3 at '../dataset_videos/video14.mp4'
	finished after 14s
Processing video #4 at '../dataset_videos/video12.mp4'
	finished after 31s
Processing video #5 at '../dataset_videos/video4.mp4'
	finished after 27s
Processing video #6 at '../dataset_videos/video2.mp4'
	finished after 19s
Processing video #7 at '../dataset_videos/video9.mp4'
	finished after 17s
Processing video #8 at '../dataset_videos/video7.mp4'
	finished after 12s
Processing video #9 at '../dataset_videos/video13.mp4'
	finished after 10s
Processing video #10 at '../dataset_videos/video15.mp4'
	finished after 51s
Processing video #11 at '../dataset_videos/video1.mp4'
	finished after 18s
Processing video #12 at '../dataset_videos/video10.mp4'
	finished after 33s
Processing video #13 at '../d

In [15]:
# this merges every set of landmarks. 
flatten_data = list(itertools.chain.from_iterable(dataset))
flatten_data = list(itertools.chain.from_iterable(flatten_data))

In [16]:
df = pd.DataFrame(columns='video group frame landmarks'.split(), data=flatten_data)

In [17]:
df.head()

Unnamed: 0,video,group,frame,landmarks
0,video3.mp4,1,5,"[0.0, 0.0, 243.16107177734375, 252.64248657226..."
1,video3.mp4,1,11,"[0.0, 0.0, 244.4012451171875, 251.631011962890..."
2,video3.mp4,1,17,"[0.0, 0.0, 242.58554077148438, 252.24295043945..."
3,video3.mp4,1,23,"[0.0, 0.0, 237.6522216796875, 252.982452392578..."
4,video3.mp4,1,29,"[0.0, 0.0, 224.66685485839844, 254.49746704101..."


In [18]:
df.to_csv('../assets/extracted_landmarks.csv', index=False)

In [19]:
df[df['video'] == 'video3.mp4']

Unnamed: 0,video,group,frame,landmarks
0,video3.mp4,1,5,"[0.0, 0.0, 243.16107177734375, 252.64248657226..."
1,video3.mp4,1,11,"[0.0, 0.0, 244.4012451171875, 251.631011962890..."
2,video3.mp4,1,17,"[0.0, 0.0, 242.58554077148438, 252.24295043945..."
3,video3.mp4,1,23,"[0.0, 0.0, 237.6522216796875, 252.982452392578..."
4,video3.mp4,1,29,"[0.0, 0.0, 224.66685485839844, 254.49746704101..."
5,video3.mp4,1,35,"[0.0, 0.0, 214.17312622070312, 251.88236999511..."
6,video3.mp4,1,41,"[0.0, 0.0, 205.13592529296875, 252.50271606445..."
7,video3.mp4,1,47,"[0.0, 0.0, 203.34231567382812, 252.56079101562..."
8,video3.mp4,1,53,"[0.0, 0.0, 207.150634765625, 253.4675903320312..."
9,video3.mp4,1,59,"[0.0, 0.0, 221.7003173828125, 254.099411010742..."
