In [3]:
import json
import numpy as np
import os
import glob
from torchvision.datasets.video_utils import VideoClips, unfold
import torch
from torchvision.io import read_video
from torchvision.io.video import read_video_timestamps
import math

In [4]:
# --- Paths and sampling configuration for the pose-feature extraction ---
# NOTE(review): `video_dir` is not referenced by any other visible cell.
video_dir = glob.glob('/mnt1/arnav/pose-estimation/oops-videos/*.mp4')
# Root folder holding one sub-directory per processed video, each with a track_dict.json.
results_dir = '/mnt1/arnav/pose-estimation/results3/'
# Folder containing the cached VideoClips metadata file ("<mode>_oops_clips.pth").
video_cache_path = '/mnt1/arnav/oops/r3d_feats_25fps'
path_to_features = '/mnt1/arnav/oops/i3d_25fps_feats/rgb_feats.npy' # to verify shape
# allow_pickle=True unpickles arbitrary objects: only load files from a trusted source.
dict_features = np.load(path_to_features, encoding='bytes', allow_pickle=True)
# Dataset split; used as a sub-directory of `fails_path` and in cache/output file names.
mode = 'train'
# Path to the annotation JSON. A later cell rebinds this name to the parsed content.
# NOTE(review): these two paths live under /mnt while the others use /mnt1 - confirm intended.
fails_data = '/mnt/arnav/oops_dataset/annotations/transition_times.json'
fails_path = '/mnt/arnav/oops_dataset/oops_video/'
# JSON file whose inverse is used later to go from a video name to its results sub-directory.
video_mapper_path = '/mnt1/arnav/pose-estimation/mapper.json'
fails_path = os.path.join(fails_path, mode)
# Frame rate the clips are resampled to.
target_fps = 25
# Stride between consecutive clips, in seconds (converted to frames in a later cell).
step_between_clips_sec = 0.64
# Number of frames in each clip window.
frames_per_clip = 16

In [5]:
# Load the mapper JSON and build its inverse. The inverse is indexed by video name
# later on to find that video's sub-directory under `results_dir`.
with open(video_mapper_path, 'r') as f:
    video_mapper = json.load(f)
reverse_video_mapper = {value: key for key, value in video_mapper.items()}

# Every .mp4 below the split directory, at any depth.
video_list = glob.glob(os.path.join(fails_path, '**', '*.mp4'), recursive=True)

# `fails_data` starts out as the annotation file path and is rebound to the parsed
# JSON here. The isinstance guard keeps this cell re-runnable (a second run would
# otherwise try to open() the already-parsed object), and the context manager
# closes the file handle instead of leaking it.
if isinstance(fails_data, str):
    with open(fails_data, 'r') as f:
        fails_data = json.load(f)

# Keep only videos that have an annotation entry.
video_list = [vid for vid in video_list if os.path.splitext(os.path.basename(vid))[0] in fails_data]

# Names of the videos to extract pose features for.
# allow_pickle=True unpickles arbitrary objects: only load files from a trusted source.
new_oops_vidnames = set(np.load('../../Oops-localization/videoname.npy', allow_pickle=True).tolist())

In [6]:
def get_precomputed_metadata(video_list,cache_path, mode):
    """Load cached clip metadata for a split and point it at `video_list`.

    The metadata file is expected at ``<cache_path>/<mode>_oops_clips.pth``.
    It is loaded unconditionally with ``torch.load`` (there is no existence
    check, so a missing file raises from ``torch.load``).

    Args:
        video_list: Video paths stored under the ``"video_paths"`` key of the
            loaded metadata, replacing whatever the cache held.
        cache_path: Directory containing the metadata file.
        mode: Split name used as the file-name prefix.

    Returns:
        Tuple ``(metadata, metadata_file)``: the loaded (and updated) object
        and the full path it was read from.
    """
    metadata_file = os.path.join(cache_path, f"{mode}_oops_clips.pth")
    print(f"Cache dataset: True, Cache path: {metadata_file}")
    metadata = torch.load(metadata_file)
    # Overwrite the stored paths so they match the caller's current list.
    metadata["video_paths"] = video_list
    return metadata, metadata_file

In [7]:
# Load the cached clip metadata for this split and re-target it at `video_list`.
precomputed_metadata, cache_path = get_precomputed_metadata(video_list, video_cache_path,mode)
# Convert the clip stride from seconds to a whole number of frames at `target_fps`.
step_between_clips = round(step_between_clips_sec * target_fps)
# Build the clip index from the cached metadata instead of re-reading every video.
video_clips = VideoClips(video_list, frames_per_clip, step_between_clips, target_fps, _precomputed_metadata = precomputed_metadata)
# NOTE(review): the constructor was already given the same clip length, stride and
# frame rate - presumably it computes the clips itself; confirm this call is needed.
video_clips.compute_clips(frames_per_clip, step_between_clips, target_fps)


Cache dataset: True, Cache path: /mnt1/arnav/oops/r3d_feats_25fps/train_oops_clips.pth


In [8]:
# Keypoint name -> row index for poses in the 18-keypoint COCO layout
# (index 18 is the background channel). Looked up by `vectorize` for mode='coco'.
COCO_MAP = {
    "Nose": 0,
    "Neck": 1,
    "RShoulder": 2,
    "RElbow": 3,
    "RWrist": 4,
    "LShoulder": 5,
    "LElbow": 6,
    "LWrist": 7,
    "RHip": 8,
    "RKnee": 9,
    "RAnkle": 10,
    "LHip": 11,
    "LKnee": 12,
    "LAnkle": 13,
    "REye": 14,
    "LEye": 15,
    "REar": 16,
    "LEar": 17,
    "Background": 18
}

# Keypoint map looked up by `vectorize` for mode='body_25'.
# NOTE(review): despite the name, indices 8-18 follow the COCO ordering above.
# In genuine BODY_25 output "Midhip" sits at index 8, the hip/knee/ankle/eye/ear
# entries are each one higher, and "Background" is 25. These values are kept
# exactly as they were effectively bound before (an earlier, immediately
# overwritten definition with the genuine ordering was dead code and has been
# removed), because changing them would silently change the generated features.
BODY_25_MAP = {
    "Nose": 0,
    "Neck": 1,
    "RShoulder": 2,
    "RElbow": 3,
    "RWrist": 4,
    "LShoulder": 5,
    "LElbow": 6,
    "LWrist": 7,
    "RHip": 8,
    "RKnee": 9,
    "RAnkle": 10,
    "LHip": 11,
    "LKnee": 12,
    "LAnkle": 13,
    "REye": 14,
    "LEye": 15,
    "REar": 16,
    "LEar": 17,
    "LBigToe": 19,
    "LSmallToe": 20,
    "LHeel": 21,
    "RBigToe": 22,
    "RSmallToe": 23,
    "RHeel": 24,
    "Background":18
}

# Limb segments as (start keypoint, end keypoint) name pairs, listed per body side
# from the neck outwards: arms first, then legs.
_left_limbs = (
    ('Neck', 'LShoulder'), ('LShoulder', 'LElbow'), ('LElbow', 'LWrist'),
    ('Neck', 'LHip'), ('LHip', 'LKnee'), ('LKnee', 'LAnkle'),
)
_right_limbs = (
    ('Neck', 'RShoulder'), ('RShoulder', 'RElbow'), ('RElbow', 'RWrist'),
    ('Neck', 'RHip'), ('RHip', 'RKnee'), ('RKnee', 'RAnkle'),
)

# Interleave the two sides (left segment, then its right counterpart) so the
# order of the resulting list - and therefore of the feature columns - is fixed.
connections = [limb for side_pair in zip(_left_limbs, _right_limbs) for limb in side_pair]

In [9]:
def vectorize(pose, mode='body_25'):
    """Turn one frame of 2-D keypoints into a unit direction vector per limb.

    For every (start, end) pair in the module-level ``connections`` list the
    normalised vector from the start keypoint to the end keypoint is computed.
    A limb is reported as ``[0, 0]`` when either endpoint is missing (both of
    its coordinates are exactly 0) or when both endpoints coincide, so the
    result never contains NaN.

    Args:
        pose: Indexable by keypoint index; each entry exposes x at ``[0]`` and
            y at ``[1]`` (any further entries are ignored).
        mode: ``'body_25'`` to look keypoints up in ``BODY_25_MAP`` or
            ``'coco'`` to look them up in ``COCO_MAP``.

    Returns:
        Float ``np.ndarray`` of shape ``(len(connections), 2)``.

    Raises:
        ValueError: If ``mode`` is not one of the supported names.
    """
    if mode == 'body_25':
        mapper = BODY_25_MAP
    elif mode == 'coco':
        mapper = COCO_MAP
    else:
        raise ValueError(f"Unsupported mode {mode!r}; expected 'body_25' or 'coco'")

    pose_vector = []
    for start_name, end_name in connections:
        c1, c2 = pose[mapper[start_name]], pose[mapper[end_name]]
        # A keypoint at exactly (0, 0) is treated as "not detected".
        if (c1[0]==0.0 and c1[1]==0.0) or (c2[0]==0.0 and c2[1]==0.0):
            pose_vector.append(np.zeros(2))
            continue
        num = np.array([c2[0]-c1[0],c2[1]-c1[1]])
        den = np.sqrt( np.square(num[0]) + np.square(num[1]) )
        if den == 0:
            # Coincident endpoints have no direction; avoid a 0/0 -> NaN entry.
            pose_vector.append(np.zeros(2))
            continue
        pose_vector.append(num/den)
    return np.array(pose_vector)

In [10]:
def reshape_poses(poses, mode='body_25'):
    """Vectorise every frame of a single track.

    Args:
        poses: Iterable of per-frame keypoint collections; each one is
            converted to an ``np.ndarray`` before being handed to ``vectorize``.
        mode: Keypoint layout name, forwarded unchanged to ``vectorize``.

    Returns:
        ``np.ndarray`` stacking the per-frame results in input order.
    """
    per_frame = []
    for frame_pose in poses:  # one entry per frame
        per_frame.append(vectorize(np.array(frame_pose), mode=mode))
    return np.array(per_frame)
    

In [11]:
def pose2vec(track_dict, top_k=1, mode='body_25'):
    """Build a fixed-size pose-vector array from the most-observed tracks.

    Tracks are ranked by their ``'hits'`` value (highest first) and the best
    ``top_k`` are kept. Each kept track's ``'poses'`` is vectorised with
    ``reshape_poses`` and its per-frame output flattened. If fewer than
    ``top_k`` tracks exist, all-zero tracks are appended.

    NOTE(review): ``mode`` is accepted but not forwarded to ``reshape_poses``,
    so that function's default layout is always used.

    Args:
        track_dict: Mapping from track id to a record with ``'hits'`` and
            ``'poses'`` entries.
        top_k: Number of tracks in the output.
        mode: Keypoint layout name (currently unused, see note above).

    Returns:
        ``np.ndarray`` of shape ``(top_k, num_frames, features_per_frame)``.
    """
    best_tracks = sorted(track_dict.items(), key=lambda item: -item[1]['hits'])[:top_k]
    # One array per person: (num_frames, num_connections, 2).
    stacked = np.array([reshape_poses(track['poses']) for _, track in best_tracks])
    n_tracks, n_frames = stacked.shape[0], stacked.shape[1]
    # Flatten each frame's (num_connections, 2) block into a single vector.
    flat = stacked.reshape(n_tracks, n_frames, -1)
    # Zero-pad along the track axis only, up to top_k tracks.
    n_missing = top_k - n_tracks
    return np.pad(flat, ((0, n_missing), (0, 0), (0, 0)), mode='constant', constant_values=0)


In [12]:
# Build, for every selected video, a (num_clips, features) matrix of pose vectors
# sampled on the same clip grid as the reference features, then save the
# resulting dict to disk.
pose_feature_dict = {}
# Unwrap the reference feature mapping once instead of on every iteration.
reference_features = dict_features.item()
for video_idx in range(len(video_clips.clips)):
    video_path = video_clips.video_paths[video_idx]
    videoname = os.path.splitext(os.path.basename(video_path))[0]
    if videoname not in new_oops_vidnames:
        continue
    video_fps = video_clips.video_fps[video_idx]
    video_pts = video_clips.video_pts[video_idx]
    mapped_vidname = reverse_video_mapper[videoname]

    # Frame indices that resample the video from its native fps to target_fps.
    total_frames = int(math.floor(len(video_pts) * (float(target_fps) / video_fps)))
    idxs = VideoClips._resample_video_idx(total_frames, video_fps, target_fps)

    # Read the tracks and close the file before doing the heavy work.
    with open(os.path.join(results_dir,mapped_vidname,'track_dict.json'),'r') as f:
        track_dict = json.load(f)

    vec = pose2vec(track_dict, top_k=2,mode='coco')  # (num_tracks, num_frames, feat)
    num_frames = vec.shape[1]
    # Move the frame axis to the front BEFORE flattening. Reshaping the
    # (tracks, frames, feat) array directly to (frames, -1) would fill each row
    # with values from different frames; after the transpose, row t holds the
    # features of every track at frame t.
    vec = vec.transpose(1, 0, 2).reshape(num_frames, -1)
    vec = vec[idxs]

    # Slide a window of frames_per_clip frames with the clip stride, then
    # flatten each window into one feature row.
    new_vec = torch.tensor(vec).unfold(0,frames_per_clip, step_between_clips)
    new_vec = new_vec.reshape(new_vec.shape[0],-1).detach().cpu().numpy()
    # The number of clips must match the reference features for this video.
    assert new_vec.shape[0] == reference_features[videoname].shape[0], (
        f"clip count mismatch for {videoname}"
    )
    pose_feature_dict[videoname]=new_vec

np.save(f'/mnt1/arnav/oops/pose_25fps_feats/{mode}_coco_feats.npy',pose_feature_dict)
        
        