# Ego4D Annotation Visualizations
This notebook contains visualizations for Ego4D's different annotation types.

## Prerequisites
1. Use the [Ego4D CLI](https://ego4d-data.org/docs/start-here/) to download the annotations and full_scale datasets. This notebook expects all videos to be downloaded, but you can manually download them as you go.
2. Install all the packages in this notebook using `requirements.txt`.

## **Useful Links:**

[Ego4D Docs - Start Here!](https://ego4d-data.org/docs/start-here/#Download-The-CLI)

[Data Overview](https://ego4d-data.org/docs/data-overview/)

[Official Ego4D Site](https://ego4d-data.org/)

# Setup

In [None]:
%reload_ext autoreload
%autoreload 2

import os
# Set your options here
# Sampled Videos will be downloaded to <version>/full_scale/ if they aren't already there

# Root directory passed to the Ego4D CLI as --output_directory.
# Export EGO4D_CLI_OUTPUT_DIR to point the notebook at another location without
# editing this cell; the previous hard-coded location remains the fallback.
CLI_OUTPUT_DIR = os.environ.get("EGO4D_CLI_OUTPUT_DIR", "/misc/lmbraid19/argusm/CLUSTER/ego4d_data")
VERSION = "v2"
MANIFEST_PATH = os.path.join(CLI_OUTPUT_DIR, 'manifest.csv') # Use this if manifest is at <output_directory>/manifest.csv
# MANIFEST_PATH = os.path.join(CLI_OUTPUT_DIR, VERSION, 'full_scale' ,'manifest.csv') # Use this if manifest is at <version>/full_scale/

# Fail early with a readable message instead of a late file-not-found in a later cell.
assert os.path.exists(MANIFEST_PATH), f"Manifest doesn't exist at {MANIFEST_PATH}. Is the CLI_OUTPUT_DIR right? Do you satisfy the pre-requisites?"

In [None]:
#!pip install -r requirements.txt

In [None]:
# Setup
import sys
if os.path.abspath(".") not in sys.path: # Allow us to use util files in the same dir
    sys.path.insert(0, os.path.abspath("."))

import av
import collections
import csv
import cv2
import functools
import json
import logging
import math
import matplotlib.collections as mc
import matplotlib.image as mpimg
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

import numpy as np
import pandas as pd
import random
import uuid
import warnings

from celluloid import Camera
from IPython.display import HTML
from iopath.common.file_io import PathManager
from itertools import groupby
from pprint import pprint
from nb_video_utils import _get_frames, _get_frames2

%matplotlib inline
plt.rcParams["animation.html"] = "jshtml"

pathmgr = PathManager()

warnings.filterwarnings('ignore')

def vid_df_des(df):
    """Describe a video table as "#<number of rows> <summed duration in hours>h"."""
    total_hours = df.duration_sec.sum() / 60 / 60
    return f"#{len(df)} {total_hours:.1f}h"
def vid_des(videos):
    """Describe a sequence of video records (each exposing `.duration_sec`)
    as "#<count> <summed duration in hours>h"."""
    count = len(videos)
    total_seconds = sum(video.duration_sec for video in videos)
    return f"#{count} {total_seconds/60/60:.1f}h"
def deserialize_str_list(list_: str):
    """Turn the text form of a list of strings (e.g. "['a', 'b']") back into a list.

    The text is first parsed as a Python literal, which also handles the empty
    list and items that are written with double quotes. If that does not yield
    a list of strings, the original slicing/splitting heuristic is used so that
    inputs accepted before keep producing the same result.

    Args:
        list_: text such as "['Cooking', 'Cleaning / laundry']".

    Returns:
        The list of item strings.
    """
    import ast  # local import: not part of the notebook's import cell

    try:
        parsed = ast.literal_eval(list_)
    except (ValueError, SyntaxError):
        parsed = None
    if isinstance(parsed, list) and all(isinstance(item, str) for item in parsed):
        return parsed

    # Legacy fallback: drop the leading "[" and the trailing "']", split on the
    # item separator, then remove the quote left on the first item.
    inner = list_[1:-2]
    return [piece.strip("'") for piece in inner.split("', '")]
def to_1D(series):
    """Flatten a series of lists into one pandas Series holding all their items in order."""
    flattened = []
    for sub_list in series:
        flattened.extend(sub_list)
    return pd.Series(flattened)

# Video API


In [None]:
# Load the manifest and convert the text-encoded 'scenarios' column into real lists.
videos_df = pd.read_csv(MANIFEST_PATH)
videos_df['scenarios'] = videos_df['scenarios'].apply(deserialize_str_list)

def get_video(video_uid='353ae622-c322-443e-95b4-e9927dedfa1c'):
    """Return the first manifest row whose 'video_uid' equals `video_uid`.

    Raises IndexError when no row matches.
    """
    matching_rows = videos_df[videos_df['video_uid'] == video_uid]
    return matching_rows.iloc[0]

print(f"R1 Videos: {vid_df_des(videos_df)}")

# Visualization Utils

In [None]:
# in: video_path, frame_meta: [{ frame_number, ... }], format (pixel format passed to to_ndarray)
# out: generator of numpy arrays, one per decoded frame

def get_frames_from_video(video_path, frame_meta, format="rgb24"):
    """Lazily decode the requested frames of a video as numpy arrays.

    Only the 'frame_number' key of each `frame_meta` entry is read; entries do
    not need a 'boxes' key. The video stays open while the generator is being
    consumed and is closed when it is exhausted or closed.

    Args:
        video_path: path of the video file (converted with str()).
        frame_meta: iterable of dicts that each contain 'frame_number'.
        format: pixel format handed to `frame.to_ndarray`.

    Yields:
        One array per frame returned by `_get_frames2`.
        NOTE(review): presumably in the order of the requested frame numbers -
        confirm against nb_video_utils.
    """
    frame_number_list = [f['frame_number'] for f in frame_meta]
    with av.open(str(video_path)) as input_video:
        for frame in _get_frames2(frame_number_list, input_video, include_audio=False, audio_buffer_frames=0):
            yield frame.to_ndarray(format=format)

def render_frames_with_bboxes(video_path, frame_meta, draw_bboxes=True, draw_labels=True, draw_arrows=None):
    """Draw bounding boxes (and optional arrows) onto video frames and save them as JPEGs.

    Args:
        video_path: path of the video to decode (converted with str()).
        frame_meta: list of dicts, each with 'frame_number' and 'boxes'; every box is
            {'object_type': label, 'bbox': {'x', 'y', 'width', 'height'}}.
        draw_bboxes: draw the box rectangles when True.
        draw_labels: additionally draw the label text above each labelled box.
        draw_arrows: optional mapping label -> {'start': point, 'end': point}; every
            arrow is drawn on every rendered frame. None (default) draws no arrows.

    Returns:
        List with the path of one JPEG file per decoded frame, written to the
        system temporary directory.

    Raises:
        OSError: if OpenCV reports that a rendered frame could not be written.
    """
    import tempfile  # local import: not part of the notebook's import cell

    if draw_arrows is None:  # avoid a shared mutable default argument
        draw_arrows = {}

    colormap = { # Custom colors for FHO annotations
        'object_of_change': (0, 255, 255),
        'left_hand': (0, 0, 255),
        'right_hand': (0, 255, 0)
    }
    defaultColor = (255, 255, 0)
    rect_thickness = 1
    rectLineType = cv2.LINE_4
    fontColor = (0, 0, 0)
    fontFace = cv2.FONT_HERSHEY_DUPLEX
    fontScale = 1
    fontThickness = 1
    out_dir = tempfile.gettempdir()

    frame_number_list = [f['frame_number'] for f in frame_meta]
    frame_bboxes_list = [f['boxes'] for f in frame_meta]
    paths = []
    with av.open(str(video_path)) as input_video:
        frames = list(_get_frames2(frame_number_list, input_video, include_audio=False, audio_buffer_frames=0))
        # zip stops at the shortest input, so frames that could not be decoded are skipped silently.
        for frame, frame_number, boxes in zip(frames, frame_number_list, frame_bboxes_list):
            img = frame.to_ndarray(format="bgr24")
            if draw_bboxes:
                for box in boxes:
                    label, bbox = box['object_type'], box['bbox']
                    rectColor = colormap.get(label, defaultColor) if label else defaultColor
                    x, y, width, height = (int(bbox[k]) for k in ('x', 'y', 'width', 'height'))
                    cv2.rectangle(img, pt1=(x,y), pt2=(x+width, y+height), color=rectColor, thickness=rect_thickness, lineType=rectLineType)
                    if label and draw_labels:
                        text = str(label)  # OpenCV text functions need a str; labels may be other types
                        textSize, baseline = cv2.getTextSize(text, fontFace, fontScale, fontThickness)
                        textWidth, textHeight = textSize
                        # Filled rectangle above the box serves as the text background.
                        cv2.rectangle(img, pt1=(x - rect_thickness//2, y - rect_thickness//2), pt2=(x + textWidth + 10 + rect_thickness, y - textHeight - 10 - rect_thickness), color=rectColor, thickness=-1)
                        cv2.putText(img, text=text, org=(x + 10, y - 10), fontFace=fontFace, fontScale=fontScale, color=fontColor, thickness=fontThickness, lineType=cv2.LINE_AA)
            for label, arrow_dict in draw_arrows.items():
                arrColor = colormap.get(label, defaultColor) if label else defaultColor
                cv2.arrowedLine(img, arrow_dict["start"], arrow_dict["end"],
                                arrColor, rect_thickness, rectLineType)
            # The uuid suffix keeps repeated renders of the same frame from overwriting each other.
            path = os.path.join(out_dir, f"{frame_number}_{str(uuid.uuid1())}.jpg")
            if not cv2.imwrite(path, img):
                raise OSError(f"Could not write rendered frame to {path}")
            paths.append(path)
    return paths

# in: video_path, frames: [{ frame_number, frame_type, boxes: [{ object_type, bbox: {x, y, width, height} }] }]
# out: void; as a side-effect, renders frames from the video with matplotlib
def plot_frames_with_bboxes(video_path, frames, max_cols = 3):
    """Show the given frames, with their bounding boxes drawn, as a grid of images.

    Args:
        video_path: path of the video the frames come from.
        frames: list of frame dicts (see the comment above); 'frame_type' is used
            as the title of each grid cell.
        max_cols: maximum number of grid columns.

    Returns:
        None. Nothing is plotted when `frames` is empty.
    """
    if len(frames) == 0:
        # With no frames the column count would be 0 and the row computation would divide by zero.
        return

    cols = min(max_cols, len(frames))
    rows = math.ceil(len(frames) / cols)
    # squeeze=False always returns a 2D array of axes, also for a single frame.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(10*cols, 7 * rows), squeeze=False)
    flat_axes = axes.ravel()
    if len(frames) > 1:
        for axi in flat_axes: # Hide axes, including unused grid cells
            axi.set_axis_off()

    frame_paths = render_frames_with_bboxes(video_path, frames)
    for idx, (frame_data, frame_path) in enumerate(zip(frames, frame_paths)):
        flat_axes[idx].title.set_text(frame_data['frame_type'])
        flat_axes[idx].imshow(mpimg.imread(frame_path, format='jpeg'))
    plt.subplots_adjust(wspace=.05, hspace=.05)
    plt.show()
    


***

# Forecasting Hands and Objects (FHO)
A **video** has one or more **intervals** which contain **actions** made up of **frames** which define **bounding boxes**.

[Data Overview](https://ego4d-data.org/docs/benchmarks/hands-and-objects/)


In [None]:
# Load <CLI_OUTPUT_DIR>/<VERSION>/annotations/fho_main.json into `fho_annotations`.
# This takes long.
with pathmgr.open(os.path.join(CLI_OUTPUT_DIR, VERSION, 'annotations', 'fho_main.json'), "r") as f:
    fho_annotations = json.load(f)

In [None]:
# uid of every entry in the FHO "videos" list, in file order (position == index into that list)
fho_video_uids = [entry["video_uid"] for entry in fho_annotations["videos"]]

def get_annotation(video_uid='353ae622-c322-443e-95b4-e9927dedfa1c'):
    """Return the FHO annotation entry of `video_uid`.

    Raises ValueError when the uid is not part of the FHO annotations.
    """
    position = fho_video_uids.index(video_uid)
    videos = fho_annotations["videos"]
    return videos[position]

In [None]:
# Optional inspection helpers; flip a flag to True to print the structure of the annotations.
debug_show_annotation = False
if debug_show_annotation:
    # Print the metadata and the interval keys of one specific video.
    ann = get_annotation(video_uid="d38f271c-53f3-4771-8c17-a5157e723067")
    print(ann["video_metadata"])
    print(ann["annotated_intervals"][0].keys())

debug_show_narration = False
if debug_show_narration:
    # Pick one narrated action by video / interval / action position.
    video_i = 1
    inter_i = 0
    action_i = 0
    narr_a = fho_annotations["videos"][video_i]["annotated_intervals"][inter_i]["narrated_actions"][action_i]
    # NOTE(review): inside an `if` block this bare expression is not echoed by the notebook;
    # wrap it in print() if the keys should be shown.
    narr_a.keys()

The following section filters videos based on the narration text. Note, however, that some videos (e.g. the one below) are annotated only with fho_scod:object_state_changes.

`d38f271c-53f3-4771-8c17-a5157e723067`

In [None]:
def is_valid_action(action) -> bool:
    """Tell whether a narrated action is usable.

    An action is usable when it is not flagged as an invalid annotation, is not
    rejected, and has a 'stage' that is not None.
    """
    unusable = action["is_invalid_annotation"] or action["is_rejected"] or action['stage'] is None
    return not unusable

def filter_actions(video, text_q="door ", action_q="open"):
    """Yield the valid narrated actions of a video that match a text and verb query.

    Args:
        video: FHO annotation entry with 'annotated_intervals', each holding 'narrated_actions'.
        text_q: substring that must occur in the action's 'narration_text'.
        action_q: required value of 'structured_verb'; None accepts every verb.

    Yields:
        Matching action dicts, in interval order and then action order.
    """
    for interval in video["annotated_intervals"]:
        for narr_action in interval["narrated_actions"]:
            if not is_valid_action(narr_action):
                continue
            if text_q not in narr_action["narration_text"]:
                continue
            if action_q is not None and narr_action["structured_verb"] != action_q:
                continue
            yield narr_action

# Structured verbs / narration texts of the matched actions. They are created
# unconditionally so that later cells which read them also work on a fresh
# kernel while the debug flag below is off (they are then simply empty).
all_sa = []
all_nt = []

debug_filter_actions = False
if debug_filter_actions:
    # Query to scan for; another example is text_q, action_q = "door", "open"
    text_q, action_q = "hammer", "take_(pick,_grab,_get)"

    count = 0
    for video in fho_annotations["videos"]:
        actions = list(filter_actions(video, text_q, action_q))
        count += len(actions)
        for action in actions:
            all_sa.append(action["structured_verb"])
            all_nt.append(action["narration_text"])
            print(video["video_uid"], action["narration_text"], "(", len(actions),"actions )")
    print("done.", count)

In [None]:
from copy import deepcopy
METADATA_PATH = os.path.join(CLI_OUTPUT_DIR, "ego4d.json")

def load_json_from_path(path):
    """Read the file at `path` and return its decoded JSON content."""
    with open(path) as handle:
        content = json.load(handle)
    return content
 
# Raw content of ego4d.json
meta = load_json_from_path(METADATA_PATH)

# Per-video lookup keyed by video_uid. Each value is the video's top-level
# fields (without the nested 'video_metadata' dict) merged with the fields of
# 'video_metadata'; on a name clash the 'video_metadata' value wins because it
# is unpacked last.
metadata = {
    video['video_uid']: {
        **{
            k: v
            for k, v in video.items()
                if k != 'video_metadata'
        },
        **video['video_metadata']
    }
    for video in meta['videos']
}
def scale_ratio(video_uid, new_height):
    """Return `new_height` divided by the video's 'display_resolution_height' from `metadata`."""
    target_height = float(new_height)
    source_height = metadata[video_uid]['display_resolution_height']
    return target_height / source_height

# scale bboxes - works for the schemas of av, fho_scod, and vq
def scale_bboxes(bboxes, sr):
    """Return a deep copy of `bboxes` whose 'bbox' x/y/width/height are multiplied by `sr`.

    The input list and its dicts are left untouched.
    """
    scaled = deepcopy(bboxes)
    for source, target in zip(bboxes, scaled):
        for field in ('x', 'y', 'width', 'height'):
            target['bbox'][field] = source['bbox'][field] * sr
    return scaled

def filter_frames_type(frames, frame_types):
    """Keep the frames whose 'frame_type' is listed in `frame_types`, ordered like that list.

    `frame_types` must be in the desired output order. A `frames` value of None
    yields an empty list.
    """
    if frames is None:
        return []
    kept = [fr for fr in frames if fr['frame_type'] in frame_types]
    kept.sort(key=lambda fr: fr is not None and frame_types.index(fr['frame_type']))
    return kept

def filter_frames(frames, required_objects):
    """Return the frames whose boxes contain every object type in `required_objects`."""
    def has_all_required(frame):
        present = [box['object_type'] for box in frame['boxes']]
        return all(obj in present for obj in required_objects)

    return [frame for frame in frames if has_all_required(frame)]

### Plot a specific video

In [None]:
# Plot the critical frames of the first matching action of one hand-picked video.
debug_show_video = True
if debug_show_video:
    fho_video_uid = "cde41c4f-50d1-4910-9f2a-4c7b6987df92" # normal door
    #fho_video_uid = "5d8ebbea-6371-4be5-b611-751fb49ec294"  # car door
    #fho_video_uid = "40eeac41-9ec3-4960-b0b3-77074a6ad5b3" # normal door
    #fho_video_uid = "8c335aa9-5caf-4e54-bffb-031ffd452888"
    #fho_video_uid = "aeadb544-fd9f-4bb8-bfbd-fbefd899ec8e"
    text_q="door"; action_q="open"
    
    # If this errors out, make sure you used the cli to download the video with this uid
    # Note: the file is looked up in the 'video_540ss' folder of the dataset version.
    fho_video_path = os.path.join(CLI_OUTPUT_DIR, VERSION, 'video_540ss', fho_video_uid + '.mp4')
    print(fho_video_path)
    assert os.path.exists(fho_video_path), f"Video {fho_video_uid} not found. Download it with the cli using: python3 -m ego4d.cli.cli --output_directory=\"<output_dir>\" --datasets full_scale --video_uids={fho_video_uid} --yes"
    
    # Display critical frames for an action as a grid
    frame_order = ['pre_45', 'pre_30', 'pre_15', 'pre_frame', 'contact_frame', 'pnr_frame', 'post_frame']    
    actions = list(filter_actions(get_annotation(fho_video_uid), text_q, action_q))
    
    #action = random.sample(actions, 1)[0]
    # Raises IndexError when no action of this video matches the query.
    action = actions[0]
    
    frames = filter_frames_type(action["frames"], frame_order)
    print("Number of actions:", len(actions), "chose:", actions.index(action), "text:", action["narration_text"])
    
    # Rescale the box coordinates by 540 / display_resolution_height before drawing
    # them; work on a copy so the loaded annotations stay unchanged.
    sr = scale_ratio(fho_video_uid, 540)
    frames2 = deepcopy(frames)
    for fr in frames2:
        fr['boxes'] = scale_bboxes(fr['boxes'], sr)
        
    plot_frames_with_bboxes(fho_video_path, frames2)

In [None]:
import shutil
from pathlib import Path
from copy import deepcopy
from tqdm import tqdm
from PIL import Image
from IPython.display import Image as NBImage
from nb_video_utils2 import frames2hand, get_frame_centers

def fix_object_centers(centers):
    """Re-express all trajectories relative to the object position of each frame.

    For every frame index i, the object-of-change track is shifted so that all
    of its points coincide with its position at frame i, and the same per-frame
    shift is applied to both hand tracks.

    Args:
        centers: dict with 'left_hand', 'right_hand', 'object_of_change' and 'names'.
            # assumes the three tracks are numpy arrays of equal shape - TODO confirm

    Returns:
        List with one dict per frame: 'object', 'left_hand', 'right_hand'
        (shifted tracks) and 'name' (the frame's entry of centers['names']).
    """
    object_track = centers['object_of_change']
    anchored = []
    for frame_idx in range(len(centers['left_hand'])):
        # Shift that moves every object point onto the object point of this frame.
        offsets = object_track[frame_idx] - object_track
        anchored.append({
            'object': object_track + offsets,
            'left_hand': centers['left_hand'] + offsets,
            'right_hand': centers['right_hand'] + offsets,
            'name': centers['names'][frame_idx],
        })
    return anchored

def plot_trajectory(idx, centers, frames_dict, hand_choice, act_narration):
    """Plot the object and hand trajectories, anchored at frame `idx`, on a black canvas.

    Only the points from index `idx` onwards are drawn. The canvas has the
    shape of the frame image stored in `frames_dict` under the frame's name.
    The chosen hand is labelled with a trailing "*"; the other hand gets an
    empty label.
    """
    anchored_all = fix_object_centers(centers)
    palette = {'left_hand': 'red', 'right_hand': 'green', 'object_of_change': 'yellow'}
    _, ax = plt.subplots(1)
    anchored = anchored_all[idx]
    frame_array = frames_dict[anchored['name']]
    assert isinstance(frame_array, np.ndarray)
    #ax.imshow(frame_array)
    ax.imshow(np.zeros(frame_array.shape))
    ax.set_title(f"{centers['names'][idx]}:{hand_choice[0]}: {act_narration}")

    object_xy = anchored['object']
    ax.plot(object_xy[idx:, 0], object_xy[idx:, 1], 'x--', color=palette['object_of_change'], label='OoC')
    for hand in ('left_hand', 'right_hand'):
        if hand == hand_choice:
            label = hand + "*"
        else:
            label = ""
        hand_xy = anchored[hand]
        ax.plot(hand_xy[idx:, 0], hand_xy[idx:, 1], 'x--', color=palette[hand], label=label)
    ax.legend()
    plt.show()

def centers2kp_tensor(centers):
    """Pack the keypoint centers into an array of shape (7, no_frames, 1, 3).

    The first axis holds the channels x, y, confidence, mask, frame-number,
    person-id and keypoint-id. Keypoints are ordered left_hand, right_hand,
    object_of_change. Confidence and person-id are always 0; mask is 0 when the
    point contains a NaN and 1 otherwise.
    """
    keypoint_names = ('left_hand', 'right_hand', 'object_of_change')
    channel_count = 7
    frame_count = len(centers['left_hand'])
    kp_tensor = np.zeros((channel_count, frame_count, 1, len(keypoint_names)))
    for keypoint_id, keypoint_name in enumerate(keypoint_names):
        track = centers[keypoint_name]
        for frame_index in range(frame_count):
            xy = track[frame_index]
            mask = 0 if np.any(np.isnan(xy)) else 1
            # x, y, confidence, mask, frame-number, person-id, keypoint-id
            kp_tensor[:, frame_index, 0, keypoint_id] = [*xy, 0, mask, frame_index, 0, keypoint_id]
    return kp_tensor
    
def loop_videos_and_actions(text_q, action_q, frame_types):
    """Yield one sample per matching action of every locally available FHO video.

    Videos without a file in the 'video_540ss' folder, videos without matching
    actions, actions without frames of the requested types, and actions with
    fewer than two frames showing both the object of change and the chosen
    hand are skipped.

    Yields:
        (video path, frames with boxes rescaled via scale_ratio(uid, 540),
        chosen hand, narration text) tuples.
    """
    for video in tqdm(fho_annotations["videos"]):
        video_uid = video["video_uid"]
        fho_video_path = Path(CLI_OUTPUT_DIR) / VERSION / 'video_540ss' / (video_uid + '.mp4')
        if not fho_video_path.is_file():
            continue
        matching_actions = list(filter_actions(video, text_q, action_q))
        if not matching_actions:
            continue
        sr = scale_ratio(video_uid, 540)

        for action in matching_actions:
            typed_frames = filter_frames_type(action["frames"], frame_types)
            if not typed_frames:
                continue
            hand_choice = frames2hand(typed_frames)
            usable_frames = filter_frames(typed_frames, required_objects=('object_of_change', hand_choice))
            if len(usable_frames) < 2:
                continue

            # Scale a copy so the loaded annotations keep their original coordinates.
            scaled_frames = deepcopy(usable_frames)
            for fr in scaled_frames:
                fr['boxes'] = scale_bboxes(fr['boxes'], sr)

            yield fho_video_path, scaled_frames, hand_choice, action['narration_text']

plot_dir = Path("/tmp/ego4d_plot/")
plot_dir.mkdir(parents=True, exist_ok=True)
frame_types = ['pre_45', 'pre_30', 'pre_15', 'pre_frame', 'contact_frame', 'pnr_frame', 'post_frame']

#text_q, action_q = "door", "open"
#text_q, action_q = "bottle", "open"
#text_q, action_q = "sponge", "wipe"
#text_q, action_q = "hose", "take_(pick,_grab,_get)"
#text_q, action_q = "", "cut_(trim,_slice,_chop)"
#text_q, action_q = "cup", "take_(pick,_grab,_get)"
text_q, action_q = "", "take_(pick,_grab,_get)"

# Upper bound on the number of samples processed by this cell.
max_images = 10

# Number of samples processed so far. It is incremented once per sample, so the
# limit above takes effect and the final count is meaningful.
i = 0
for fho_video_path, frames2, hand_choice, act_narration in loop_videos_and_actions(text_q, action_q, frame_types):
    # render frames; the generator is consumed by zip, giving frame_type -> image array
    frame_arrays = get_frames_from_video(fho_video_path, frames2)
    frames_dict = {fr['frame_type']: arr for fr, arr in zip(frames2, frame_arrays)}
    #plot_frames(frame_paths, frames)

    # get keypoints
    centers = get_frame_centers(frames2)
    kp_tensor = centers2kp_tensor(centers)
    idx = 0
    #plot_trajectory(idx, centers, frames_dict, hand_choice, act_narration)

    #print(kp_tensor.shape, hand_choice, act_narration)

    i += 1
    if i >= max_images:
        print(f"Stopping after image limit: {max_images}")
        break

print("done. number of samples", i)

# Keypoint-Diffusion Notes

Potential Data Improvements:
1. Last datapoint is from post_frame; it should probably be removed as it's quite random.
2. Left-Hand and Right-Hand keypoints are given, hand_choice specifies which is important.
3. (maybe) filter trajectories by pixel-distance heuristic / number of movements that go closer.
4. Improve relative trajectory estimation through grasp point tracking.
5. Estimate hand poses (especially for grasping frame)

Potential Model Improvements:
1. Default Case: condition on object-keypoint
2. Condition on Image
3. Condition on Text

In [None]:
# Inspect the frames of the last sample produced by the loop above.
# Raises KeyError if that sample has no "pre_15" frame.
print(frames_dict.keys())
plt.imshow(frames_dict["pre_15"])
plt.show()

# Debugging Stuff

In [None]:
def plot_frames(frame_paths, frames, video_uid=None, dest_dir=None):
    """Copy rendered frame images to "<dest_dir>/<video_uid>_<frame number, 8 digits>.jpg".

    Args:
        frame_paths: paths of the rendered images, aligned with `frames`.
        frames: frame dicts; 'frame_number' is used in the file name.
        video_uid: uid used in the file name. When omitted, the uid of the
            notebook-global `video` is used (the previous behaviour), which
            only exists if an earlier cell left such a variable behind.
        dest_dir: target directory; defaults to the notebook-global `plot_dir`.
    """
    target_dir = Path(plot_dir if dest_dir is None else dest_dir)
    for frame_path, frame_data in zip(frame_paths, frames):
        uid = video_uid if video_uid is not None else video["video_uid"]
        dest = target_dir / f'{uid}_{frame_data["frame_number"]:08d}.jpg'
        shutil.copyfile(frame_path, dest)
        #display(NBImage(filename=frame_path))
        #print(f"↑ a:{a} {frame_data['frame_type']} frame num: {frame_data['frame_number']}  -> {dest}\n")

In [None]:
all_nt

In [None]:
from collections import Counter
# For every collected narration that mentions "door", print it next to its
# structured verb and count how often each verb occurs.
# all_nt and all_sa are the parallel lists filled by the filter-actions debug cell.
sa_list = []
for i in range(len(all_nt)):
    if "door" in all_nt[i]:
        print(all_nt[i],"|", all_sa[i])
        sa_list.append(all_sa[i])
# Last expression of the cell: shown as the cell output.
Counter(sa_list)

# Original Notebook

In [None]:
# in: video_path, frames: [{ frame_number, frame_label, ?boxes: [{ object_type, bbox: {x, y, width, height }}] }]
# out: matplotlib.ArtistAnimation of frames rendered with bounding boxes (if provided)
def render_frames_animation(video_path, frames, **kwargs):
    """Build an animation that steps through the given frames of a video.

    All frames are rendered with a single call to `render_frames_with_bboxes`,
    the renderer defined in this notebook.

    Args:
        video_path: path of the video to read the frames from.
        frames: frame dicts with 'frame_number' and 'frame_label'; 'boxes' is
            optional and defaults to no boxes.
        **kwargs: forwarded to `camera.animate` (e.g. interval=...).

    Returns:
        The animation object returned by `camera.animate`.
    """
    frames = list(frames)
    frame_meta = [
        {'frame_number': frame['frame_number'], 'boxes': frame.get('boxes', [])}
        for frame in frames
    ]
    # NOTE(review): relies on the renderer returning paths in the order of frame_meta - confirm
    # against _get_frames2.
    frame_paths = render_frames_with_bboxes(video_path, frame_meta)

    fig, ax = plt.subplots(figsize=(15, 9))
    camera = Camera(fig)
    for frame, frame_path in zip(frames, frame_paths):
        ax.text(0, 1.01, frame['frame_label'], fontsize=20.0, transform=ax.transAxes)
        ax.imshow(mpimg.imread(frame_path, format='jpeg'))
        camera.snap()
    # Close the figure so only the animation is displayed, not a static plot.
    plt.close(fig)
    return camera.animate(**kwargs)

# in: segments: [{<start_key>: int, <end_key>: int}]
# out: void; as a side effect, renders a plot showing all segments passed in
def plot_segments(segments, start_key, end_key):
    """Draw every segment as a horizontal line, one row per segment, sorted by start."""
    by_start = sorted(segments, key=lambda seg: seg[start_key])
    lines = []
    for row, seg in enumerate(by_start):
        lines.append([(seg[start_key], row), (seg[end_key], row)])

    segment_lines = mc.LineCollection(lines, linewidths=2)
    fig, ax = plt.subplots(figsize=(30, 10))
    ax.add_collection(segment_lines)
    ax.autoscale()
    ax.set_xlabel('Frame', fontsize=15)
    ax.set_ylabel('Segment', fontsize=15)
    # Place 30 evenly spaced ticks across the autoscaled x range.
    x_min, x_max = ax.get_xlim()
    tick_step = (x_max-x_min)/30
    ax.xaxis.set_ticks(np.arange(x_min, x_max, tick_step))
    plt.show()
    
# in: tracks: [ [{<start_key>: int, <end_key>: int, <label_key>: str}] ]
# out: void; as a side effect, renders a plot showing segments of each track passed in
def plot_multitrack_segments(tracks, start_key, end_key, label_key):
    """Draw one colored row of segments per track, with a legend entry per track.

    Args:
        tracks: list of tracks; each track is a list of segment dicts.
        start_key / end_key: keys of a segment's start and end position.
        label_key: key of the legend label, read from a track's first segment.

    Returns:
        None. Nothing is plotted when `tracks` is empty; a track without
        segments gets no legend entry.
    """
    if len(tracks) == 0:
        # The line width below divides by the number of tracks.
        return

    # plt.get_cmap is used because matplotlib.cm.get_cmap is no longer available
    # in recent matplotlib releases.
    cmap = plt.get_cmap('tab20')
    color_palette = [cmap(x) for x in range(0, 20)]

    lines, colors, patches = [], [], []
    for i, segments in enumerate(tracks):
        lines += [[(x[start_key], i), (x[end_key], i)] for x in segments]
        color = color_palette[i % len(color_palette)]  # palette repeats after 20 tracks
        colors += [color] * len(segments)
        if segments:
            patches.append(mpatches.Patch(color=color, label=segments[0][label_key]))

    # Thinner lines the more tracks have to share the figure height.
    lc = mc.LineCollection(lines, colors = colors, linewidths=550/len(tracks))
    fig, ax = plt.subplots(figsize=(30, 10))
    ax.legend(handles=patches, loc='upper left')
    ax.add_collection(lc)
    ax.autoscale()
    ax.set_xlabel('Frame', fontsize=15)
    ax.set_ylabel('Track', fontsize=15)
    start, end = ax.get_xlim()
    stepsize = (end-start)/30
    ax.xaxis.set_ticks(np.arange(start, end, stepsize))
    plt.show()

In [None]:
# Collect the uid of every entry of the FHO "videos" list and report the count
# together with the top-level keys of the annotation file.
fho_ann_video_uids = [e["video_uid"] for e in fho_annotations["videos"][:]]
print(f"FHO: {len(fho_ann_video_uids)} videos - top level: {fho_annotations.keys()}")


In [None]:
# Load FHO
with pathmgr.open(os.path.join(CLI_OUTPUT_DIR, VERSION, 'annotations', 'fho_main.json'), "r") as f:
    fho_annotations = json.load(f)
# The cells above read the annotations as a "videos" list of entries carrying a
# "video_uid"; a top-level "video_data" mapping keyed by uid is still accepted.
if "videos" in fho_annotations:
    fho_ann_video_uids = [e["video_uid"] for e in fho_annotations["videos"]]
else:
    fho_ann_video_uids = list(fho_annotations['video_data'].keys())
print(f"FHO: {len(fho_ann_video_uids)} videos - top level: {fho_annotations.keys()}")

In [None]:
# Sample one video
# The choice is random and unseeded, so it differs between runs.
fho_video_uid = random.sample(fho_ann_video_uids, 1)[0]
# Manifest row of the sampled uid; .iloc[0] raises IndexError if the manifest has no such row.
fho_video = videos_df[videos_df.video_uid == fho_video_uid].iloc[0]
print(f"Sampled Video: {fho_video}")

In [None]:
# Summarize FHO Annotations
# Look the video up the same way as the rest of the notebook (the "videos"
# list, via get_annotation); a "video_data" mapping keyed by uid is still accepted.
if "videos" in fho_annotations:
    fho_video_annotations = get_annotation(fho_video_uid)
else:
    fho_video_annotations = fho_annotations['video_data'].get(fho_video_uid)
annotation_intervals = fho_video_annotations['annotated_intervals']
print(f"Video Intervals: {len(annotation_intervals)}, uid: {fho_video_uid}")
for interval in annotation_intervals:
    print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
    # Same validity rule as everywhere else: not invalid, not rejected, stage set.
    actions = [a for a in interval['narrated_actions'] if is_valid_action(a)]
    print(f"Actions: {len(actions)}")

In [None]:
# If this errors out, make sure you used the cli to download the video with this uid
# The video is expected at <CLI_OUTPUT_DIR>/<VERSION>/full_scale/<uid>.mp4
fho_video_path = os.path.join(CLI_OUTPUT_DIR, VERSION, 'full_scale', fho_video_uid + '.mp4')
assert os.path.exists(fho_video_path), f"Video {fho_video_uid} not found. Download it with the cli using: python3 -m ego4d.cli.cli --output_directory=\"<output_dir>\" --datasets full_scale --video_uids={fho_video_uid} --yes"

In [None]:
# Display critical frames for an action as a grid
# Order in which the frame types are shown
frame_order = ['pre_45', 'pre_30', 'pre_15', 'pre_frame', 'contact_frame', 'pnr_frame', 'post_frame']

# Pick a random interval and a random valid action of it.
# random.sample raises ValueError when the chosen interval has no valid action.
interval = random.sample(fho_video_annotations['annotated_intervals'], 1)[0]
actions = list(filter(lambda x: not (x['is_invalid_annotation'] or x['is_rejected']) and x['stage'] is not None, interval['narrated_actions']))
action = random.sample(actions, 1)[0]
# .index raises ValueError for a frame type that is not listed in frame_order.
frames = sorted(action['frames'], key=lambda x: frame_order.index(x['frame_type']))
plot_frames_with_bboxes(fho_video_path, frames)

In [None]:
# Display critical frames as an animation
# Add a "<frame_type>:<frame_number>" label to every frame. A 'frame_label'
# already present in a frame would override it, since **frame is unpacked last.
labeled_frames = list(map(
    lambda frame:
        {
            'frame_label': f"{frame['frame_type']}:{frame['frame_number']}",
            **frame
        }, frames))
render_frames_animation(fho_video_path, labeled_frames, interval=500)

In [None]:
# Select a single frame
# Frame types from most to least preferred; the action's frame with the
# best-ranked type is selected.
sample_frame_priority = ['contact_frame', 'pnr_frame', 'pre_frame', 'post_frame', 'pre_15', 'pre_30', 'pre_45']
sample_frame = sorted(action['frames'], key=lambda x: sample_frame_priority.index(x['frame_type']))[0]

In [None]:
# Display bounding boxes on a single frame
# render_frames_with_bboxes (the renderer defined in this notebook) takes a list
# of frame dicts and returns one image path per frame, hence the one-element
# list and the [0].
sample_frame_path = render_frames_with_bboxes(
    fho_video_path,
    [{'frame_number': sample_frame['frame_number'], 'boxes': sample_frame['boxes']}],
)[0]
plt.rcParams['figure.figsize'] = [20, 20]
plt.title(sample_frame['frame_type'])
plt.axis('off')
plt.imshow(mpimg.imread(sample_frame_path, format='jpeg'))
plt.show()

***

# Visual Object Queries (VQ)
[Data Overview](https://ego4d-data.org/docs/data-overview/#visual-object-queries)

In [None]:
# Load VQ
# Reads annotations/vq_train.json and builds the list of annotated video uids
# plus a uid -> clips lookup.
with pathmgr.open(os.path.join(CLI_OUTPUT_DIR, VERSION, 'annotations', 'vq_train.json'), "r") as f:
    vq_annotations = json.load(f)
    vq_ann_video_uids = [x["video_uid"] for x in vq_annotations["videos"]]
vq_video_dict = {x["video_uid"]: x["clips"] for x in vq_annotations["videos"]}
print(f"VQ: {len(vq_ann_video_uids)} videos - top level: {vq_annotations.keys()}")

In [None]:
# Sample one video
# The choice is random and unseeded, so it differs between runs.
vq_video_uid = random.sample(vq_ann_video_uids, 1)[0]
# Manifest row of the sampled uid; .iloc[0] raises IndexError if the manifest has no such row.
vq_video = videos_df[videos_df.video_uid == vq_video_uid].iloc[0]
print(f"Sampled Video: {vq_video}")

In [None]:
# Summarize VQ Annotations
# vq_video_annotations is the list of clips of the sampled video.
vq_video_annotations = vq_video_dict.get(vq_video_uid)
print(f"VQ Video: {len(vq_video_annotations)}, clips: {vq_video_uid}")
print(f"clip keys: {vq_video_annotations[0].keys()}")
# Keys of query set '1' of the first annotation of the first clip
print(f"query set keys: {vq_video_annotations[0]['annotations'][0]['query_sets']['1'].keys()}\n")

In [None]:
# If this errors out, make sure you used the cli to download the video with this uid
# The video is expected at <CLI_OUTPUT_DIR>/<VERSION>/full_scale/<uid>.mp4
vq_video_path = os.path.join(CLI_OUTPUT_DIR, VERSION, 'full_scale', vq_video_uid + '.mp4')
assert os.path.exists(vq_video_path), f"Video {vq_video_uid} not found. Download it with the cli using: python3 -m ego4d.cli.cli --output_directory=\"<output_dir>\" --datasets full_scale --video_uids={vq_video_uid} --yes"

In [None]:
# Display visual crop frame
# Uses query set '1' of the first annotation of the first clip.
query_set = vq_video_annotations[0]['annotations'][0]['query_sets']['1']
object_frame = query_set['visual_crop']
# Box in the schema expected by the renderer, labelled with the object title
box = {
    'object_type': query_set['object_title'],
    'bbox': {
        'x': object_frame['x'],
        'y': object_frame['y'],
        'width': object_frame['width'],
        'height': object_frame['height'],
    }
}
# render_frames_with_bboxes (the renderer defined in this notebook) takes a list
# of frame dicts and returns one image path per frame, hence the one-element
# list and the [0].
object_frame_path = render_frames_with_bboxes(
    vq_video_path,
    [{'frame_number': object_frame['video_frame_number'], 'boxes': [box]}],
)[0]
plt.rcParams['figure.figsize'] = [20, 20]
plt.imshow(mpimg.imread(object_frame_path, format='jpeg'))
plt.show()

In [None]:
# Display animated response track
response_track_frames = query_set['response_track']
# Convert every response-track entry into the frame schema used by
# render_frames_animation: frame number, label, and a single box labelled with
# the object title of the query set.
reformatted_frames = list(map(
    lambda frame:
        {
            'frame_number': frame['video_frame_number'],
            'frame_label': f"Frame: {frame['video_frame_number']}",
            'boxes': [{
                'object_type': query_set['object_title'],
                'bbox': {
                    'x': frame['x'],
                    'y': frame['y'],
                    'width': frame['width'],
                    'height': frame['height'],
                }
            }]
        }, response_track_frames))
render_frames_animation(vq_video_path, reformatted_frames, interval=200)

# Natural Language Queries (NLQ)

[Data Overview](https://ego4d-data.org/docs/data-overview/#natural-language-queries)

In [None]:
# Load NLQ
# Reads annotations/nlq_train.json and builds the list of annotated video uids
# plus a uid -> clips lookup.
with pathmgr.open(os.path.join(CLI_OUTPUT_DIR, VERSION, 'annotations', 'nlq_train.json'), "r") as f:
    nlq_annotations = json.load(f)
    nlq_ann_video_uids = [x["video_uid"] for x in nlq_annotations["videos"]]
nlq_video_dict = {x["video_uid"]: x["clips"] for x in nlq_annotations["videos"]}
print(f"NLQ: {len(nlq_ann_video_uids)} videos - top level: {nlq_annotations.keys()}")

In [None]:
# Sample one video
# The choice is random and unseeded, so it differs between runs.
nlq_video_uid = random.sample(nlq_ann_video_uids, 1)[0]
# Manifest row of the sampled uid; .iloc[0] raises IndexError if the manifest has no such row.
nlq_video = videos_df[videos_df.video_uid == nlq_video_uid].iloc[0]
print(f"Sampled Video: {nlq_video}")

In [None]:
# Summarize NLQ Annotations
# nlq_video_annotations is the list of clips of the sampled video.
nlq_video_annotations = nlq_video_dict.get(nlq_video_uid)
print(f"NLQ Video: {len(nlq_video_annotations)}, clips: {nlq_video_uid}")
print(f"clip keys: {nlq_video_annotations[0].keys()}")
# Keys of the first language query of the first annotation of the first clip
print(f"language_query keys: {nlq_video_annotations[0]['annotations'][0]['language_queries'][0].keys()}\n")

In [None]:
# If this errors out, make sure you used the cli to download the video with this uid
# The video is expected at <CLI_OUTPUT_DIR>/<VERSION>/full_scale/<uid>.mp4
nlq_video_path = os.path.join(CLI_OUTPUT_DIR, VERSION, 'full_scale', nlq_video_uid + '.mp4')
assert os.path.exists(nlq_video_path), f"Video {nlq_video_uid} not found. Download it with the cli using: python3 -m ego4d.cli.cli --output_directory=\"<output_dir>\" --datasets full_scale --video_uids={nlq_video_uid} --yes"

In [None]:
# Display the clip for a language query
# Pick a random clip, one of its annotations, and one of that annotation's queries.
nlq_clip = random.sample(nlq_video_annotations, 1)[0]
nlq_clip_annotations = random.sample(nlq_clip['annotations'], 1)[0]
sample_nlq_language_query = random.sample(nlq_clip_annotations['language_queries'], 1)[0]

for k, v in sample_nlq_language_query.items():
    print(k.capitalize(),':',v)

# only render 15 frames for performance
# The step is clamped to at least 1: for a query spanning fewer than 15 frames
# the integer division yields 0, which range() rejects.
nlq_stepsize = max(1, (sample_nlq_language_query["video_end_frame"] - sample_nlq_language_query["video_start_frame"]) // 15)
render_frames_animation(
    nlq_video_path,
    [
        {"frame_number": x, "frame_label": f"Frame {x}"}
        for x in range(
            sample_nlq_language_query["video_start_frame"],
            sample_nlq_language_query["video_end_frame"],
            nlq_stepsize
        )
    ],
    interval = 400
)

***

# Audio-Visual Diarization (AV)
[Data Overview](https://ego4d-data.org/docs/benchmarks/AV-diarization/)

In [None]:
# Load AV
# Reads annotations/av_train.json and builds the list of annotated video uids
# plus a uid -> clips lookup.
with pathmgr.open(os.path.join(CLI_OUTPUT_DIR, VERSION, 'annotations', 'av_train.json'), "r") as f:
    av_annotations = json.load(f)
    av_ann_video_uids = [x["video_uid"] for x in av_annotations["videos"]]
av_video_dict = {x["video_uid"]: x["clips"] for x in av_annotations["videos"]}
print(f"AV: {len(av_ann_video_uids)} videos - top level: {av_annotations.keys()}")

In [None]:
# Sample one video
# The choice is random and unseeded, so it differs between runs.
av_video_uid = random.sample(av_ann_video_uids, 1)[0]
# Manifest row of the sampled uid; .iloc[0] raises IndexError if the manifest has no such row.
av_video = videos_df[videos_df.video_uid == av_video_uid].iloc[0]
print(f"Sampled Video: {av_video}")

In [None]:
# Summarize AV Annotations
# av_video_annotations is the list of clips of the sampled video; the
# person statistics below refer to its first clip.
av_video_annotations = av_video_dict.get(av_video_uid)
print(f"AV Video: {len(av_video_annotations)}, clips: {av_video_uid}")
print(f"clip keys: {av_video_annotations[0].keys()}")
print(f"tracked people: {len(av_video_annotations[0]['persons'])}")
print(f"person keys: {av_video_annotations[0]['persons'][0].keys()}")

In [None]:
# Aggregate av tracking bounding boxes in an indexable dictionary
def get_av_frame_dict(av_video_annotation):
    """Group the tracking boxes of all persons of a clip by video frame.

    Returns:
        Dict mapping a video frame number to a frame dict with 'frame_number',
        'frame_label', 'frame_type' and 'boxes'; every box is labelled with the
        track id of the tracking path it came from.
    """
    frames_by_number = {}
    for person in av_video_annotation['persons']:
        for tracking_path in person['tracking_paths']:
            for track in tracking_path['track']:
                frame_number = track['video_frame']
                caption = f"Frame: {frame_number}"
                box = {
                    "object_type": tracking_path['track_id'],
                    "bbox": {
                        "x": track['x'],
                        "y": track['y'],
                        "width": track['width'],
                        "height": track['height']
                    }
                }
                # Create the frame entry on first sight, then append to it.
                entry = frames_by_number.setdefault(frame_number, {
                    "frame_number": frame_number,
                    "frame_label": caption,
                    "frame_type": caption,
                    "boxes": []
                })
                entry['boxes'].append(box)
    return frames_by_number

    
# Get ordered list of frames 
def get_av_frames_with_bboxes(av_video_annotation):
    """Return the aggregated frame dicts of a clip, sorted by ascending frame number."""
    ordered_frames = list(get_av_frame_dict(av_video_annotation).values())
    ordered_frames.sort(key=lambda frame: frame['frame_number'])
    return ordered_frames

In [None]:
# Locate the full_scale video file for the sampled uid.
# If the assertion fails, download the video with the cli command given in its message.
av_video_path = os.path.join(CLI_OUTPUT_DIR, VERSION, 'full_scale', av_video_uid + '.mp4')
assert os.path.exists(av_video_path), (
    f"Video {av_video_uid} not found. Download it with the cli using: "
    'python3 -m ego4d.cli.cli --output_directory="<output_dir>" '
    f"--datasets full_scale --video_uids={av_video_uid} --yes"
)

In [None]:
# Aggregate frames from av person tracking for the first clip of the sampled video:
# - av_tracked_frame_dict: frame records keyed by video frame number
# - av_tracked_frames: equivalent frame records as a list sorted by frame number
#   (built by a second aggregation pass inside get_av_frames_with_bboxes)
av_tracked_frame_dict = get_av_frame_dict(av_video_annotations[0])
av_tracked_frames = get_av_frames_with_bboxes(av_video_annotations[0])

In [None]:
# Plot a random sample of up to 9 tracked frames.
# random.sample raises ValueError when asked for more items than the population
# holds, so the sample size is clamped for clips with fewer than 9 tracked frames.
plot_frames_with_bboxes(
    av_video_path,
    random.sample(av_tracked_frames, min(9, len(av_tracked_frames))),
)

In [None]:
# Show an animation of a subset of frames: the first 25 tracked frames, in ascending frame order
# NOTE(review): `interval` is presumably the delay between animation frames in milliseconds — confirm against render_frames_animation
render_frames_animation(av_video_path, av_tracked_frames[:25], interval=60)

In [None]:
# Plot a single random tracked av frame
sample_frame = random.sample(av_tracked_frames, 1)[0]

# Render the chosen frame together with its boxes; the returned path is read back as a JPEG image below
frame_path = render_frame_with_bboxes(av_video_path, sample_frame['frame_number'], sample_frame['boxes'])
# Note: assigning to plt.rcParams changes the default figure size for every later figure as well
plt.rcParams['figure.figsize'] = [20, 20]
plt.imshow(mpimg.imread(frame_path, format='jpeg'))
plt.title(f"Frame: {sample_frame['frame_number']}")
plt.show()

In [None]:
# Visualize all social segments in the video
# (the 'social_segments' of the first clip; the two strings name the start/end frame keys of each segment)
plot_segments(av_video_annotations[0]['social_segments'], 'video_start_frame', 'video_end_frame')

In [None]:
# Visualize all peoples' voice segments
# One track per person of the first clip, skipping persons without any voice segment
person_voice_track_segments = [
    x['voice_segments']
    for x in av_video_annotations[0]['persons']
    if len(x['voice_segments']) != 0
]

# Attach a display label to every segment (this writes into the loaded annotation dicts)
for track in person_voice_track_segments:
    for segment in track:
        segment.update(label=f"Person {segment['person']}")

# Order the tracks by the start frame of their first segment, then plot them
ordered_tracks = sorted(
    person_voice_track_segments,
    key=lambda segments: segments[0]['video_start_frame'],
)
plot_multitrack_segments(ordered_tracks, 'video_start_frame', 'video_end_frame', 'label')

# Searching for Actions

In [None]:
# Show an example of a narrated action
#fho_annotations["videos"][11]["annotated_intervals"][0]["narrated_actions"][0]

In [None]:
def get_objects(action):
    """Return the distinct structured nouns of all boxes in an action's frames.

    Args:
        action: narrated-action dict with a 'frames' entry that is either None
            or a list of frames, each holding a 'boxes' list whose items carry
            a '_structured_noun' value.

    Returns:
        A sorted list of the distinct non-None '_structured_noun' values, or an
        empty list when 'frames' is None. The result is sorted because set
        iteration order for strings can differ between interpreter runs, which
        would make any "take the first noun" logic downstream irreproducible.
        Assumes the nouns are mutually comparable (e.g. all strings).
    """
    frames = action['frames']
    if frames is None:
        return []
    nouns = {box['_structured_noun'] for frame in frames for box in frame['boxes']}
    # Boxes without a structured noun contribute None; drop it if present
    nouns.discard(None)
    return sorted(nouns)


# Flatten every valid narrated action into three index-aligned lists
all_sa = []  # structured verb of each action
all_nt = []  # narration text of each action
all_sn = []  # exactly one structured noun per action ("" when the action has none)
for video in fho_annotations["videos"]:
    for interval in video["annotated_intervals"]:
        for action in interval["narrated_actions"]:
            if is_valid_action(action):
                sv = action["structured_verb"]
                # A filter on a single verb (e.g. sv == 'take_(pick,_grab,_get)') can be added here
                all_sa.append(sv)
                all_nt.append(action["narration_text"])
                # Keep at most the first noun; fall back to "" so the lists stay aligned
                sn = get_objects(action)[:1] or [""]
                all_sn.extend(sn)

In [None]:
# Frequency of each structured verb across all collected actions.
# Uses the module-qualified name: `collections` is imported in the setup cell,
# whereas the bare `Counter` name is only imported in a later cell, so the
# unqualified call fails on a fresh kernel run from top to bottom.
collections.Counter(all_sa)

In [None]:
from collections import Counter

# Boolean mask over the actions: verb is "take" ...
mask_sa = np.array(all_sa) == 'take_(pick,_grab,_get)'
# ... and the noun kept for the action is the "container" category
mask_sn = np.array(all_sn) == (
    'container_(box,_can,_carton,_case,_casing,_container,_crate,'
    '_holder,_jar,_jerrycan,_keg,_pack,_package,_packaging,_packet,'
    '_storage,_tank,_tin)'
)
mask = np.logical_and(mask_sa, mask_sn)

# Narration texts of the selected actions, counted by exact text
sel_nt = np.array(all_nt)[mask]

print(Counter(sel_nt))

In [None]:
from collections import Counter

# Object names to look for in the narration texts
object_names = ('bowl', 'cup', 'glass', 'bottle','box','knife', 'hammer','pen','carrot')
#object_names = ('mug', 'bowl', 'bottle', 'box', 'can', 'headphones')

# For each name, print how many narrations contain it.
# This is a containment test (`obj_name in x`), so for string narrations a name
# such as 'pen' is also counted inside longer words like 'open'.
for obj_name in object_names:
    print(obj_name.ljust(10), sum(obj_name in x for x in all_nt))

In [None]:
import json

# Write all collected narration texts to a JSON file in the current working directory.
# NOTE(review): the file name mentions "pick", but `all_nt` as written here is dumped unfiltered — confirm the intended contents.
with open("narration_text_pick.json", 'w') as f:
    f.write(json.dumps(all_nt))

In [None]:
# Print the field names of the first metadata record only
for k,v in metadata.items():
    print(v.keys())
    break

# One boolean per metadata record: True when the record's 'imu_metadata' is not None.
# NOTE(review): the list is named `stereo`, but it is built from 'imu_metadata' — confirm the intended field/name.
stereo = []
for k,v in metadata.items():
    stereo.append(v['imu_metadata'] is not None)

# Mean of the booleans = fraction of records with non-null 'imu_metadata'
np.mean(stereo)

# Notes: 
mono-depth-estimation: https://github.com/EPFL-VILAB/omnidata/tree/main/omnidata_tools/torch#readme

STA training code: https://ego4d-data.org/docs/challenge/

EgoTracks: object tracking