In [6]:
import os
import json
from torch.utils.data import Dataset

class Ego4DDataset(Dataset):
    """Dataset of Ego4D videos, each paired with its narration timestamps and texts."""

    def __init__(
        self,
        root='/datasets01/ego4d_track2/v2_1',
        videos_root='/datasets01/ego4d_track2/v2_1/full_scale',
        narration_pass=1
    ):
        """
        Read the video metadata and narration annotations found under ``root``.

        Args:
            root (str): Dataset root directory; must contain ``ego4d.json`` and
                ``annotations/narration.json``.
            videos_root (str): Directory holding the ``<video_uid>.mp4`` files.
            narration_pass (int): Narration pass to read, 1 or 2.

        Raises:
            ValueError: If ``narration_pass`` is neither 1 nor 2.
        """
        self.root = root
        self.videos_root = videos_root

        if narration_pass not in (1, 2):
            raise ValueError("narration_pass must be either 1 or 2")
        self.narration_pass_key = f"narration_pass_{narration_pass}"

        # Video entries are the 'videos' list of ego4d.json, used unfiltered.
        video_metadata = self._read_json(os.path.join(self.root, 'ego4d.json'))
        self.samples = video_metadata['videos']

        # The narration file is kept whole and queried by video_uid on access.
        self.narrations = self._read_json(
            os.path.join(self.root, 'annotations', 'narration.json')
        )

    @staticmethod
    def _read_json(path):
        """Parse the JSON file at ``path`` and return the decoded object."""
        with open(path, 'r') as handle:
            return json.load(handle)

    def __len__(self):
        """Return the number of video entries."""
        return len(self.samples)

    def __getitem__(self, idx):
        """
        Build the sample for the video entry at position ``idx``.

        Returns:
            dict: ``video_path`` (str), ``timestamps`` (list) and
            ``descriptions`` (list). Both lists are empty when the video has
            no narrations for the selected pass.
        """
        video_uid = self.samples[idx]['video_uid']

        # Each missing level (video, pass, narration list) falls back to an empty container.
        narration_entries = (
            self.narrations
            .get(video_uid, {})
            .get(self.narration_pass_key, {})
            .get('narrations', [])
        )

        # Two independent passes over the entries: all timestamps first, then all texts.
        timestamps = [entry['timestamp_sec'] for entry in narration_entries]
        descriptions = [entry['narration_text'] for entry in narration_entries]

        return dict(
            video_path=os.path.join(self.videos_root, video_uid + '.mp4'),
            timestamps=timestamps,
            descriptions=descriptions,
        )

In [11]:
import cv2
import numpy as np
from PIL import Image

def load_frames(video_path, timestamps):
    """
    Load frames from a video at given timestamps.

    Args:
        video_path (str): Path to the video file.
        timestamps (iterable of float): Timestamps (in seconds) to load frames from.

    Returns:
        frames (list): One entry per timestamp, in the same order. Each entry
            is a PIL image built from the RGB-converted frame, or None when
            the frame could not be read (a warning is printed in that case).
    """
    # Materialize once so any iterable is accepted and emptiness can be tested.
    timestamps = list(timestamps)
    if not timestamps:
        # Nothing requested: do not open the video at all.
        return []

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        for t in timestamps:
            # Seek by position in milliseconds, then decode the frame at that position.
            cap.set(cv2.CAP_PROP_POS_MSEC, t * 1000)
            ret, frame = cap.read()
            if not ret or frame is None:
                # Keep a None placeholder so frames stays aligned with timestamps.
                print(f"Warning: Could not read frame at time {t} from video {video_path}")
                frames.append(None)
            else:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame))
    finally:
        # Release the capture even if seeking or decoding raises.
        cap.release()
    return frames

In [12]:
# Build the dataset with the default root / videos_root paths and narration pass 1;
# the constructor reads ego4d.json and annotations/narration.json from root.
dataset = Ego4DDataset()

In [16]:
from matplotlib import pyplot as plt
import random

# Pick a random sample. random.randrange excludes its upper bound, whereas
# random.randint(0, n) includes n and could index one past the last sample.
sample = dataset[random.randrange(len(dataset))]

print(sample['video_path'])
print(len(sample['timestamps']), sample['timestamps'])
print(len(sample['descriptions']), (sample['descriptions']))


frames = load_frames(sample['video_path'], sample['timestamps'])


# load_frames yields None for frames it could not read; skip those, since
# imshow cannot draw None. zip keeps each frame paired with its narration.
for frame, description in zip(frames, sample['descriptions']):
    if frame is None:
        continue
    plt.imshow(frame)
    plt.title(f"{description}")
    plt.show()