In [11]:
# !pip install torchcodec

In [24]:
# Mount Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
import os
import json
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from torchcodec.decoders import VideoDecoder

class WLASLTorchCodec(Dataset):
  def __init__(self, json_path, video_dir, split="train", num_frames=32, transform=None):
    self.video_dir = video_dir
    self.num_frames = num_frames
    self.transform = transform

    # Read json
    with open(json_path, "r") as f:
      data = json.load(f)

    self.samples = []
    self.label_map = {}
    label_id = 0

    for entry in data:
      gloss = entry["gloss"]

      if gloss not in self.label_map:
        self.label_map[gloss] = label_id
        label_id += 1

      label = self.label_map[gloss]

      for inst in entry["instances"]:
        if inst["split"] != split:
          continue

        video_id = inst["video_id"]
        file_path = os.path.join(video_dir, f"{video_id}.mp4")

        if os.path.isfile(file_path):
          self.samples.append((file_path, label))

  def __len__(self):
    return len(self.samples)

  def _sample_frames(self, frames):
    """frames is a list of PIL images or Tensors."""
    T = len(frames)
    idx = torch.linspace(0, T - 1, self.num_frames).long()
    return [frames[i] for i in idx]

  def __getitem__(self, idx):
    video_path, label = self.samples[idx]

    decoder = VideoDecoder(video_path)

    frames = []
    for chunk in decoder:
      # chunk is a Tensor of frames: (T, H, W) OR (T, H, W, C)
      for frame_tensor in chunk:
        # Handle grayscale frames (H, W)
        if frame_tensor.dim() == 2:
          frame_tensor = frame_tensor.unsqueeze(2)  # â†’ (H, W, 1)

        # Handle RGB frames (H, W, C)
        if frame_tensor.dim() == 3:
          pass
        else:
          raise ValueError(f"Unexpected frame shape: {frame_tensor.shape}")

        # Convert to C x H x W
        frame_chw = frame_tensor.permute(2, 0, 1)

        frame_pil = transforms.ToPILImage()(frame_chw)
        frames.append(frame_pil)

    # Guard for short videos
    if len(frames) < self.num_frames:
        while len(frames) < self.num_frames:
            frames.extend(frames)
        frames = frames[: self.num_frames]

    # Uniform sampling
    frames = self._sample_frames(frames)

    if self.transform:
      frames = torch.stack([self.transform(f) for f in frames])
    else:
      frames = torch.stack([transforms.ToTensor()(f) for f in frames])

    return frames, label


In [26]:
# Path to json file in drive
json_path = "/content/drive/My Drive/Colab Notebooks/WLASL_v0.3.json"

with open(json_path, "r") as f:
  data = json.load(f)

if "root" in data:
  data = data["root"]

# Create dataset
dataset = WLASLTorchCodec(
  json_path=json_path,
  video_dir="/content/drive/My Drive/Colab Notebooks/videos",
  split="train",
  num_frames=8
)

# Test a sample
frames, label = dataset[0]
print("Frames shape:", frames.shape)
print("Label:", label)


Frames shape: torch.Size([8, 1, 256, 256])
Label: 0
