
# process dataset and prepare for training

### pip install datasets opencv-python pillow tqdm 
### pip install huggingface_hub
### pip3 install torch torchvision(cpu)

In [1]:
import os
import json
import cv2
import glob
import shutil
import tarfile
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from huggingface_hub import snapshot_download


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#  + labels.json 
DATA_DIR = "D:/A-computer files/Deep Learning/Final/data/something_v2"   # dataset directory

# processed data directory
SAVE_ROOT = "D:/A-computer files/Deep Learning/Final/processed_something_v2"
os.makedirs(SAVE_ROOT, exist_ok=True)

print("data directory:", DATA_DIR)
print("output directory:", SAVE_ROOT)

data directory: D:/A-computer files/Deep Learning/Final/data/something_v2
output directory: D:/A-computer files/Deep Learning/Final/processed_something_v2


### Download the dataset from Hugging Face Hub https://huggingface.co/datasets/HuggingFaceM4/something_something_v2

In [None]:

# Download the dataset from Hugging Face Hub
snapshot_download(
    repo_id="HuggingFaceM4/something_something_v2",
    repo_type="dataset",
    cache_dir=DATA_DIR
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 5 files: 100%|██████████| 5/5 [00:01<00:00,  3.35it/s]


'D:/A-computer files/Deep Learning/Final/data/something_v2\\datasets--HuggingFaceM4--something_something_v2\\snapshots\\130db220f301e31219875231983a9827c8370aa1'

### Download datasets by hand from https://www.qualcomm.com/developer/software/something-something-v-2-dataset/downloads

In [18]:
# Concatenate split tar.gz parts in a cross-platform way using Python.
# It reads all parts matching the pattern, sorts them, and writes them into a single output file.

parts = sorted(glob.glob("20bn-something-something-v2-*.tar.gz"))
if not parts:
	raise FileNotFoundError("No files found matching pattern: 20bn-something-something-v2-*.tar.gz")

output_path = "videos.tar.gz"
with open(output_path, "wb") as outfile:
	for part in parts:
		with open(part, "rb") as infile:
			shutil.copyfileobj(infile, outfile)

print(f"Combined {len(parts)} parts into {output_path}")

Combined 2 parts into videos.tar.gz


In [None]:

# Ensure target directory exists (os is already imported in another cell)
os.makedirs(DATA_DIR, exist_ok=True)

# Use the already-created output_path (20bn-something-something-v2.tar.gz)
with tarfile.open(output_path, "r:gz") as tar:
	tar.extractall(path=DATA_DIR)

print(f"Extracted {output_path} to {DATA_DIR}")

Remember to change the directory name from 20bn-something-something-v2 to videos

In [3]:
# Load the dataset from the local extracted directory using Hugging Face Datasets
dataset = load_dataset(DATA_DIR)
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['id', 'label', 'template', 'placeholders'],
        num_rows: 168913
    })
    validation: Dataset({
        features: ['id', 'label', 'template', 'placeholders'],
        num_rows: 24777
    })
    test: Dataset({
        features: ['id', 'label', 'template', 'placeholders'],
        num_rows: 27157
    })
})
{'id': '78687', 'label': 'holding potato next to vicks vaporub bottle', 'template': 'Holding [something] next to [something]', 'placeholders': ['potato', 'vicks vaporub bottle']}


In [4]:
TASK_KEYWORDS = {
    "move_object": [
        "moving", 
        "pushing", 
    ],
    "drop_object": [
        "dropping", 
        "letting something roll down"
    ],
    "cover_object": [
        "covering", 
        "putting",
    ],
    # extra tasks including opening, throwing and catching, pulling
    "open_object": [
        "opening",
        "unfolding"
    ],
    "throw_and_catch_object": [
        "throwing", 
        "catching"
    ],
    "pull_object": [
        "pulling", 
        "dragging"
    ],
}

def match_task(label):
    label = label.lower()
    for task, keys in TASK_KEYWORDS.items():
        for k in keys:
            if k in label:
                return task
    return None


In [5]:
# Extract 21 frames uniformly from the video 
# 20 for training, 1 target output
def extract_frames(video_path, n_frames=21):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total < n_frames:
        cap.release()
        return None
    
    idxs = np.linspace(0, total - 1, n_frames, dtype=int)
    frames = []

    for idx in idxs:
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if not ret:
            cap.release()
            return None
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frames.append(frame)

    cap.release()
    return frames


In [6]:
sample_count = {k: 0 for k in TASK_KEYWORDS.keys()}

def resolve_video_path(item):
    # Try common keys that might contain a path
    if isinstance(item, dict):
        for key in ("videos", "video", "video_path", "file_path", "path"):
            p = item.get(key)
            if p:
                # some datasets store a list/tuple
                if isinstance(p, (list, tuple)) and p:
                    return p[0]
                return p

    # Fall back to constructing path from id and common directories/extensions
    vid_id = item.get("id") if isinstance(item, dict) else None
    if vid_id is None:
        return None

    search_dirs = [
        os.path.join(DATA_DIR, "videos"),
        DATA_DIR,
    ]
    exts = [".mp4", ".webm", ".avi", ".mov", ".mkv", ".mpeg"]

    for d in search_dirs:
        if not d:
            continue
        for ext in exts:
            candidate = os.path.join(d, vid_id + ext)
            if os.path.exists(candidate):
                return candidate

    # As a last resort, try a recursive glob search for files containing the id
    pattern = os.path.join(DATA_DIR, "**", vid_id + "*")
    matches = glob.glob(pattern, recursive=True)
    if matches:
        return matches[0]

    return None

In [None]:
#training set processing

for item in tqdm(dataset["train"], desc="Processing"):
    label = item.get("label", "")
    task = match_task(label)
    if task is None:
        continue

    # Resolve video path robustly to avoid KeyError
    video_path = resolve_video_path(item)
    if video_path is None or not os.path.exists(video_path):
        # Could not find a corresponding video file for this sample
        continue

    frames = extract_frames(video_path, n_frames=21)
    if frames is None:
        continue

    # Resize to 128x128 (>= 96x96 is fine)  
    frames = [cv2.resize(f, (128, 128)) for f in frames]

    # prepare output path
    vid_id = str(sample_count[task]).zfill(5)
    vid_dir = os.path.join(SAVE_ROOT, "train", task, f"video_{vid_id}")
    os.makedirs(os.path.join(vid_dir, "frames"), exist_ok=True)

    # save frames
    for i, frame in enumerate(frames):
        save_p = os.path.join(vid_dir, "frames", f"{i:04d}.png")
        cv2.imwrite(save_p, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    # save metadata
    with open(os.path.join(vid_dir, "meta.json"), "w") as f:
        json.dump({"label": label}, f, indent=4)

    sample_count[task] += 1

print("Done! Data statistics:", sample_count)


Processing: 100%|██████████| 168913/168913 [6:23:47<00:00,  7.34it/s]  

Done! Data statistics: {'move_object': 29262, 'drop_object': 4244, 'cover_object': 22918, 'open_object': 3226, 'throw_and_catch_object': 5539, 'pull_object': 5565, 'tilting_object': 1402, 'stacking_object': 952, 'pouring_object': 1488, 'rolling_object': 2817}





In [11]:
#val set processing

for item in tqdm(dataset["validation"], desc="Processing"):
    label = item.get("label", "")
    task = match_task(label)
    if task is None:
        continue

    # Resolve video path robustly to avoid KeyError
    video_path = resolve_video_path(item)
    if video_path is None or not os.path.exists(video_path):
        # Could not find a corresponding video file for this sample
        continue

    frames = extract_frames(video_path, n_frames=21)
    if frames is None:
        continue

    # Resize to 128x128 (>= 96x96 is fine)  
    frames = [cv2.resize(f, (128, 128)) for f in frames]

    # prepare output path
    vid_id = str(sample_count[task]).zfill(5)
    vid_dir = os.path.join(SAVE_ROOT, "validation", task, f"video_{vid_id}")
    os.makedirs(os.path.join(vid_dir, "frames"), exist_ok=True)

    # save frames
    for i, frame in enumerate(frames):
        save_p = os.path.join(vid_dir, "frames", f"{i:04d}.png")
        cv2.imwrite(save_p, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    # save metadata
    with open(os.path.join(vid_dir, "meta.json"), "w") as f:
        json.dump({"label": label}, f, indent=4)

    sample_count[task] += 1

print("Validation Done! Data statistics:", sample_count)

Processing: 100%|██████████| 24777/24777 [42:15<00:00,  9.77it/s]  

Validation Done! Data statistics: {'move_object': 3451, 'drop_object': 736, 'cover_object': 2864, 'open_object': 667, 'throw_and_catch_object': 542, 'pull_object': 792}





In [7]:
# test set processing
test_count = 0 

for item in tqdm(dataset["test"], desc="Processing Test"):
    
    # resolve video path
    video_path = resolve_video_path(item)
    if video_path is None or not os.path.exists(video_path):
        continue

    # extract 21 frames
    frames = extract_frames(video_path, n_frames=21)
    if frames is None or len(frames) < 21:
        continue

    # resize frames
    frames = [cv2.resize(f, (128, 128)) for f in frames]

    # split frames
    frames_input = frames[:20]      # first 20 frames as input
    frame_gt = frames[20]           # 21st frame as ground truth

    # output directory
    vid_id = str(test_count).zfill(5)
    vid_dir = os.path.join(SAVE_ROOT, "test", f"video_{vid_id}")
    os.makedirs(os.path.join(vid_dir, "frames"), exist_ok=True)

    # save 20 input frames
    for i, frame in enumerate(frames_input):
        save_p = os.path.join(vid_dir, "frames", f"{i:04d}.png")
        cv2.imwrite(save_p, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    # save 21st frame as ground truth
    gt_path = os.path.join(vid_dir, "frames", "gt.png")
    cv2.imwrite(gt_path, cv2.cvtColor(frame_gt, cv2.COLOR_RGB2BGR))

    test_count += 1

print("Test Done! Total samples:", test_count)



Processing Test: 100%|██████████| 27157/27157 [2:02:57<00:00,  3.68it/s]  

Test Done! Total samples: 24569





dataloader by train/val and test

In [10]:
import os
import json
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image
import glob


class HOIDatasetTrainVal(Dataset):
    def __init__(self, root):
        self.items = []
        
        tasks = os.listdir(root)
        for t in tasks:
            tdir = os.path.join(root, t)
            if not os.path.isdir(tdir):
                continue

            videos = os.listdir(tdir)
            for vid in videos:
                vid_dir = os.path.join(tdir, vid)
                meta_path = os.path.join(vid_dir, "meta.json")
                frames_dir = os.path.join(vid_dir, "frames")
                
                frames = sorted(glob.glob(os.path.join(frames_dir, "*.png")))
                # need exactly 21 frames
                if len(frames) != 21:
                    continue
                
                self.items.append((frames, meta_path))

        self.tf = transforms.ToTensor()

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        frames, meta_path = self.items[idx]

        # read label
        with open(meta_path, "r") as f:
            meta = json.load(f)
        text = meta["label"]

        # 20 input frames
        input_frames = [self.tf(Image.open(frames[i])) for i in range(20)]
        input_frames = torch.stack(input_frames)

        # 21st as target
        target = self.tf(Image.open(frames[20]))

        return {
            "input_frames": input_frames,  # [20, 3, 128, 128]
            "text": text,
            "target_frame": target        # [3, 128, 128]
        }


In [11]:
class HOIDatasetTest(Dataset):
    def __init__(self, root):
        self.items = []

        videos = os.listdir(root)
        for vid in videos:
            vid_dir = os.path.join(root, vid)
            frames_dir = os.path.join(vid_dir, "frames")

            input_frames = sorted(
                [f for f in glob.glob(os.path.join(frames_dir, "*.png")) 
                 if "gt" not in f]
            )

            gt_path = os.path.join(frames_dir, "gt.png")

            # need 20 input + 1 gt
            if len(input_frames) != 20:
                continue
            if not os.path.exists(gt_path):
                continue

            self.items.append((input_frames, gt_path))

        self.tf = transforms.ToTensor()

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        frames_20, gt = self.items[idx]

        input_frames = [self.tf(Image.open(f)) for f in frames_20]
        input_frames = torch.stack(input_frames)

        target = self.tf(Image.open(gt))

        return {
            "input_frames": input_frames,  # [20, 3, 128, 128]
            "target_frame": target         # gt.png
        }


In [None]:
from torch.utils.data import DataLoader

train_ds = HOIDatasetTrainVal(os.path.join(SAVE_ROOT, "train"))
val_ds   = HOIDatasetTrainVal(os.path.join(SAVE_ROOT, "validation"))
test_ds  = HOIDatasetTest(os.path.join(SAVE_ROOT, "test"))

train_dl = DataLoader(train_ds, batch_size=4, shuffle=True)
val_dl   = DataLoader(val_ds, batch_size=4, shuffle=False)
test_dl  = DataLoader(test_ds, batch_size=4, shuffle=False)

batch = next(iter(train_dl))
print(batch["input_frames"].shape)   # [4, 20, 3, 128, 128]
print(batch["target_frame"].shape)   # [4, 3, 128, 128]
print(batch["text"])   