# Handwash Dataset Validation Notebook

This notebook downloads and validates all four datasets (kaggle, pskus, metc, synthetic_blender_rozakar).
It provides exploratory analysis, shows sample videos and frames for each class, and reports mapped classes after preprocessing.


In [None]:
# Install dependencies
# IPython shell escape: runs pip quietly (-q) and without pip's download cache
# (--no-cache-dir). The package list covers the third-party imports of the next
# cell plus the gdown / zenodo_get command-line tools invoked by the downloaders.
!pip install -q --no-cache-dir scikit-learn pandas numpy opencv-python-headless matplotlib seaborn tqdm requests gdown zenodo-get ipython


In [None]:

import os, sys, json, math, random, shutil, subprocess, re
from pathlib import Path
from typing import List, Dict, Optional

import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from IPython.display import Video, display
from sklearn.model_selection import train_test_split

# Seed NumPy's and the standard library's global RNGs; the np.random.rand()
# draws used during preprocessing depend on this seed.
np.random.seed(42)
random.seed(42)

# Root folder for everything the notebook writes; override it with the
# HANDWASH_DATA environment variable.
DATA_ROOT = Path(os.environ.get("HANDWASH_DATA", "/kaggle/working/handwash_data"))
RAW_DIR = DATA_ROOT / "raw"  # one sub-folder per downloaded dataset
PROCESSED_DIR = DATA_ROOT / "processed"  # extracted frames and split CSVs
# Both folders are created as a side effect of running this cell.
RAW_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Dataset keys; each one is also the sub-folder name under RAW_DIR and
# PROCESSED_DIR and the value dispatched on by ensure_dataset/preprocess_dataset.
DATASETS = ["kaggle", "pskus", "metc", "synthetic_blender_rozakar"]

# Size handed to cv2.resize for every saved frame.
IMG_SIZE = (224, 224)
# Number of target classes; class ids are valid when 0 <= id < NUM_CLASSES.
NUM_CLASSES = 7
# Display name per class id (list index == class id).
CLASS_NAMES = [
    "Other",
    "Step1_PalmToPalm",
    "Step2_PalmOverDorsum",
    "Step3_InterlacedFingers",
    "Step4_BackOfFingers",
    "Step5_ThumbRub",
    "Step6_Fingertips",
]

# Raw PSKUS annotation code -> class id. Codes 1-6 map to themselves and code 7
# is folded into class 0; unknown codes also become 0 where the mapping is
# applied (code_mapping.get(code, 0)).
PSKUS_CODE_MAPPING = {
    0: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
    6: 6,
    7: 0,
}

# Raw METC annotation code -> class id (identity for codes 0-6).
METC_CODE_MAPPING = {
    0: 0,
    1: 1,
    2: 2,
    3: 3,
    4: 4,
    5: 5,
    6: 6,
}

# Substrings searched for in the lower-cased file path by infer_label_from_path
# when no numeric folder name yields a class id. Tokens are tried in this order
# and the first one contained in the path wins.
LABEL_TOKENS = {
    "step1": 1,
    "step2": 2,
    "step3": 3,
    "step4": 4,
    "step5": 5,
    "step6": 6,
    "other": 0,
}

# Gesture number parsed from a synthetic "gesture*" folder name -> class id.
# Gestures 2/3 share class 2 and gestures 6/7 share class 5.
# NOTE(review): presumably these pairs are left/right-hand variants of one step — confirm.
SYNTHETIC_GESTURE_TO_CLASS = {
    1: 1,
    2: 2,
    3: 2,
    4: 3,
    5: 4,
    6: 5,
    7: 5,
    8: 6,
}

# Lower-case file suffixes recognised as videos / images when scanning folders.
VIDEO_EXTS = (".mp4", ".avi", ".mov", ".mkv")
IMAGE_EXTS = (".jpg", ".jpeg", ".png")

# Generic video extraction keeps frames whose index is a multiple of FRAME_SKIP.
FRAME_SKIP = 2
# NOTE: in the code visible here this limit is only applied by the generic
# (non-pskus/metc/synthetic) branch of preprocess_dataset.
MAX_VIDEOS_PER_DATASET = None  # set to int to limit processing
MAX_FRAMES_PER_VIDEO = None  # set to int to limit per-video frame extraction

# Download sources: a direct tar URL, two Zenodo record ids (passed to
# zenodo_get -r or used to build per-file URLs) and Google Drive links
# fetched with gdown.
KAGGLE_URL = "https://github.com/atiselsts/data/raw/master/kaggle-dataset-6classes.tar"
PSKUS_ZENODO = "4537209"
METC_ZENODO = "5808789"
SYNTHETIC_LINKS = [
    "https://drive.google.com/uc?id=1EW3JQvElcuXzawxEMRkA8YXwK_Ipiv-p&export=download",
    "https://drive.google.com/uc?id=163TsrDe4q5KTQGCv90JRYFkCs7AGxFip&export=download",
    "https://drive.google.com/uc?id=1GxyTYfSodumH78NbjWdmbjm8JP8AOkAY&export=download",
    "https://drive.google.com/uc?id=1IoRsgBBr8qoC3HO-vEr6E7K4UZ6ku6-1&export=download",
    "https://drive.google.com/uc?id=1svCYnwDazy5FN1DYSgqbGscvDKL_YnID&export=download",
]
# Partial download controls (None = download all)
PSKUS_DATASET_IDS = None  # e.g. [1, 2] to fetch subset of DataSet*.zip
METC_INTERFACE_IDS = None  # e.g. [1] to fetch subset of Interface_number_*.zip
SYNTHETIC_MAX_ZIPS = None  # e.g. 1 to limit number of synthetic zips
# The two CSV switches are only consulted on the partial-download paths of
# download_pskus / download_metc.
DOWNLOAD_PSKUS_SPLIT_CSV = True
DOWNLOAD_METC_CSV = True

# Folder for validation logs; created as a side effect of running this cell.
LOG_DIR = DATA_ROOT / "logs" / "validation"
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Sampling sizes for the visual checks.
# NOTE(review): these constants and the flags below are not referenced in the
# portion of the notebook visible here; presumably later cells consume them — confirm.
RAW_SAMPLE_VIDEOS = 2
SAMPLES_PER_CLASS = 3
SAMPLE_VIDEO_FRAMES = 24
SAMPLE_VIDEO_FPS = 6

# Run-behaviour switches.
SAVE_LOGS = True
SAVE_SAMPLE_MEDIA = True
CALCULATE_DISK_USAGE = True
CLEANUP_BETWEEN_DATASETS = True
CLEANUP_ON_FAILURE = False
CLEANUP_ARCHIVES = True

# Starts empty; NOTE(review): presumably filled with per-dataset mapping records later — confirm.
ALL_MAPPINGS = []


# Echo the resolved locations so the cell output records where data will go.
print("RAW_DIR:", RAW_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)


In [None]:

def download_with_progress(url: str, dest: Path):
    """Stream ``url`` into ``dest`` while showing a tqdm progress bar.

    An existing ``dest`` is treated as a finished download and left untouched.
    The body is written to a sibling ``<name>.part`` file and renamed to
    ``dest`` only after the transfer completed, so an interrupted download can
    no longer leave a truncated file that later runs would skip.

    Args:
        url: HTTP(S) address to fetch.
        dest: Final location of the file; missing parent folders are created.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    dest.parent.mkdir(parents=True, exist_ok=True)
    if dest.exists():
        print("skip", dest)
        return
    import requests

    partial = dest.with_name(dest.name + ".part")
    try:
        # (connect, read) timeouts so a stalled server raises instead of hanging forever.
        with requests.get(url, stream=True, timeout=(30, 300)) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0))
            with open(partial, "wb") as f, tqdm(total=total, unit="B", unit_scale=True, desc=dest.name) as pbar:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
                        pbar.update(len(chunk))
        # Publish the file only once it is complete.
        partial.replace(dest)
    finally:
        # No-op after a successful rename; removes the fragment after a failure.
        partial.unlink(missing_ok=True)


def extract_tar(tar_path: Path, out_dir: Path):
    """Unpack ``tar_path`` into ``out_dir`` and delete the archive afterwards.

    The archives come from the network, so extraction uses tarfile's ``"data"``
    filter when the running Python provides it; that filter refuses members
    that would land outside ``out_dir``. Older interpreters fall back to the
    unfiltered call.

    Args:
        tar_path: Archive to unpack (any compression ``tarfile.open`` detects).
        out_dir: Destination folder; created if missing.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    import tarfile
    with tarfile.open(tar_path) as tfp:
        if hasattr(tarfile, "data_filter"):
            tfp.extractall(out_dir, filter="data")
        else:
            tfp.extractall(out_dir)
    # Only reached when extraction succeeded, so a failed run keeps the archive.
    tar_path.unlink(missing_ok=True)


def extract_zip(zip_path: Path, out_dir: Path):
    """Unpack ``zip_path`` into ``out_dir`` (created if needed), then delete the zip."""
    out_dir.mkdir(parents=True, exist_ok=True)
    from zipfile import ZipFile

    with ZipFile(zip_path) as archive:
        archive.extractall(path=out_dir)
    # Reached only after a successful extraction.
    zip_path.unlink(missing_ok=True)


def _extract_archives_if_needed(raw_dir: Path):
    zip_files = sorted(raw_dir.glob("*.zip"))
    tar_files = sorted(raw_dir.glob("*.tar*"))
    if zip_files or tar_files:
        print("Extracting existing archives in", raw_dir)
    for zip_file in zip_files:
        extract_zip(zip_file, raw_dir)
    for tar_file in tar_files:
        extract_tar(tar_file, raw_dir)


def download_kaggle():
    """Fetch the Kaggle tarball into ``RAW_DIR / "kaggle"`` and unpack it there.

    Returns:
        The dataset's raw folder.
    """
    target_dir = RAW_DIR / "kaggle"
    target_dir.mkdir(parents=True, exist_ok=True)

    archive = target_dir / "kaggle-dataset-6classes.tar"
    download_with_progress(KAGGLE_URL, archive)

    print("Extracting kaggle...")
    extract_tar(archive, target_dir)
    return target_dir


def _zenodo_file_url(record_id: str, filename: str) -> str:
    return f"https://zenodo.org/record/{record_id}/files/{filename}?download=1"


def download_zenodo(zenodo_id: str, out_dir: Path):
    """Download a whole Zenodo record with ``zenodo_get`` and unpack its archives.

    Args:
        zenodo_id: Record id passed to ``zenodo_get -r``.
        out_dir: Folder that receives the files; created if missing.

    Returns:
        ``out_dir``.

    Raises:
        subprocess.CalledProcessError: If ``zenodo_get`` exits with a non-zero status.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    command = ["zenodo_get", "-r", zenodo_id, "-o", str(out_dir)]
    print("Running:", " ".join(command))
    subprocess.check_call(command)

    # Snapshot both listings before extracting anything.
    zips = sorted(out_dir.glob("*.zip"))
    tars = sorted(out_dir.glob("*.tar*"))
    if zips or tars:
        print("Extracting Zenodo archives...")
    for archive in zips:
        extract_zip(archive, out_dir)
    for archive in tars:
        extract_tar(archive, out_dir)
    return out_dir


def download_pskus():
    """Download the PSKUS dataset into ``RAW_DIR / "pskus"``.

    With ``PSKUS_DATASET_IDS`` unset the whole Zenodo record is fetched.
    Otherwise only the listed ``DataSet<id>.zip`` files are downloaded and
    unpacked, plus the split CSV when ``DOWNLOAD_PSKUS_SPLIT_CSV`` is true.

    Returns:
        The dataset's raw folder.
    """
    target_dir = RAW_DIR / "pskus"
    target_dir.mkdir(parents=True, exist_ok=True)

    if PSKUS_DATASET_IDS is None:
        return download_zenodo(PSKUS_ZENODO, target_dir)

    for ds_id in PSKUS_DATASET_IDS:
        archive = target_dir / f"DataSet{ds_id}.zip"
        download_with_progress(_zenodo_file_url(PSKUS_ZENODO, archive.name), archive)
        extract_zip(archive, target_dir)

    if DOWNLOAD_PSKUS_SPLIT_CSV:
        split_csv = target_dir / "statistics-with-locations.csv"
        download_with_progress(_zenodo_file_url(PSKUS_ZENODO, split_csv.name), split_csv)
    return target_dir

def download_metc():
    """Download the METC dataset into ``RAW_DIR / "metc"``.

    With ``METC_INTERFACE_IDS`` unset the whole Zenodo record is fetched.
    Otherwise only the listed ``Interface_number_<id>.zip`` files are
    downloaded and unpacked, plus ``summary.csv`` and ``statistics.csv`` when
    ``DOWNLOAD_METC_CSV`` is true.

    Returns:
        The dataset's raw folder.
    """
    target_dir = RAW_DIR / "metc"
    target_dir.mkdir(parents=True, exist_ok=True)

    if METC_INTERFACE_IDS is None:
        return download_zenodo(METC_ZENODO, target_dir)

    for interface_id in METC_INTERFACE_IDS:
        archive = target_dir / f"Interface_number_{interface_id}.zip"
        download_with_progress(_zenodo_file_url(METC_ZENODO, archive.name), archive)
        extract_zip(archive, target_dir)

    if not DOWNLOAD_METC_CSV:
        return target_dir
    for csv_name in ("summary.csv", "statistics.csv"):
        download_with_progress(_zenodo_file_url(METC_ZENODO, csv_name), target_dir / csv_name)
    return target_dir

def download_synthetic():
    """Download the synthetic dataset zips with ``gdown`` and unpack each one.

    At most ``SYNTHETIC_MAX_ZIPS`` links are used when that limit is set. Each
    link is saved as ``synth_<n>.zip`` (numbered from 1) unless that file is
    already present, and is then extracted into the dataset folder.

    Returns:
        The dataset's raw folder.
    """
    target_dir = RAW_DIR / "synthetic_blender_rozakar"
    target_dir.mkdir(parents=True, exist_ok=True)

    selected = SYNTHETIC_LINKS if SYNTHETIC_MAX_ZIPS is None else SYNTHETIC_LINKS[:SYNTHETIC_MAX_ZIPS]
    for position, link in enumerate(selected, start=1):
        archive = target_dir / f"synth_{position}.zip"
        if not archive.exists():
            subprocess.check_call(["gdown", "-q", link, "-O", str(archive)])
        extract_zip(archive, target_dir)
    return target_dir

def ensure_dataset(name: str):
    """Return the raw folder of dataset ``name``, downloading it when absent.

    A folder only counts as existing data when it contains at least one entry.
    The downloaders create the folder before fetching anything, so a failed
    download leaves an empty folder behind; treating that as "already
    downloaded" would block every retry.

    Args:
        name: One of the dataset keys handled below.

    Returns:
        Path of the dataset's raw folder.

    Raises:
        ValueError: If ``name`` is not a known dataset and no data folder exists for it.
    """
    raw_dir = RAW_DIR / name
    if raw_dir.is_dir() and any(raw_dir.iterdir()):
        # Unpack archives that an earlier, interrupted run may have left behind.
        _extract_archives_if_needed(raw_dir)
        print("Using existing raw data", raw_dir)
        return raw_dir

    downloaders = {
        "kaggle": download_kaggle,
        "pskus": download_pskus,
        "metc": download_metc,
        "synthetic_blender_rozakar": download_synthetic,
    }
    if name not in downloaders:
        raise ValueError("Unknown dataset " + name)
    return downloaders[name]()


In [None]:

def infer_label_from_path(p: Path) -> int:
    """Guess a class id from a file path.

    Path components are scanned from the file name upwards; the first
    all-digit component that is a valid index into ``CLASS_NAMES`` wins.
    Otherwise the lower-cased path is searched for the ``LABEL_TOKENS``
    substrings in their declared order. Class 0 is returned when nothing matches.
    """
    for component in reversed(Path(p).parts):
        if not component.isdigit():
            continue
        candidate = int(component)
        if 0 <= candidate < len(CLASS_NAMES):
            return candidate

    lowered = str(p).lower()
    return next((idx for token, idx in LABEL_TOKENS.items() if token in lowered), 0)


def _parse_int_from_text(text: str) -> int | None:
    match = re.search(r"(\d+)", text)
    return int(match.group(1)) if match else None


def infer_synthetic_class_id(path: Path) -> int | None:
    """Map a synthetic frame path to a class id via its gesture folder.

    The first path component that contains "gesture" (case-insensitive) and
    also contains a number decides the result: that number is looked up in
    ``SYNTHETIC_GESTURE_TO_CLASS``. ``None`` is returned when no such component
    exists or the gesture number has no mapping.
    """
    for component in path.parts:
        if "gesture" not in component.lower():
            continue
        gesture_no = _parse_int_from_text(component)
        if gesture_no is not None:
            return SYNTHETIC_GESTURE_TO_CLASS.get(gesture_no)
    return None


def synthetic_video_id(path: Path) -> str:
    """Derive a clip identifier for a synthetic frame.

    Uses the last path component starting with "gesture" (case-insensitive)
    together with the two components directly above it, joined by underscores.
    Falls back to the file stem when there is no gesture component or fewer
    than two components precede it.
    """
    components = path.parts
    gesture_positions = [
        pos for pos, component in enumerate(components)
        if component.lower().startswith("gesture")
    ]
    if not gesture_positions or gesture_positions[-1] < 2:
        return path.stem

    last = gesture_positions[-1]
    character, environment, gesture = components[last - 2:last + 1]
    return f"{character}_{environment}_{gesture}"


def parse_frame_idx(path: Path) -> int:
    """Return the first number found in the file stem, or 0 when the stem has no digits."""
    parsed = _parse_int_from_text(path.stem)
    if parsed is None:
        return 0
    return int(parsed)


In [None]:

def _majority_vote(labels, total_movements):
    counts = [0] * total_movements
    for el in labels:
        counts[int(el)] += 1
    best = 0
    for i in range(1, total_movements):
        if counts[best] < counts[i]:
            best = i
    majority = (len(labels) + 2) // 2
    if counts[best] < majority:
        return -1
    return best


def _discount_reaction_indeterminacy(labels, reaction_frames):
    new_labels = [u for u in labels]
    n = len(labels) - 1
    for i in range(n):
        if i == 0 or labels[i] != labels[i + 1] or i == n - 1:
            start = max(0, i - reaction_frames)
            end = i
            for j in range(start, end):
                new_labels[j] = -1
            start = i
            end = min(n + 1, i + reaction_frames)
            for j in range(start, end):
                new_labels[j] = -1
    return new_labels


def _select_frames_to_save(is_washing, codes, movement0_prop=1.0):
    old_code = -1
    old_saved = False
    num_snippets = 0
    mapping = {}
    current_snippet = {}
    for i in range(len(is_washing)):
        new_code = codes[i]
        new_saved = (is_washing[i] == 2 and new_code != -1)
        if new_saved != old_saved:
            if new_saved:
                num_snippets += 1
                current_snippet = {}
            else:
                if old_code != 0 or np.random.rand() < movement0_prop:
                    for key in current_snippet:
                        mapping[key] = current_snippet[key]
        if new_saved:
            current_snippet_frame = len(current_snippet)
            current_snippet[i] = (current_snippet_frame, num_snippets, new_code)
        old_saved = new_saved
        old_code = new_code
    if old_saved:
        if old_code != 0 or np.random.rand() < movement0_prop:
            for key in current_snippet:
                mapping[key] = current_snippet[key]
    return mapping


def _find_annotations_dir(video_path: Path) -> Path | None:
    for parent in video_path.parents:
        ann_dir = parent / "Annotations"
        if ann_dir.exists():
            return ann_dir
    return None


def _load_frame_annotations(video_path: Path, annotator_prefix: str, total_annotators: int):
    """Load the per-frame annotations every annotator made for one video.

    For annotators 1..``total_annotators`` the file
    ``<Annotations>/<annotator_prefix><n>/<video stem>.json`` is read when it
    exists. Each file contributes a list of ``(is_washing, code)`` tuples taken
    from its ``labels`` entries. Files that cannot be read or parsed are
    reported and skipped.

    Returns:
        Tuple ``(annotations, count)`` with one list per successfully loaded
        annotator; ``([], 0)`` when no Annotations folder is found.
    """
    ann_dir = _find_annotations_dir(video_path)
    if not ann_dir:
        return [], 0

    json_name = f"{video_path.stem}.json"
    loaded = []
    for annotator in range(1, total_annotators + 1):
        json_path = ann_dir / f"{annotator_prefix}{annotator}" / json_name
        if not json_path.exists():
            continue
        try:
            with open(json_path, "r") as f:
                payload = json.load(f)
            per_frame = [
                (payload['labels'][i]['is_washing'], payload['labels'][i]['code'])
                for i in range(len(payload['labels']))
            ]
            loaded.append(per_frame)
        except Exception as exc:
            print("Failed to load", json_path, exc)
    return loaded, len(loaded)


def _frame_labels_from_annotations(annotations, total_movements, reaction_frames, code_mapping=None):
    num_annotators = len(annotations)
    if num_annotators == 0:
        return [], []
    if code_mapping is None:
        code_mapping = {i: i for i in range(total_movements)}
    num_frames = len(annotations[0])
    is_washing, codes = [], []
    for frame_num in range(num_frames):
        frame_annotations = [annotations[a][frame_num] for a in range(num_annotators)]
        frame_is_washing_any = any(frame_annotations[a][0] for a in range(num_annotators))
        frame_is_washing_all = all(frame_annotations[a][0] for a in range(num_annotators))
        frame_codes = [frame_annotations[a][1] for a in range(num_annotators)]
        frame_codes = [code_mapping.get(int(code), 0) for code in frame_codes]
        if frame_is_washing_all:
            frame_is_washing = 2
        elif frame_is_washing_any:
            frame_is_washing = 1
        else:
            frame_is_washing = 0
        is_washing.append(frame_is_washing)
        if frame_is_washing:
            codes.append(_majority_vote(frame_codes, total_movements))
        else:
            codes.append(-1)
    is_washing = _discount_reaction_indeterminacy(is_washing, reaction_frames)
    codes = _discount_reaction_indeterminacy(codes, reaction_frames)
    return is_washing, codes


In [None]:

def _find_pskus_split_csv(raw_dir: Path):
    csv_path = raw_dir / "statistics-with-locations.csv"
    if csv_path.exists():
        return csv_path
    candidates = [
        Path.cwd() / "code/edgewash/dataset-pskus/statistics-with-locations.csv",
        Path.cwd() / "edgeWash/code/edgewash/dataset-pskus/statistics-with-locations.csv",
    ]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    return None


def _load_pskus_split(pskus_dir: Path):
    csv_path = pskus_dir / "statistics-with-locations.csv"
    if not csv_path.exists():
        fallback = _find_pskus_split_csv(pskus_dir)
        if fallback is not None:
            csv_path = fallback
            print("Using fallback PSKUS split file:", csv_path)
    if not csv_path.exists():
        print("PSKUS split CSV not found; will use random split later")
        return set(), set()
    testfiles, trainvalfiles = set(), set()
    try:
        import csv as csv_lib
        with open(csv_path, "r") as csv_file:
            reader = csv_lib.reader(csv_file)
            for row in reader:
                if row and row[0] == "filename":
                    continue
                if not row:
                    continue
                filename = row[0]
                location = row[1] if len(row) > 1 else ""
                if location == "Reanim\u0101cija":
                    testfiles.add(filename)
                elif location != "unknown":
                    trainvalfiles.add(filename)
    except Exception as exc:
        print("Failed to read PSKUS split CSV", csv_path, exc)
    return testfiles, trainvalfiles


In [None]:

def extract_frames_from_video(video_path: Path, out_dir: Path, frame_skip: int) -> List[Dict]:
    """Save every ``frame_skip``-th frame of a video as a resized JPEG.

    The class id is inferred once from the video path and attached to every
    frame. Output files are named ``<video stem>_<running index>.jpg``.
    Extraction stops early once ``MAX_FRAMES_PER_VIDEO`` frames were saved
    (when that limit is set).

    Changes from the previous version: the capture is released even when an
    error occurs, ``out_dir`` is created if missing, frames whose JPEG could
    not be written are not reported, and the BGR->RGB->BGR round trip before
    writing was dropped (it produced the same pixels).

    Args:
        video_path: Video file to read.
        out_dir: Folder receiving the JPEG files.
        frame_skip: Keep frames whose index is a multiple of this value.

    Returns:
        One dict per saved frame with keys ``frame_path``, ``class_id``,
        ``video_id`` and ``frame_idx``; empty when the video cannot be opened.
    """
    rows = []
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            return rows
        out_dir.mkdir(parents=True, exist_ok=True)
        base = video_path.stem
        label = infer_label_from_path(video_path)
        idx = 0
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % frame_skip == 0:
                out_path = out_dir / f"{base}_{idx:06d}.jpg"
                # imwrite expects BGR, which is what VideoCapture delivers.
                if cv2.imwrite(str(out_path), cv2.resize(frame, IMG_SIZE)):
                    rows.append({"frame_path": str(out_path), "class_id": label, "video_id": base, "frame_idx": idx})
                    idx += 1
                    if MAX_FRAMES_PER_VIDEO is not None and idx >= MAX_FRAMES_PER_VIDEO:
                        break
            frame_idx += 1
    finally:
        cap.release()
    return rows


def preprocess_images(image_paths: List[Path], out_dir: Path) -> List[Dict]:
    """Resize still images to ``IMG_SIZE`` and save them as JPEGs in ``out_dir``.

    The class id is inferred from each source path and the parent folder name
    serves as ``video_id``. Unreadable images are skipped.

    All outputs land in one flat folder, so two sources with the same stem
    (for example in different class folders) used to overwrite each other
    while both rows kept pointing at the same file. A clashing name now gets a
    numeric suffix; names that do not clash are unchanged. The BGR->RGB->BGR
    round trip before writing was dropped because it produced the same pixels.

    Returns:
        One dict per saved image with keys ``frame_path``, ``class_id``,
        ``video_id`` and ``frame_idx`` (always 0).
    """
    rows = []
    used_names = set()
    out_dir.mkdir(parents=True, exist_ok=True)
    for img_path in tqdm(image_paths, desc="images"):
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        label = infer_label_from_path(img_path)

        out_name = f"{img_path.stem}.jpg"
        duplicate = 1
        while out_name in used_names:
            out_name = f"{img_path.stem}_{duplicate}.jpg"
            duplicate += 1
        used_names.add(out_name)

        out_path = out_dir / out_name
        cv2.imwrite(str(out_path), cv2.resize(img, IMG_SIZE))
        rows.append({"frame_path": str(out_path), "class_id": label, "video_id": img_path.parent.name, "frame_idx": 0})
    return rows


def _split_train_val_by_video(df, train_ratio=0.7, val_ratio=0.15):
    unique_videos = df["video_id"].unique()
    video_to_class = df.groupby("video_id")["class_id"].first()
    val_size = val_ratio / (train_ratio + val_ratio)
    train_videos, val_videos = train_test_split(
        unique_videos,
        test_size=val_size,
        random_state=42,
        stratify=video_to_class[unique_videos],
    )
    train_df = df[df["video_id"].isin(train_videos)].reset_index(drop=True)
    val_df = df[df["video_id"].isin(val_videos)].reset_index(drop=True)
    return train_df, val_df


def split_and_save(df: pd.DataFrame, out_dir: Path) -> Dict[str, Path]:
    """Write ``train.csv``, ``val.csv`` and ``test.csv`` for a frame table.

    When ``df`` has a populated ``split`` column, rows marked "test" become the
    test set and the rest is divided into train/val by video. Otherwise the
    rows are shuffled with a fixed seed and cut 70/15/15.

    If every row is marked "test", train and val are now written empty.
    Previously train received the whole table, which duplicated the test data
    into the training split.

    NOTE: the fallback 70/15/15 cut works on individual frames, so frames of
    one video can end up in different splits.

    Args:
        df: Frame table; an optional ``split`` column holds "test"/other values.
        out_dir: Folder receiving the three CSV files; created if missing.

    Returns:
        Dict with keys ``train``, ``val`` and ``test`` mapping to the CSV paths.
    """
    if "split" in df.columns and df["split"].notna().any():
        is_test = df["split"] == "test"
        test_df = df[is_test].reset_index(drop=True)
        trainval_df = df[~is_test].reset_index(drop=True)
        if not trainval_df.empty:
            train_df, val_df = _split_train_val_by_video(trainval_df)
        else:
            train_df, val_df = trainval_df, trainval_df.copy()
    else:
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)
        n = len(df)
        train_end = int(0.7 * n)
        val_end = int(0.85 * n)
        train_df, val_df, test_df = df.iloc[:train_end], df.iloc[train_end:val_end], df.iloc[val_end:]
    out_dir.mkdir(parents=True, exist_ok=True)
    train_csv = out_dir / "train.csv"
    val_csv = out_dir / "val.csv"
    test_csv = out_dir / "test.csv"
    train_df.to_csv(train_csv, index=False)
    val_df.to_csv(val_csv, index=False)
    test_df.to_csv(test_csv, index=False)
    return {"train": train_csv, "val": val_csv, "test": test_csv}


In [None]:

def preprocess_pskus_dataset(pskus_dir: Path, frames_root: Path) -> pd.DataFrame:
    """Extract labelled frames from the PSKUS videos.

    Videos need annotations from at least two annotators. Frame labels are the
    merged annotator votes. Frames are saved, resized to ``IMG_SIZE``, under
    ``frames_root/<split>/<class id>/``. When the location CSV is available,
    videos listed for neither the test nor the train/val set are skipped;
    without the CSV no ``split`` column is produced.

    Videos are visited in sorted order so the seeded random sub-sampling of
    class-0 snippets is reproducible regardless of filesystem listing order,
    and each capture is released even when processing a video fails.

    Returns:
        DataFrame with ``frame_path``, ``class_id``, ``video_id``,
        ``frame_idx`` and, when a split file was found, ``split``.
    """
    rows = []
    testfiles, trainvalfiles = _load_pskus_split(pskus_dir)
    has_split = bool(testfiles or trainvalfiles)
    movement0_prop = 0.2  # snippets ending in code 0 are kept with this probability
    total_annotators = 8
    total_movements = 8
    fps = 30  # hard-coded frame rate used to size the masking window
    reaction_frames = fps // 2

    for video_path in sorted(pskus_dir.rglob("*.mp4")):
        filename = video_path.name
        if has_split:
            if filename in testfiles:
                split = "test"
            elif filename in trainvalfiles:
                split = "trainval"
            else:
                continue
        else:
            split = None

        annotations, num_annotators = _load_frame_annotations(video_path, "Annotator", total_annotators)
        if num_annotators <= 1:
            continue
        is_washing, codes = _frame_labels_from_annotations(
            annotations, total_movements, reaction_frames, code_mapping=PSKUS_CODE_MAPPING
        )
        mapping = _select_frames_to_save(is_washing, codes, movement0_prop)
        if not mapping:
            continue
        frames_dir = frames_root / (split or "trainval")
        vidcap = cv2.VideoCapture(str(video_path))
        try:
            is_success, image = vidcap.read()
            frame_number = 0
            saved = 0
            while is_success:
                if frame_number in mapping:
                    new_frame_num, snippet_num, code = mapping[frame_number]
                    out_sub = frames_dir / str(code)
                    out_sub.mkdir(parents=True, exist_ok=True)
                    filename_out = f"frame_{new_frame_num}_snippet_{snippet_num}_{video_path.stem}.jpg"
                    save_path = out_sub / filename_out
                    image_resized = cv2.resize(image, IMG_SIZE)
                    cv2.imwrite(str(save_path), image_resized)
                    row = {
                        "frame_path": str(save_path),
                        "class_id": int(code),
                        "video_id": video_path.stem,
                        "frame_idx": new_frame_num,
                    }
                    if split:
                        row["split"] = split
                    rows.append(row)
                    saved += 1
                    if MAX_FRAMES_PER_VIDEO is not None and saved >= MAX_FRAMES_PER_VIDEO:
                        break
                is_success, image = vidcap.read()
                frame_number += 1
        finally:
            vidcap.release()
    return pd.DataFrame(rows)


def preprocess_metc_dataset(metc_dir: Path, frames_root: Path) -> pd.DataFrame:
    """Extract labelled frames from the METC videos.

    Each video is randomly assigned to "test" (probability 0.25) or "trainval".
    Frame labels come from the single annotator's track. Frames are saved,
    resized to ``IMG_SIZE``, under ``frames_root/<split>/<class id>/``.

    Videos are visited in sorted order so that, with the seeded NumPy RNG, the
    same video receives the same split on every machine instead of depending
    on filesystem listing order. Each capture is released even when processing
    a video fails.

    Returns:
        DataFrame with ``frame_path``, ``class_id``, ``video_id``,
        ``frame_idx`` and ``split``.
    """
    rows = []
    total_annotators = 1
    total_movements = 7
    fps = 16  # hard-coded frame rate used to size the masking window
    reaction_frames = fps // 2
    test_proportion = 0.25
    for video_path in sorted(metc_dir.rglob("*.mp4")):
        # One draw per video, made before any skip, so skips do not shift later draws.
        split = "test" if np.random.rand() < test_proportion else "trainval"
        annotations, num_annotators = _load_frame_annotations(video_path, "Annotator_", total_annotators)
        if num_annotators == 0:
            continue
        is_washing, codes = _frame_labels_from_annotations(
            annotations, total_movements, reaction_frames, code_mapping=METC_CODE_MAPPING
        )
        mapping = _select_frames_to_save(is_washing, codes, movement0_prop=1.0)
        if not mapping:
            continue
        frames_dir = frames_root / split
        vidcap = cv2.VideoCapture(str(video_path))
        try:
            is_success, image = vidcap.read()
            frame_number = 0
            saved = 0
            while is_success:
                if frame_number in mapping:
                    new_frame_num, snippet_num, code = mapping[frame_number]
                    out_sub = frames_dir / str(code)
                    out_sub.mkdir(parents=True, exist_ok=True)
                    filename_out = f"frame_{new_frame_num}_snippet_{snippet_num}_{video_path.stem}.jpg"
                    save_path = out_sub / filename_out
                    image_resized = cv2.resize(image, IMG_SIZE)
                    cv2.imwrite(str(save_path), image_resized)
                    rows.append({
                        "frame_path": str(save_path),
                        "class_id": int(code),
                        "video_id": video_path.stem,
                        "frame_idx": new_frame_num,
                        "split": split,
                    })
                    saved += 1
                    if MAX_FRAMES_PER_VIDEO is not None and saved >= MAX_FRAMES_PER_VIDEO:
                        break
                is_success, image = vidcap.read()
                frame_number += 1
        finally:
            vidcap.release()
    return pd.DataFrame(rows)


def preprocess_synthetic_dataset(raw_dir: Path, frames_root: Path) -> pd.DataFrame:
    """Convert the synthetic PNG renders into resized, class-sorted JPEG frames.

    Only PNG files that have a path component named "rgb" (case-insensitive)
    and whose gesture folder maps to a class are used. Frames are written to
    ``frames_root/<class id>/<video id>_<frame index>.jpg``.

    Input files are processed in sorted order so the resulting table, and the
    seeded shuffle applied to it later, is reproducible. The BGR->RGB->BGR
    round trip before writing was dropped because it produced the same pixels.

    Returns:
        DataFrame with ``frame_path``, ``class_id``, ``video_id`` and ``frame_idx``.
    """
    rows = []
    frames_root.mkdir(parents=True, exist_ok=True)
    image_paths = sorted(p for p in raw_dir.rglob("*.png") if p.is_file())
    for img_path in tqdm(image_paths, desc="synthetic"):
        if "rgb" not in [part.lower() for part in img_path.parts]:
            continue
        class_id = infer_synthetic_class_id(img_path)
        if class_id is None:
            continue
        img = cv2.imread(str(img_path))
        if img is None:
            continue
        video_id = synthetic_video_id(img_path)
        frame_idx = parse_frame_idx(img_path)
        out_sub = frames_root / str(class_id)
        out_sub.mkdir(parents=True, exist_ok=True)
        out_path = out_sub / f"{video_id}_{frame_idx:06d}.jpg"
        # imread returns BGR, which is what imwrite expects.
        cv2.imwrite(str(out_path), cv2.resize(img, IMG_SIZE))
        rows.append({
            "frame_path": str(out_path),
            "class_id": int(class_id),
            "video_id": video_id,
            "frame_idx": int(frame_idx),
        })
    return pd.DataFrame(rows)


def preprocess_dataset(name: str) -> Path:
    """Turn the raw files of dataset ``name`` into frames plus split CSVs.

    "pskus", "metc" and "synthetic_blender_rozakar" use their dedicated
    preprocessors. Any other dataset is handled generically: its videos are
    sampled into frames or, when it has no videos, its images are resized.

    File lists of the generic branch are sorted, so ``MAX_VIDEOS_PER_DATASET``
    always selects the same videos and the seeded split is reproducible
    regardless of filesystem listing order.

    Returns:
        The processed folder ``PROCESSED_DIR / name``.

    Raises:
        RuntimeError: If the raw folder has no media or no frame was extracted.
    """
    raw_dir = RAW_DIR / name
    out_dir = PROCESSED_DIR / name
    frames_dir = out_dir / "frames"
    frames_dir.mkdir(parents=True, exist_ok=True)

    if name == "pskus":
        df = preprocess_pskus_dataset(raw_dir, frames_dir)
    elif name == "metc":
        df = preprocess_metc_dataset(raw_dir, frames_dir)
    elif name == "synthetic_blender_rozakar":
        df = preprocess_synthetic_dataset(raw_dir, frames_dir)
    else:
        video_files = sorted(p for p in raw_dir.rglob("*") if p.suffix.lower() in VIDEO_EXTS)
        image_files = sorted(p for p in raw_dir.rglob("*") if p.suffix.lower() in IMAGE_EXTS)
        rows = []
        if video_files:
            if MAX_VIDEOS_PER_DATASET is not None:
                video_files = video_files[:MAX_VIDEOS_PER_DATASET]
            for vp in tqdm(video_files, desc="videos"):
                rows.extend(extract_frames_from_video(vp, frames_dir, FRAME_SKIP))
        elif image_files:
            rows.extend(preprocess_images(image_files, frames_dir))
        else:
            raise RuntimeError("No video or image files found in " + str(raw_dir))
        df = pd.DataFrame(rows)

    if df.empty:
        raise RuntimeError("No frames extracted for " + name)
    split_and_save(df, out_dir)
    return out_dir


In [None]:

def _is_archive(path: Path) -> bool:
    name = path.name.lower()
    return name.endswith((".zip", ".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz"))


def _collect_archives(root: Path, max_hits: int = 5):
    hits = []
    if not root.exists():
        return hits
    for path in root.rglob("*"):
        if path.is_file() and _is_archive(path):
            hits.append(path)
            if len(hits) >= max_hits:
                break
    return hits




def cleanup_archives(root: Path, log_fp=None):
    """Delete every archive file found under ``root`` and report the outcome.

    Deletion stays best-effort: a file that cannot be removed does not abort
    the run. The report now counts only the files that were really removed and
    mentions how many failed, instead of claiming every archive was removed.

    Args:
        root: Folder searched recursively; nothing happens when it is missing.
        log_fp: Optional writable text stream forwarded to ``_log_line``.
    """
    if not root.exists():
        return
    archives = [path for path in root.rglob("*") if path.is_file() and _is_archive(path)]
    if not archives:
        return

    removed = 0
    failed = 0
    for path in archives:
        try:
            path.unlink()
            removed += 1
        except OSError:
            failed += 1

    message = f"Removed {removed} archive files from {root}"
    if failed:
        message += f" ({failed} could not be removed)"
    try:
        _log_line(log_fp, message)
    except Exception:
        # Falls back to plain print when logging itself fails, e.g. if the
        # cell defining _log_line has not been run yet.
        print(message)

def _iter_files(root: Path, exts, max_hits: int = 3):
    hits = []
    if not root.exists():
        return hits
    for path in root.rglob("*"):
        if path.is_file() and path.suffix.lower() in exts:
            hits.append(path)
            if len(hits) >= max_hits:
                break
    return hits


def validate_raw_dataset(name: str, raw_dir: Path, strict_archives: bool = False) -> None:
    """Check that the raw folder of dataset ``name`` looks complete.

    The checks depend on the dataset: expected folder names and the presence of
    videos, annotation JSON files or PNG files. Leftover archive files are
    reported as a warning, or as an error when ``strict_archives`` is true.

    Raises:
        RuntimeError: Listing every failed check, if any check failed.
    """
    problems = []

    def require(ok, message):
        # Record ``message`` when a check did not pass.
        if not ok:
            problems.append(message)

    if not raw_dir.exists():
        problems.append("raw dir missing")
    elif name == "kaggle":
        kaggle_root = raw_dir / "kaggle-dataset-6classes"
        require(kaggle_root.exists(), "kaggle-dataset-6classes folder missing")
        require(_iter_files(kaggle_root, VIDEO_EXTS), "no videos found under kaggle-dataset-6classes")
    elif name == "pskus":
        require(any(p.is_dir() for p in raw_dir.rglob("DataSet*")), "no DataSet* folders found")
        require(_iter_files(raw_dir, VIDEO_EXTS), "no videos found")
        require(_iter_files(raw_dir, (".json",), max_hits=1), "no annotation JSON files found")
        if _find_pskus_split_csv(raw_dir) is None:
            print("WARNING: statistics-with-locations.csv not found; will use random split.")
    elif name == "metc":
        require(any(p.is_dir() for p in raw_dir.rglob("Interface_number_*")), "no Interface_number_* folders found")
        require(_iter_files(raw_dir, VIDEO_EXTS), "no videos found")
        require(_iter_files(raw_dir, (".json",), max_hits=1), "no annotation JSON files found")
    elif name == "synthetic_blender_rozakar":
        require(_iter_files(raw_dir, (".png",), max_hits=3), "no PNG files found")
    else:
        problems.append("unknown dataset name")

    leftovers = _collect_archives(raw_dir)
    if leftovers:
        msg = "archive files still present: " + ", ".join(p.name for p in leftovers)
        if strict_archives:
            problems.append(msg)
        else:
            print("WARN:", msg)

    if problems:
        raise RuntimeError(f"Raw dataset validation failed for {name}: " + "; ".join(problems))


def validate_processed_dataset(out_dir: Path, max_rows: int = 20) -> None:
    """Spot-check the split CSVs written for a processed dataset.

    For ``train.csv``, ``val.csv`` and ``test.csv`` the first ``max_rows`` rows
    are inspected: the file must exist, be non-empty, contain the required
    columns, reference frame files that exist and carry class ids within
    ``[0, NUM_CLASSES)``. Per split, checking stops at the first bad row.

    Only ``max_rows`` rows are parsed (``nrows``) instead of loading the whole
    CSV and discarding the rest.

    Raises:
        RuntimeError: Listing every problem found, if any.
    """
    required = {"frame_path", "class_id", "video_id", "frame_idx"}
    errors = []
    for split in ("train", "val", "test"):
        csv_path = out_dir / f"{split}.csv"
        if not csv_path.exists():
            errors.append(f"missing {split}.csv")
            continue
        df = pd.read_csv(csv_path, nrows=max_rows)
        if df.empty:
            errors.append(f"{split}.csv has no rows")
            continue
        missing = required - set(df.columns)
        if missing:
            errors.append(f"{split}.csv missing columns: {sorted(missing)}")
            continue
        for row in df.itertuples():
            frame_path = Path(row.frame_path)
            if not frame_path.exists():
                errors.append(f"missing frame file: {frame_path}")
                break
            if not (0 <= int(row.class_id) < NUM_CLASSES):
                errors.append(f"class_id out of range: {row.class_id}")
                break
    if errors:
        raise RuntimeError("Processed dataset validation failed: " + "; ".join(errors))


In [None]:
def _safe_slug(text: str) -> str:
    return re.sub(r"[^A-Za-z0-9]+", "_", text).strip("_").lower()


def _log_line(log_fp, text: str):
    print(text)
    if log_fp:
        log_fp.write(text + "\n")
        log_fp.flush()


def _dir_size_bytes(path: Path) -> int:
    total = 0
    if not path.exists():
        return total
    for p in path.rglob("*"):
        if p.is_file():
            total += p.stat().st_size
    return total


def _count_files_with_ext(root: Path, exts) -> int:
    count = 0
    if not root.exists():
        return count
    for p in root.rglob("*"):
        if p.is_file() and p.suffix.lower() in exts:
            count += 1
    return count


def show_sample_videos(raw_dir: Path, max_videos: int = 2, log_fp=None):
    """Display up to ``max_videos`` videos found under ``raw_dir`` inline in the notebook.

    Each shown video's path is logged first. When no video exists, a single
    "No videos found" line is logged instead.
    """
    found = [p for p in raw_dir.rglob("*") if p.suffix.lower() in VIDEO_EXTS]
    if len(found) == 0:
        _log_line(log_fp, "No videos found")
        return
    for sample in found[:max_videos]:
        _log_line(log_fp, f"Video sample: {sample}")
        player = Video(str(sample), embed=True)
        display(player)


def plot_class_distribution(df: pd.DataFrame, title: str, save_path: Path | None = None, show: bool = True):
    if df.empty:
        print(f"No data for {title}")
        return
    counts = df["class_id"].value_counts().sort_index()
    labels = [CLASS_NAMES[i] if i < len(CLASS_NAMES) else str(i) for i in counts.index]
    fig = plt.figure(figsize=(10, 4))
    sns.barplot(x=labels, y=counts.values)
    plt.xticks(rotation=45, ha="right")
    plt.title(title)
    plt.tight_layout()
    if save_path is not None:
        fig.savefig(save_path, dpi=150)
    if show:
        plt.show()
    else:
        plt.close(fig)


def show_samples_by_class(df: pd.DataFrame, title: str, n_per_class: int = 3,
                          save_path: Path | None = None, show: bool = True):
    """Render a grid of sample frames with one row per class.

    The grid has ``NUM_CLASSES`` rows and ``n_per_class`` columns. Classes
    without rows in ``df`` get "No samples" placeholders, unreadable frames are
    marked "missing", and rows with fewer samples than columns are padded with
    blank cells.

    Args:
        df: Frame with ``class_id`` and ``frame_path`` columns.
        title: Figure super-title.
        n_per_class: Number of columns, i.e. samples drawn per class.
        save_path: When given, the figure is written there at 150 dpi.
        show: Display the figure when true; otherwise close it.
    """
    fig = plt.figure(figsize=(3 * n_per_class, 2.5 * NUM_CLASSES))
    for class_id in range(NUM_CLASSES):
        # 1-based, row-major index of the first cell in this class's row.
        # Cells are addressed from (row, column) rather than a running counter
        # so the layout stays aligned for every n_per_class >= 1; a
        # modulo-based padding test can never terminate when n_per_class == 1.
        row_start = class_id * n_per_class + 1
        subset = df[df["class_id"] == class_id]
        if subset.empty:
            for col in range(n_per_class):
                plt.subplot(NUM_CLASSES, n_per_class, row_start + col)
                plt.text(0.5, 0.5, f"No samples\n{CLASS_NAMES[class_id]}", ha="center", va="center")
                plt.axis("off")
            continue

        # Fixed seed keeps the displayed samples stable between runs.
        sample = subset.sample(min(n_per_class, len(subset)), replace=False, random_state=42)
        frame_paths = sample["frame_path"].tolist()
        for col in range(n_per_class):
            plt.subplot(NUM_CLASSES, n_per_class, row_start + col)
            if col >= len(frame_paths):
                # Fewer samples than columns: leave a blank cell.
                plt.axis("off")
                continue
            img = cv2.imread(frame_paths[col])
            if img is None:
                # cv2.imread returns None when the file cannot be read.
                plt.text(0.5, 0.5, "missing", ha="center", va="center")
                plt.axis("off")
                continue
            # OpenCV loads BGR; matplotlib expects RGB.
            plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
            plt.title(CLASS_NAMES[class_id], fontsize=8)
            plt.axis("off")

    plt.suptitle(title)
    plt.tight_layout()
    if save_path is not None:
        fig.savefig(save_path, dpi=150)
    if show:
        plt.show()
    else:
        plt.close(fig)


def log_dataframe_info(df: pd.DataFrame, label: str, log_fp=None) -> None:
    """Write ``df.info()`` and ``df.describe()`` output for ``df`` to ``log_fp``.

    Output goes to the log handle only. The call does nothing when ``log_fp``
    is ``None``.

    Args:
        df: Frame to summarise.
        label: Prefix for the "info" and "describe" section headers.
        log_fp: Writable text handle, or ``None``.
    """
    if log_fp is None:
        return

    log_fp.write(f"\n{label} info:\n")
    df.info(buf=log_fp)
    log_fp.write("\n" + f"{label} describe:\n")
    try:
        summary_text = df.describe(include="all").to_string()
    except Exception:
        # Fall back to the default (numeric-only) summary.
        summary_text = df.describe().to_string()
    log_fp.write(summary_text)
    log_fp.write("\n")
    log_fp.flush()


def print_dataset_info(df: pd.DataFrame, label: str, log_fp=None) -> None:
    """Log headline figures for one split: row count, videos and value ranges.

    The video count is reported only when a ``video_id`` column exists. The
    ``frame_idx`` and ``class_id`` ranges are reported only when the respective
    column exists and ``df`` has rows.

    Args:
        df: Split to describe.
        label: Prefix placed in front of every logged line.
        log_fp: Optional log handle forwarded to ``_log_line``.
    """
    _log_line(log_fp, f"{label} samples: {len(df)}")

    columns = df.columns
    has_rows = not df.empty

    if "video_id" in columns:
        _log_line(log_fp, f"{label} videos: {df['video_id'].nunique()}")
    if has_rows and "frame_idx" in columns:
        _log_line(log_fp, f"{label} frame_idx range: {df['frame_idx'].min()}..{df['frame_idx'].max()}")
    if has_rows and "class_id" in columns:
        _log_line(log_fp, f"{label} class_id range: {df['class_id'].min()}..{df['class_id'].max()}")


def log_dataset_sizes(raw_dir: Path, out_dir: Path, train_df: pd.DataFrame,
                      val_df: pd.DataFrame, test_df: pd.DataFrame,
                      log_fp=None, log_dir: Path | None = None) -> Dict:
    """Collect, log and optionally persist size statistics for one dataset.

    Args:
        raw_dir: Raw dataset directory, scanned for video and image files.
        out_dir: Processed dataset directory.
        train_df: Training split rows.
        val_df: Validation split rows.
        test_df: Test split rows.
        log_fp: Optional log handle forwarded to ``_log_line``.
        log_dir: When given, the statistics are also written to
            ``dataset_stats.json`` inside it.

    Returns:
        The statistics dictionary. The ``*_size_mb`` entries are present only
        when ``CALCULATE_DISK_USAGE`` is truthy.
    """
    combined = pd.concat([train_df, val_df, test_df], ignore_index=True)
    if 'video_id' in combined.columns:
        unique_videos = int(combined['video_id'].nunique())
    else:
        unique_videos = 0

    stats = {
        "raw_dir": str(raw_dir),
        "processed_dir": str(out_dir),
        "raw_video_count": _count_files_with_ext(raw_dir, VIDEO_EXTS),
        "raw_image_count": _count_files_with_ext(raw_dir, IMAGE_EXTS),
        "train_rows": int(len(train_df)),
        "val_rows": int(len(val_df)),
        "test_rows": int(len(test_df)),
        "processed_frame_count": int(len(combined)),
        "processed_unique_videos": unique_videos,
    }

    if CALCULATE_DISK_USAGE:
        bytes_per_mb = 1024 * 1024
        for key, directory in (("raw_size_mb", raw_dir), ("processed_size_mb", out_dir)):
            stats[key] = round(_dir_size_bytes(directory) / bytes_per_mb, 2)

    _log_line(log_fp, "Dataset size summary:")
    for key in stats:
        _log_line(log_fp, f"{key}: {stats[key]}")

    if log_dir is not None:
        with open(log_dir / "dataset_stats.json", "w") as handle:
            handle.write(json.dumps(stats, indent=2))
    return stats


def save_sample_frames_by_class(df: pd.DataFrame, out_dir: Path, n_per_class: int,
                                log_fp=None) -> pd.DataFrame:
    """Copy up to ``n_per_class`` sampled frames of every class into ``out_dir``.

    Frames land in one sub-directory per class, named
    ``<class_id>_<slug of class name>``, and are renamed to
    ``<video_id>_<frame_idx zero-padded to 6 digits><original suffix>``.
    Copying is best-effort: a missing source file or a failed copy is logged
    and skipped rather than aborting the run.

    Args:
        df: Frame with ``class_id``, ``frame_path``, ``video_id`` and
            ``frame_idx`` columns.
        out_dir: Destination root; created if needed.
        n_per_class: Maximum number of frames sampled per class.
        log_fp: Optional log handle forwarded to ``_log_line``.

    Returns:
        One row per frame actually copied, with the class id and name, the
        destination and source paths, the video id and the frame index.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    saved_rows = []
    for class_id in range(NUM_CLASSES):
        class_name = CLASS_NAMES[class_id]
        subset = df[df["class_id"] == class_id]
        if subset.empty:
            _log_line(log_fp, f"No samples for class {class_id} ({class_name})")
            continue
        # Fixed seed keeps the exported samples stable between runs.
        sample = subset.sample(min(n_per_class, len(subset)), random_state=42)
        class_dir = out_dir / f"{class_id}_{_safe_slug(class_name)}"
        class_dir.mkdir(parents=True, exist_ok=True)
        for _, row in sample.iterrows():
            src = Path(row.frame_path)
            if not src.exists():
                _log_line(log_fp, f"Skipping missing sample frame: {src}")
                continue
            dst_name = f"{row.video_id}_{int(row.frame_idx):06d}{src.suffix}"
            dst = class_dir / dst_name
            try:
                shutil.copy2(src, dst)
            except OSError as exc:
                # Keep going, but leave a trace of what could not be copied.
                _log_line(log_fp, f"Failed to copy sample frame {src} -> {dst}: {exc}")
                continue
            saved_rows.append({
                "class_id": int(class_id),
                "class_name": class_name,
                "frame_path": str(dst),
                "source_frame": str(src),
                "video_id": str(row.video_id),
                "frame_idx": int(row.frame_idx),
            })
    return pd.DataFrame(saved_rows)


def save_sample_videos_by_class(df: pd.DataFrame, out_dir: Path, frames_per_video: int,
                                fps: int, log_fp=None) -> None:
    """Write one short preview video per class, assembled from extracted frames.

    For each class, the video contributing the most rows is chosen and its
    first ``frames_per_video`` frames (ordered by ``frame_idx``) are encoded
    with the ``mp4v`` fourcc into
    ``class_<class_id>_<slug of class name>.mp4``. Frames that cannot be read
    are skipped, and frames whose size differs from the first frame are
    resized to match it. Classes with no rows, or whose first frame is
    unreadable, are skipped.

    Args:
        df: Frame with ``class_id``, ``video_id``, ``frame_idx`` and
            ``frame_path`` columns.
        out_dir: Destination directory; created if needed.
        frames_per_video: Maximum number of frames per preview.
        fps: Frame rate passed to the video writer.
        log_fp: Optional log handle forwarded to ``_log_line``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for class_id in range(NUM_CLASSES):
        subset = df[df["class_id"] == class_id]
        if subset.empty:
            continue
        counts = subset.groupby("video_id").size().sort_values(ascending=False)
        if counts.empty:
            continue
        best_video = counts.index[0]
        frames_df = subset[subset["video_id"] == best_video].sort_values("frame_idx").head(frames_per_video)
        if frames_df.empty:
            continue
        # The first frame fixes the output resolution.
        first = cv2.imread(frames_df.iloc[0].frame_path)
        if first is None:
            continue
        height, width = first.shape[:2]
        out_path = out_dir / f"class_{class_id}_{_safe_slug(CLASS_NAMES[class_id])}.mp4"
        writer = cv2.VideoWriter(str(out_path), cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height))
        if not writer.isOpened():
            # Without this check, frames written to an unopened writer would
            # still be counted and the video reported as saved.
            writer.release()
            out_path.unlink(missing_ok=True)
            _log_line(log_fp, f"Could not open video writer for {out_path}")
            continue
        written = 0
        try:
            for _, row in frames_df.iterrows():
                img = cv2.imread(row.frame_path)
                if img is None:
                    continue
                if img.shape[:2] != (height, width):
                    img = cv2.resize(img, (width, height))
                writer.write(img)
                written += 1
        finally:
            # Release the writer even if reading or resizing a frame raises.
            writer.release()
        if written == 0:
            out_path.unlink(missing_ok=True)
        else:
            _log_line(log_fp, f"Saved class video: {out_path} ({written} frames)")


def _infer_raw_label_token(path: Path) -> tuple[str | None, int | None]:
    """Guess a class label from the components of ``path``, deepest first.

    Each component (including the file name) is checked in two ways: a purely
    numeric component within ``[0, NUM_CLASSES)`` is taken as the class id
    itself; otherwise the first ``LABEL_TOKENS`` key contained in the
    lowercased component decides the class.

    Returns:
        ``(component, class_id)`` for the first component that matches, or
        ``(None, None)`` when nothing matches.
    """
    for component in reversed(path.parts):
        if component.isdigit():
            numeric = int(component)
            if 0 <= numeric < NUM_CLASSES:
                return component, numeric
        lowered = component.lower()
        matched = next(
            (cls for token, cls in LABEL_TOKENS.items() if token in lowered),
            None,
        )
        # Compare against None explicitly: class id 0 is a valid match.
        if matched is not None:
            return component, matched
    return None, None


def collect_dataset_label_mapping(name: str, raw_dir: Path) -> pd.DataFrame:
    """Tabulate how a dataset's raw labels map onto the canonical classes.

    For ``"kaggle"`` the mapping is discovered by scanning the video and image
    files under ``raw_dir`` and counting files per (raw label, class) pair.
    For ``"pskus"``, ``"metc"`` and ``"synthetic_blender_rozakar"`` it is read
    from the corresponding static mapping table and ``count`` is ``None``.

    Args:
        name: Dataset identifier.
        raw_dir: Raw dataset directory (only read for ``"kaggle"``).

    Returns:
        Frame with ``dataset``, ``raw_label``, ``mapped_class_id``,
        ``class_name`` and ``count`` columns; empty for an unrecognised name.
    """
    def _entry(raw_label, mapped, count):
        return {
            "dataset": name,
            "raw_label": raw_label,
            "mapped_class_id": int(mapped),
            "class_name": CLASS_NAMES[mapped],
            "count": count,
        }

    if name == "kaggle":
        media_exts = VIDEO_EXTS + IMAGE_EXTS
        tally = {}
        for path in raw_dir.rglob("*"):
            if not path.is_file() or path.suffix.lower() not in media_exts:
                continue
            raw_label, mapped = _infer_raw_label_token(path)
            if raw_label is None or mapped is None:
                continue
            key = (str(raw_label), int(mapped))
            tally[key] = tally.get(key, 0) + 1
        # Order by class id first, then by raw label.
        ordered = sorted(tally.items(), key=lambda item: (item[0][1], item[0][0]))
        return pd.DataFrame([_entry(raw, cls, int(n)) for (raw, cls), n in ordered])

    static_tables = {
        "pskus": PSKUS_CODE_MAPPING,
        "metc": METC_CODE_MAPPING,
        "synthetic_blender_rozakar": SYNTHETIC_GESTURE_TO_CLASS,
    }
    table = static_tables.get(name)
    if table is None:
        return pd.DataFrame([])
    return pd.DataFrame([
        _entry(str(raw), mapped, None) for raw, mapped in sorted(table.items())
    ])


def summarize_class_mapping(mapping_df: pd.DataFrame) -> pd.DataFrame:
    """Pivot per-dataset label mappings into one row per canonical class.

    Args:
        mapping_df: Frame with ``dataset``, ``raw_label`` and
            ``mapped_class_id`` columns.

    Returns:
        ``mapping_df`` itself when it is empty. Otherwise a frame with
        ``class_id`` and ``class_name`` columns plus one column per dataset
        (sorted by name), each holding the sorted, comma-separated raw labels
        mapped to that class, or ``"-"`` when there are none.
    """
    if mapping_df.empty:
        return mapping_df

    dataset_names = sorted(mapping_df["dataset"].unique())

    def _labels_for(dataset, class_id):
        mask = (mapping_df["dataset"] == dataset) & (mapping_df["mapped_class_id"] == class_id)
        unique = {str(v) for v in mapping_df[mask]["raw_label"].dropna().tolist()}
        return ", ".join(sorted(unique)) if unique else "-"

    summary_rows = []
    for class_id, class_name in enumerate(CLASS_NAMES):
        entry = {"class_id": class_id, "class_name": class_name}
        for dataset in dataset_names:
            entry[dataset] = _labels_for(dataset, class_id)
        summary_rows.append(entry)
    return pd.DataFrame(summary_rows)


def show_processed_summary(train_df, val_df, test_df, dataset_name: str,
                           log_dir: Path | None = None, log_fp=None):
    """Report, plot and optionally export a summary of a processed dataset.

    In order: logs per-split headline figures, plots class distributions for
    all splits combined and for each split, writes the combined class counts
    to CSV, shows a grid of sample frames per class, exports sample frames and
    videos when ``SAVE_SAMPLE_MEDIA`` is set, and finally logs and displays
    the canonical class table.

    Args:
        train_df: Training split rows.
        val_df: Validation split rows.
        test_df: Test split rows.
        dataset_name: Name used in log lines and plot titles.
        log_dir: When given, plots, CSVs and sample media are saved beneath it.
        log_fp: Optional log handle.
    """
    for split_df, split_name in ((train_df, "train"), (val_df, "val"), (test_df, "test")):
        print_dataset_info(split_df, f"{dataset_name} {split_name}", log_fp)

    combined = pd.concat([train_df, val_df, test_df], ignore_index=True)
    plots_dir = None
    if log_dir is not None:
        plots_dir = log_dir / "plots"
        plots_dir.mkdir(parents=True, exist_ok=True)

    def _plot_target(filename):
        # Plots are only saved when a log directory was supplied.
        return plots_dir / filename if plots_dir else None

    distribution_plots = (
        (combined, f"{dataset_name} class distribution (all splits)", "class_distribution_all.png"),
        (train_df, f"{dataset_name} class distribution (train)", "class_distribution_train.png"),
        (val_df, f"{dataset_name} class distribution (val)", "class_distribution_val.png"),
        (test_df, f"{dataset_name} class distribution (test)", "class_distribution_test.png"),
    )
    for frame, plot_title, filename in distribution_plots:
        plot_class_distribution(frame, plot_title, save_path=_plot_target(filename), show=True)

    counts = combined["class_id"].value_counts().sort_index()
    class_labels = [CLASS_NAMES[i] if i < len(CLASS_NAMES) else str(i) for i in counts.index]
    counts_df = pd.DataFrame({
        "class_id": counts.index,
        "class_name": class_labels,
        "count": counts.values,
    })
    if log_dir is not None:
        counts_df.to_csv(log_dir / "class_distribution.csv", index=False)

    show_samples_by_class(combined, f"{dataset_name} samples by class",
                          n_per_class=SAMPLES_PER_CLASS,
                          save_path=_plot_target("samples_grid.png"),
                          show=True)

    if SAVE_SAMPLE_MEDIA and log_dir is not None:
        samples_root = log_dir / "samples"
        frames_dir = samples_root / "frames"
        videos_dir = samples_root / "videos"
        for media_dir in (frames_dir, videos_dir):
            media_dir.mkdir(parents=True, exist_ok=True)
        saved_df = save_sample_frames_by_class(combined, frames_dir, SAMPLES_PER_CLASS, log_fp)
        if not saved_df.empty:
            saved_df.to_csv(samples_root / "sample_frames.csv", index=False)
        save_sample_videos_by_class(combined, videos_dir, SAMPLE_VIDEO_FRAMES, SAMPLE_VIDEO_FPS, log_fp)

    mapping_df = pd.DataFrame({"class_id": list(range(NUM_CLASSES)), "class_name": CLASS_NAMES})
    _log_line(log_fp, "Class mapping:")
    if log_fp:
        # The table itself goes to the log file only; the notebook shows it
        # through display() below.
        log_fp.write(mapping_df.to_string(index=False) + "\n")
    if log_dir is not None:
        mapping_df.to_csv(log_dir / "class_mapping_canonical.csv", index=False)
    display(mapping_df)


In [None]:
def cleanup_dataset_artifacts(name: str, raw_dir: Path, out_dir: Path, log_fp=None) -> None:
    """Delete a dataset's raw and processed directories.

    Does nothing unless ``CLEANUP_BETWEEN_DATASETS`` is truthy. Removal errors
    are ignored.

    Args:
        name: Dataset name, used in the log line.
        raw_dir: Raw dataset directory to remove.
        out_dir: Processed dataset directory to remove.
        log_fp: Optional log handle forwarded to ``_log_line``.
    """
    if CLEANUP_BETWEEN_DATASETS:
        _log_line(log_fp, f"Cleaning up dataset artifacts for {name}")
        for target in (raw_dir, out_dir):
            shutil.rmtree(target, ignore_errors=True)


def process_and_validate_dataset(name: str):
    """Run the end-to-end validation pipeline for one dataset.

    Steps, in order: obtain the raw data via ``ensure_dataset``, optionally
    remove archives, validate the raw tree, display sample videos, run
    ``preprocess_dataset`` when any of the split CSVs is missing, validate the
    processed output, then log size statistics and DataFrame summaries, show
    the processed summary and record the raw-to-canonical label mapping. A
    non-empty mapping is appended to the global ``ALL_MAPPINGS``.

    When ``SAVE_LOGS`` is truthy, output is mirrored to
    ``LOG_DIR/<name>/validation.log``. Exceptions from any step propagate to
    the caller after cleanup and after the log file has been closed.

    Args:
        name: Dataset identifier.
    """
    _log_line(None, f"\n=== Processing {name} ===")
    log_dir = LOG_DIR / name if SAVE_LOGS else None
    log_fp = None
    raw_dir = None
    # Determined up front so the cleanup in ``finally`` can still run when a
    # step fails before preprocessing is reached (e.g. raw validation).
    out_dir = PROCESSED_DIR / name
    success = False

    if log_dir is not None:
        log_dir.mkdir(parents=True, exist_ok=True)
        log_fp = open(log_dir / "validation.log", "w")

    try:
        raw_dir = ensure_dataset(name)
        if CLEANUP_ARCHIVES:
            cleanup_archives(raw_dir, log_fp)
        validate_raw_dataset(name, raw_dir)
        show_sample_videos(raw_dir, max_videos=RAW_SAMPLE_VIDEOS, log_fp=log_fp)

        train_csv = out_dir / "train.csv"
        val_csv = out_dir / "val.csv"
        test_csv = out_dir / "test.csv"
        # Reuse existing splits; preprocess only when one of them is missing.
        if not (train_csv.exists() and val_csv.exists() and test_csv.exists()):
            preprocess_dataset(name)
        validate_processed_dataset(out_dir)

        train_df = pd.read_csv(train_csv)
        val_df = pd.read_csv(val_csv)
        test_df = pd.read_csv(test_csv)

        log_dataset_sizes(raw_dir, out_dir, train_df, val_df, test_df, log_fp, log_dir)
        log_dataframe_info(train_df, f"{name} train", log_fp)
        log_dataframe_info(val_df, f"{name} val", log_fp)
        log_dataframe_info(test_df, f"{name} test", log_fp)

        show_processed_summary(train_df, val_df, test_df, name, log_dir=log_dir, log_fp=log_fp)

        mapping_df = collect_dataset_label_mapping(name, raw_dir)
        if mapping_df is not None and not mapping_df.empty:
            if log_dir is not None:
                mapping_df.to_csv(log_dir / "class_mapping.csv", index=False)
            display(mapping_df)
            ALL_MAPPINGS.append(mapping_df)
        success = True
    finally:
        # raw_dir stays None only when ensure_dataset itself failed, in which
        # case there is no known raw directory to remove.
        if CLEANUP_BETWEEN_DATASETS and raw_dir is not None:
            if success or CLEANUP_ON_FAILURE:
                cleanup_dataset_artifacts(name, raw_dir, out_dir, log_fp)
        if log_fp is not None:
            log_fp.close()


# Run the full pipeline for each dataset in turn. process_and_validate_dataset
# does not catch exceptions, so a failure in one dataset stops the loop.
for ds in DATASETS:
    process_and_validate_dataset(ds)

# ALL_MAPPINGS is filled by process_and_validate_dataset with one label-mapping
# frame per dataset that produced a non-empty mapping.
if ALL_MAPPINGS:
    all_mapping_df = pd.concat(ALL_MAPPINGS, ignore_index=True)
    # One row per canonical class, one column per dataset.
    summary_df = summarize_class_mapping(all_mapping_df)
    display(summary_df)
    if SAVE_LOGS:
        # Persist both the long-form mapping and the per-class summary.
        all_mapping_df.to_csv(LOG_DIR / "class_mapping_all.csv", index=False)
        summary_df.to_csv(LOG_DIR / "class_mapping_summary.csv", index=False)
