# Dataset Overview

## Data Layout Discovery

In [5]:
from pathlib import Path
# Dataset root scanned by this notebook.
# NOTE(review): this is an absolute path under one user's home directory;
# point it at your own copy of the dataset before running.
DATA_ROOT = Path("/Users/nanzhu/code/Isaac-GR00T/demo_data/cube_to_bowl_5")
# Glob pattern selecting chunk directories; passed to discover_layout() below.
CHUNK_GLOB = "chunk-*"

In [6]:
import re
from textwrap import indent
from dataclasses import dataclass, asdict
from typing import Dict, List, Optional, Tuple

@dataclass
class DatasetLayout:
    """Snapshot of what a dataset directory contains, as built by ``discover_layout``.

    All paths are derived from ``root``; the lists are stored sorted by
    ``discover_layout``.
    """

    # Dataset root and its three sub-directories
    # (root / "meta", root / "data", root / "videos").
    root: Path
    meta_dir: Path
    data_dir: Path
    videos_dir: Path
    # Sorted union of chunk directory names found under data/ and videos/.
    chunks: List[str]

    # Episodes: every episode_*.parquet found under data/<chunk>/, the sorted
    # unique episode indices parsed from those file names, and the
    # index -> parquet path lookup.
    parquet_files: List[Path]
    episode_indices: List[int]
    episode_to_parquet: Dict[int, Path]

    # Cameras / video keys: names of the sub-directories of videos/<chunk>/.
    video_keys: List[str]
    # Mapping: video_key -> sorted list of episode_*.mp4 files for that key.
    video_files: Dict[str, List[Path]]

    # Meta files: regular files directly under meta/.
    meta_files: Dict[str, Path]  # filename -> path

def _parse_episode_index(p: Path) -> Optional[int]:
    # episode_000123.parquet or episode_000123.mp4
    m = re.search(r"episode_(\d+)\.(parquet|mp4)$", p.name)
    return int(m.group(1)) if m else None

def discover_layout(root: Path, chunk_glob: str = "chunk-*") -> DatasetLayout:
    """Scan ``root`` and record what is stored under meta/, data/ and videos/.

    Args:
        root: Dataset root directory.
        chunk_glob: Glob pattern selecting the chunk directories inside
            ``data/`` and ``videos/``.

    Returns:
        A ``DatasetLayout`` with sorted chunk names, parquet files, episode
        indices, video keys, per-key video files and the meta file lookup.

    Raises:
        FileNotFoundError: If ``root`` does not exist.
    """
    if not root.exists():
        raise FileNotFoundError(f"DATA_ROOT not found: {root}")

    meta_dir, data_dir, videos_dir = root / "meta", root / "data", root / "videos"

    def chunk_names(parent: Path) -> set:
        # Names of the chunk directories directly under `parent`
        # (empty when `parent` itself is absent).
        if not parent.exists():
            return set()
        return {entry.name for entry in parent.glob(chunk_glob) if entry.is_dir()}

    # A chunk counts if it shows up under data/ or videos/ (or both).
    chunks = sorted(chunk_names(data_dir) | chunk_names(videos_dir))

    # Every episode parquet across all chunks, in sorted path order.
    parquet_files = sorted(
        pq_path
        for chunk in chunks
        for pq_path in (data_dir / chunk).glob("episode_*.parquet")
    )

    # index -> parquet; when two files share an index, the later path
    # in sorted order wins.
    episode_to_parquet = {}
    for pq_path in parquet_files:
        index = _parse_episode_index(pq_path)
        if index is not None:
            episode_to_parquet[index] = pq_path
    episode_indices = sorted(episode_to_parquet)

    # Video keys live at videos/<chunk>/<video_key>/episode_*.mp4.
    video_files: Dict[str, List[Path]] = {}
    for chunk in chunks:
        chunk_videos = videos_dir / chunk
        if chunk_videos.exists():
            key_dirs = sorted(entry for entry in chunk_videos.iterdir() if entry.is_dir())
            for key_dir in key_dirs:
                video_files.setdefault(key_dir.name, []).extend(key_dir.glob("episode_*.mp4"))

    video_keys = sorted(video_files)
    for key in video_keys:
        video_files[key].sort()

    # Regular files directly under meta/, keyed by file name.
    meta_files = {}
    if meta_dir.exists():
        meta_files = {entry.name: entry for entry in sorted(meta_dir.iterdir()) if entry.is_file()}

    return DatasetLayout(
        root=root,
        meta_dir=meta_dir,
        data_dir=data_dir,
        videos_dir=videos_dir,
        chunks=chunks,
        parquet_files=parquet_files,
        episode_indices=episode_indices,
        episode_to_parquet=episode_to_parquet,
        video_keys=video_keys,
        video_files=video_files,
        meta_files=meta_files,
    )



def pretty_layout_summary(layout: DatasetLayout, preview_n: int = 5) -> str:
    """Render a human-readable, multi-line overview of a dataset layout.

    Sections, in order: dataset root, chunks, episodes, video keys and
    meta files.

    Args:
        layout: Layout to describe; reads ``root``, ``chunks``,
            ``episode_indices``, ``video_keys`` and ``meta_files``.
        preview_n: Maximum number of episode indices / video keys shown in
            each preview; " ..." is appended when more exist.

    Returns:
        The summary as a single newline-joined string.
    """

    def preview_suffix(values) -> str:
        # Signal that the preview list was truncated.
        return "" if len(values) <= preview_n else " ..."

    lines = []

    # NOTE: the section icons and bullets are real Unicode characters
    # (previously stored as mis-decoded UTF-8 byte sequences).
    lines.append("📦 DATASET ROOT")
    lines.append(f"  • Path: {layout.root}")

    lines.append("\n🧩 CHUNKS (data/video shards)")
    if layout.chunks:
        lines.append(f"  • Found {len(layout.chunks)} chunks:")
        lines.extend(f"    - {ch}" for ch in layout.chunks)
    else:
        lines.append("  ⚠️ No chunks found")

    lines.append("\n🎞️ EPISODES (from parquet files)")
    lines.append(f"  • Total episodes discovered: {len(layout.episode_indices)}")
    if layout.episode_indices:
        preview = layout.episode_indices[:preview_n]
        lines.append(f"  • Episode index preview: {preview}"
                     + preview_suffix(layout.episode_indices))

    lines.append("\n📹 VIDEO KEYS (camera / image modalities)")
    lines.append(f"  • Total video keys (cameras): {len(layout.video_keys)}")
    if layout.video_keys:
        preview = layout.video_keys[:preview_n]
        lines.append(f"  • Video key preview: {preview}"
                     + preview_suffix(layout.video_keys))

    lines.append("\n📁 META FILES")
    if layout.meta_files:
        lines.append(f"  • Meta files found: {list(layout.meta_files.keys())}")
    else:
        lines.append("  ⚠️ No meta files found")

    return "\n".join(lines)


# Scan the dataset on disk once and print the human-readable overview;
# `layout` is reused by the later cells.
layout = discover_layout(DATA_ROOT, CHUNK_GLOB)
print(pretty_layout_summary(layout))


📦 DATASET ROOT
  • Path: /Users/nanzhu/code/Isaac-GR00T/demo_data/cube_to_bowl_5

🧩 CHUNKS (data/video shards)
  • Found 1 chunks:
    - chunk-000

🎞️ EPISODES (from parquet files)
  • Total episodes discovered: 5
  • Episode index preview: [0, 1, 2, 3, 4]

📹 VIDEO KEYS (camera / image modalities)
  • Total video keys (cameras): 2
  • Video key preview: ['observation.images.front', 'observation.images.wrist']

📁 META FILES
  • Meta files found: ['episodes.jsonl', 'info.json', 'modality.json', 'relative_stats.json', 'stats.json', 'tasks.jsonl']


## Feature Summary

In [7]:
import json

def read_json(path: Path) -> dict:
    """Parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so non-ASCII content loads the same on every system.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value (callers in this notebook expect an object).
    """
    return json.loads(path.read_text(encoding="utf-8"))

# Load meta/info.json when the layout scan found it; `info` stays None otherwise.
info_path = layout.meta_files.get("info.json")
info = read_json(info_path) if info_path and info_path.exists() else None

# Facts derived purely from the directory scan.
summary = {
    "root": str(layout.root),
    "chunks": layout.chunks,
    "num_parquet_files": len(layout.parquet_files),
    "num_episodes_found": len(layout.episode_indices),
    "video_keys": layout.video_keys,
    "num_video_keys": len(layout.video_keys),
    "meta_files": sorted(layout.meta_files),
}

# Enrich with what info.json declares; "(meta)" marks values copied from it.
if info is not None:
    copied_fields = (
        ("codebase_version", "codebase_version"),
        ("robot_type", "robot_type"),
        ("total_episodes(meta)", "total_episodes"),
        ("total_frames(meta)", "total_frames"),
        ("fps(meta)", "fps"),
        ("total_videos(meta)", "total_videos"),
        ("data_path_template", "data_path"),
        ("video_path_template", "video_path"),
    )
    for summary_key, info_key in copied_fields:
        summary[summary_key] = info.get(info_key)

    declared_features = info.get("features", {})
    summary["num_features(meta)"] = len(declared_features)
    # Only the first 20 feature names (alphabetical) are listed.
    summary["feature_keys(meta)"] = sorted(declared_features.keys())[:20]

summary


{'root': '/Users/nanzhu/code/Isaac-GR00T/demo_data/cube_to_bowl_5',
 'chunks': ['chunk-000'],
 'num_parquet_files': 5,
 'num_episodes_found': 5,
 'video_keys': ['observation.images.front', 'observation.images.wrist'],
 'num_video_keys': 2,
 'meta_files': ['episodes.jsonl',
  'info.json',
  'modality.json',
  'relative_stats.json',
  'stats.json',
  'tasks.jsonl'],
 'codebase_version': 'v2.1',
 'robot_type': 'so101_follower',
 'total_episodes(meta)': 5,
 'total_frames(meta)': 4148,
 'fps(meta)': 30,
 'total_videos(meta)': 10,
 'data_path_template': 'data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet',
 'video_path_template': 'videos/chunk-{episode_chunk:03d}/{video_key}/episode_{episode_index:06d}.mp4',
 'num_features(meta)': 9,
 'feature_keys(meta)': ['action',
  'episode_index',
  'frame_index',
  'index',
  'observation.images.front',
  'observation.images.wrist',
  'observation.state',
  'task_index',
  'timestamp']}

## Sanity Check

In [9]:
import pandas as pd
from collections import defaultdict

def episode_coverage_report(layout: DatasetLayout) -> pd.DataFrame:
    """Build a per-key episode coverage table.

    The expected episode set is ``layout.episode_indices``. One row is
    produced for the parquet data and one per video key, with columns:

      key_type | key_name | episodes_present | episodes_missing |
      coverage_ratio | missing_examples

    ``key_type`` is ``"parquet"`` or ``"video"``. ``episodes_present`` is the
    number of distinct episode indices found for the key,
    ``episodes_missing`` the number of expected episodes the key lacks, and
    ``missing_examples`` the first 10 missing indices.

    ``coverage_ratio`` is the fraction of *expected* episodes the key
    provides, so it always lies in [0, 1]. Episodes that exist only for the
    key (and not in the expected set) no longer inflate it.

    Returns:
        DataFrame sorted by key_type, coverage_ratio, key_name (ascending).
    """
    expected = set(layout.episode_indices)
    # max(1, ...) avoids dividing by zero when no episodes were discovered.
    denominator = max(1, len(expected))

    def coverage_row(key_type: str, key_name: str, present: set) -> dict:
        missing = sorted(expected - present)
        return {
            "key_type": key_type,
            "key_name": key_name,
            "episodes_present": len(present),
            "episodes_missing": len(missing),
            # Count only expected episodes so stray files cannot push the
            # ratio above 1.0 or mask missing episodes.
            "coverage_ratio": len(present & expected) / denominator,
            "missing_examples": missing[:10],
        }

    # Parquet coverage (baseline)
    rows = [coverage_row("parquet", "data", set(layout.episode_to_parquet))]

    # Video coverage per key
    for vk in layout.video_keys:
        parsed = (_parse_episode_index(f) for f in layout.video_files.get(vk, []))
        rows.append(coverage_row("video", vk, {ei for ei in parsed if ei is not None}))

    return pd.DataFrame(rows).sort_values(
        ["key_type", "coverage_ratio", "key_name"], ascending=[True, True, True]
    )

def validate_layout(layout: DatasetLayout, info: dict | None = None) -> dict:
    """Check a discovered layout and assemble a validation report.

    Args:
        layout: Layout to validate.
        info: Parsed ``info.json`` contents, if available; enables the
            comparison of declared vs. discovered counts.

    Returns:
        A dict with three entries:
          - "summary": high-level counts
          - "coverage": per-key episode coverage table
          - "meta_counts_check": declared-vs-found comparison, or ``None``
            when ``info`` was not given

    Raises:
        FileNotFoundError: If the root, data or videos directory is missing.
        RuntimeError: If no episodes were discovered.
    """
    # Hard errors, checked in order: root, data dir, videos dir, episodes.
    if not layout.root.exists():
        raise FileNotFoundError(f"DATA_ROOT not found: {layout.root}")
    if not layout.data_dir.exists():
        raise FileNotFoundError(f"Missing data dir: {layout.data_dir}")
    if not layout.videos_dir.exists():
        raise FileNotFoundError(f"Missing videos dir: {layout.videos_dir}")
    episodes_found = len(layout.episode_indices)
    if episodes_found == 0:
        raise RuntimeError("No episodes found (no episode_*.parquet under data/).")

    # Coverage table (the main "sanity check")
    coverage = episode_coverage_report(layout)

    # Total number of mp4 files across all video keys.
    videos_found_total = sum(len(paths) for paths in layout.video_files.values())

    # High-level summary
    summary = {
        "root": str(layout.root),
        "chunks": layout.chunks,
        "episodes_found": episodes_found,
        "parquet_files_found": len(layout.parquet_files),
        "video_keys_found": len(layout.video_keys),
        "videos_found_total": videos_found_total,
        "meta_files_found": sorted(layout.meta_files),
    }

    # Optional: compare against info.json
    if info is None:
        meta_counts_check = None
    else:
        declared_episodes = info.get("total_episodes")
        declared_videos = info.get("total_videos")

        # A match is only reported when info.json declares an integer count.
        episodes_match = None
        if isinstance(declared_episodes, int):
            episodes_match = declared_episodes == episodes_found
        videos_match = None
        if isinstance(declared_videos, int):
            videos_match = declared_videos == videos_found_total

        meta_counts_check = {
            "info.total_episodes": declared_episodes,
            "info.total_videos": declared_videos,
            "episodes_found": episodes_found,
            "videos_found_total": videos_found_total,
            "match_total_episodes": episodes_match,
            "match_total_videos": videos_match,
        }

    return {
        "summary": summary,
        "coverage": coverage,
        "meta_counts_check": meta_counts_check,
    }

# Validate the layout (raises on hard errors) and print the report.
# The check mark and bullets are real Unicode characters; they were
# previously stored as mis-decoded UTF-8 byte sequences.
report = validate_layout(layout, info)
print("✅ Layout validated.\n")
print("SUMMARY:")
for k, v in report["summary"].items():
    print(f"  • {k}: {v}")

print("\nCOVERAGE (per key):")
# Re-sort by key for display; the report itself is ordered by coverage first.
display(report["coverage"].sort_values(["key_type","key_name"]))

# Only present when info.json was available.
if report["meta_counts_check"] is not None:
    print("\nINFO.JSON COUNT CHECK:")
    for k, v in report["meta_counts_check"].items():
        print(f"  • {k}: {v}")


✅ Layout validated.

SUMMARY:
  • root: /Users/nanzhu/code/Isaac-GR00T/demo_data/cube_to_bowl_5
  • chunks: ['chunk-000']
  • episodes_found: 5
  • parquet_files_found: 5
  • video_keys_found: 2
  • videos_found_total: 10
  • meta_files_found: ['episodes.jsonl', 'info.json', 'modality.json', 'relative_stats.json', 'stats.json', 'tasks.jsonl']

COVERAGE (per key):


Unnamed: 0,key_type,key_name,episodes_present,episodes_missing,coverage_ratio,missing_examples
0,parquet,data,5,0,1.0,[]
1,video,observation.images.front,5,0,1.0,[]
2,video,observation.images.wrist,5,0,1.0,[]



INFO.JSON COUNT CHECK:
  • info.total_episodes: 5
  • info.total_videos: 10
  • episodes_found: 5
  • videos_found_total: 10
  • match_total_episodes: True
  • match_total_videos: True
