# Raw Dataset Demo (v4.x)

This notebook demonstrates the raw dataset: the original MP3/OGG audio plus JSON metadata. It is loaded with HuggingFace `datasets` v4.x via parquet auto-discovery.

*Generated by Claude*

In [None]:
# HuggingFace v4.x loading (parquet auto-discovery)
from datasets import load_dataset

# Open the raw dataset in streaming mode (parquet auto-discovery), so records
# are read lazily instead of being materialized up front.
ds = load_dataset("../datasets/raw", streaming=True)
print("Dataset loaded via HuggingFace auto-discovery")

# Take just the first record from the "train" split and show its key fields.
train_records = iter(ds["train"])
sample = next(train_records)
print("\nFirst recording:")
print(f"  ID: {sample['recording_id']}")
print(f"  Term: {sample['term']}")
print(f"  Docket: {sample['docket']}")
print(f"  Audio path: {sample['audio_path']}")

In [None]:
from pathlib import Path

from IPython.display import Audio, display

# Resolve the recording's audio file relative to the raw audio directory.
audio_file = Path("../datasets/raw/audio") / sample["audio_path"]
print(f"Playing: {audio_file.name}")

# Embed a player for the ENTIRE file: nothing here trims the audio to a fixed
# duration, so long raw recordings yield a correspondingly large cell output.
# `normalize` is not passed because IPython only applies it to raw sample
# arrays, never to audio supplied by filename.
display(Audio(filename=str(audio_file)))

## Alternative: Native Python Loader

For full metadata access (transcripts, case info), use the native loader:

In [None]:
import json

from oyez_sa_asr.loaders import load_raw

# Native loader provides full metadata access
raw = load_raw()

# Report the size before indexing, so an empty dataset is visible in the
# output ahead of the IndexError that raw[0] would raise.
print(f"Recordings: {len(raw):,}")
item = raw[0]

# Access linked transcript (skipped when the recording has no transcript path).
# JSON is UTF-8 encoded, so the encoding is passed explicitly rather than
# relying on the platform's locale default.
# NOTE(review): assumes the path values are pathlib.Path-like and accept
# `encoding=` in .open() — confirm against load_raw.
if item["transcript_path"]:
    with item["transcript_path"].open(encoding="utf-8") as f:
        transcript = json.load(f)
    print(f"\nTranscript: {transcript.get('title', 'N/A')}")

# Access linked case metadata (skipped when the recording has no case path).
if item["case_path"]:
    with item["case_path"].open(encoding="utf-8") as f:
        case = json.load(f)
    print(f"Case: {case.get('name', 'N/A')}")