# Simple Dataset Demo (v4.x)

This notebook loads the *simple* dataset, whose audio is embedded directly in the parquet files, using the parquet auto-discovery of HuggingFace `datasets` v4.x.

*Generated by Claude*

In [None]:
# HuggingFace v4.x loading (parquet auto-discovery)
from datasets import Audio, load_dataset

# Open the "lt1m" config (utterances shorter than one minute) in streaming mode.
# The audio column is cast with decode=False so that no audio decoding happens
# here, which avoids the torchcodec requirement on some platforms.
ds = load_dataset("../datasets/simple", "lt1m", streaming=True).cast_column(
    "audio", Audio(decode=False)
)

# Pull only the first record of the "train" split.
sample = next(iter(ds["train"]))

# Transcript preview (first 100 characters) followed by the optional metadata
# fields, which print as 'N/A' when the record does not carry them.
print(
    f"Sentence: {sample['sentence'][:100]}...",
    f"Speaker: {sample.get('speaker', 'N/A')}",
    f"Duration: {sample.get('duration', 'N/A')}s",
    sep="\n",
)

In [None]:
import io

from IPython.display import Audio as IPAudio

# The audio column was cast with decode=False, so the sample carries the raw
# encoded file bytes instead of a decoded waveform.
audio_bytes = sample["audio"]["bytes"]
if audio_bytes is None:
    # io.BytesIO(None) would silently yield an empty buffer and a confusing
    # decoder error further down, so fail with a clear message instead.
    raise ValueError("sample['audio']['bytes'] is None; expected embedded audio bytes")

# Prefer soundfile; if it is not installed, decode with PyAV instead.
# Both paths end with `audio_array` either 1-D (mono) or shaped
# (channels, samples), which is the layout IPython's Audio widget expects.
try:
    import soundfile as sf

    audio_array, sr = sf.read(io.BytesIO(audio_bytes))
    if audio_array.ndim == 2:
        # sf.read returns (frames, channels) for multi-channel files.
        audio_array = audio_array.T
except ImportError:
    import av
    import numpy as np

    # The context manager closes the container even when decoding raises.
    with av.open(io.BytesIO(audio_bytes)) as container:
        sr = container.streams.audio[0].rate
        decoded = list(container.decode(audio=0))
        frames = [f.to_ndarray() for f in decoded]
    n_samples = sum(f.samples for f in decoded)

    # to_ndarray() yields (channels, samples) for planar sample formats and
    # (1, samples * channels) with interleaved samples for packed formats.
    stacked = np.concatenate(frames, axis=1)
    if stacked.shape[0] == 1 and n_samples:
        # De-interleave packed data; for mono this is a no-op reshape.
        stacked = stacked.reshape(n_samples, -1).T
    # Collapse single-channel audio to 1-D, as the original flatten() did.
    audio_array = stacked[0] if stacked.shape[0] == 1 else stacked

# The last axis is the per-channel sample count for both 1-D and 2-D arrays.
print(f"Audio: {audio_array.shape[-1]} samples @ {sr} Hz")
IPAudio(data=audio_array, rate=sr)

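## Available Splits

Each name below is passed as the second argument to `load_dataset`; the examples in this notebook then read the `train` split of the result.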

- `lt1m`: Utterances < 1 minute (most common)
- `lt5m`: Utterances 1-5 minutes
- `lt30m`: Utterances 5-30 minutes

In [None]:
# Same loading recipe as for "lt1m", pointed at the "lt5m" config instead.
# decode=False again keeps the audio column undecoded.
ds_5m = load_dataset("../datasets/simple", "lt5m", streaming=True).cast_column(
    "audio", Audio(decode=False)
)

# First record of the "train" split; only its duration is reported
# ('N/A' when the record has no 'duration' field).
sample_5m = next(iter(ds_5m["train"]))
print(f"lt5m sample duration: {sample_5m.get('duration', 'N/A')}s")

## Alternative: Native Python Loader

In [None]:
from oyez_sa_asr.loaders import load_simple_hf

# Load the "lt1m" data through the project's load_simple_hf wrapper, again in
# streaming mode.
# NOTE: this rebinds `ds` and `sample`, replacing the objects created by the
# first load_dataset cell; re-running earlier cells afterwards will see these
# values until the first cell is run again.
ds = load_simple_hf("lt1m", streaming=True)
# Unlike the load_dataset result, the wrapper's return value is iterated
# directly here (no ["train"] lookup).
sample = next(iter(ds))
# Preview the first 50 characters of the transcript.
print(f"Loaded via wrapper: {sample['sentence'][:50]}...")