# Flex Dataset Demo (v4.x)

FLAC audio files + parquet metadata. Uses HuggingFace `datasets` v4.x parquet auto-discovery.

*Generated by Claude*

In [None]:
# HuggingFace v4.x loading (parquet auto-discovery)
from datasets import load_dataset

# Stream the "recordings" config and inspect its first row. With
# streaming=True the rows are iterated lazily instead of being loaded up front.
ds_rec = load_dataset("../datasets/flex", "recordings", streaming=True)
rec = next(iter(ds_rec["train"]))
print(f"Recording: {rec['recording_id']}")
print(f"  Duration: {rec['duration_sec']:.1f}s")
print(f"  Sample rate: {rec['sample_rate']} Hz")

# Same for the "utterances" config.
ds_utt = load_dataset("../datasets/flex", "utterances", streaming=True)
utt = next(iter(ds_utt["train"]))

# Preview at most 80 characters of the transcript; add the ellipsis only when
# something was actually cut off, so short utterances are not shown as truncated.
utt_text = utt["text"]
utt_preview = utt_text if len(utt_text) <= 80 else f"{utt_text[:80]}..."

print("\nUtterance:")
print(f"  Speaker: {utt['speaker_name']}")
print(f"  Text: {utt_preview}")
print(f"  Duration: {utt['duration_sec']:.1f}s")

In [None]:
from pathlib import Path

from IPython.display import Audio

from oyez_sa_asr.loaders import extract_segment

# Cut the utterance's time span out of the recording's audio file and play it
# inline in the notebook.
# NOTE(review): `rec` and `utt` are each just the first row of their own split;
# this assumes that utterance belongs to that recording — confirm.
audio_path = Path("../datasets/flex/audio", rec["audio_path"])
start_sec, end_sec = utt["start_sec"], utt["end_sec"]
segment, sr = extract_segment(audio_path, start_sec, end_sec)

print(f"Playing segment: {start_sec:.1f}s - {end_sec:.1f}s")
# Keep the Audio widget as the cell's last expression so it is rendered.
Audio(data=segment, rate=sr)

## Alternative: Native Python Loader

To load all recordings and utterances into memory at once, use the native loader's `load_flex()` (segment extraction with `extract_segment()` is shown in the cell above):

In [None]:
from oyez_sa_asr.loaders import load_flex

# load_flex() returns the complete recordings and utterances collections,
# so their sizes can be reported directly.
recordings, utterances = load_flex()
print(f"Recordings: {len(recordings):,}")
print(f"Utterances: {len(utterances):,}")

# An utterance with no "valid" key is counted as valid.
valid_utterances = [u for u in utterances if u.get("valid", True)]
print(f"Valid utterances: {len(valid_utterances):,}")