diff --git a/.gitignore b/.gitignore index c16502e..e951f71 100644 --- a/.gitignore +++ b/.gitignore @@ -215,6 +215,8 @@ temp/* # Data and asset cache live in avdp-synth-corpus, not here assets/ data/ +# But include bundled package data +!synthbanshee/data/ # Splendor — durable: wiki/, planning/, splendor.yaml, state/manifests/ # Transient operational state (not tracked) diff --git a/AGENTS.md b/AGENTS.md index a3d36cf..bf54d8b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,6 +7,7 @@ ``` synthbanshee/ ← main package config/ ← Pydantic models (scene, speaker, acoustic, run) + data/ ← bundled data files (taxonomy.yaml) script/ ← LLM script generation + Jinja2 templates tts/ ← TTS rendering (Azure/Google) + SceneMixer augment/ ← preprocessing, room IR, device profiles, noise @@ -56,7 +57,7 @@ SFX onset/offset times come **only** from the Stage 3b augmentation log. Speech- ## Label taxonomy -- **Source of truth: `configs/taxonomy.yaml`** — never hardcode label strings in application code +- **Source of truth: `synthbanshee/data/taxonomy.yaml`** — never hardcode label strings in application code - Three levels: `violence_typology` → `tier1_category` → `tier2_subtype` - `has_violence` is a **derived convenience field**; the full taxonomy columns are ground truth - `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps `(typology, intensity)` → `(tier1, tier2)`. If you add a typology to the taxonomy, add a row to the map — missing entries fall through to `("NONE", "NONE_AMBIENT")` silently diff --git a/README.md b/README.md index ef915e6..d9c6170 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Labels use a three-level hierarchy. `has_violence` (in clip metadata and manifes 2. **Tier 1 category** (event): `PHYS` · `VERB` · `DIST` · `ACOU` · `EMOT` · `NONE` 3. **Tier 2 subtype** (event): e.g. `PHYS_HARD` · `VERB_THREAT` · `DIST_SCREAM` -Full taxonomy: `configs/taxonomy.yaml`. +Full taxonomy: `synthbanshee/data/taxonomy.yaml`. --- diff --git a/docs/audio_generation_v3_design.md b/docs/audio_generation_v3_design.md index b3f6b83..1edeb94 100644 --- a/docs/audio_generation_v3_design.md +++ b/docs/audio_generation_v3_design.md @@ -99,7 +99,7 @@ The scripts, label taxonomy, pipeline stage decomposition, and cache system are ## 3. What Does NOT Need to Change - **Script generation** (LLM + Jinja2 templates) — produces structurally correct dialogue; the problem is in rendering, not generation -- **Label taxonomy** (`configs/taxonomy.yaml`) — hierarchical PHYS/VERB/DIST/EMOT/ACOU/NONE is appropriate +- **Label taxonomy** (`synthbanshee/data/taxonomy.yaml`) — hierarchical PHYS/VERB/DIST/EMOT/ACOU/NONE is appropriate - **Pipeline stage decomposition** — the five stages are correctly separated - **TTS cache** (SHA-256 of full SSML string) — correct; will naturally miss on any SSML change - **Acoustic augmentation** (Tier B) — still needed and architecturally correct; runs after V3 audio assembly @@ -386,7 +386,7 @@ This is not just a data cleanliness issue — an emotion-conditioned model that 1. **Remove the silent downgrade.** `_normalize_emotion()` should log a warning and either (a) fail loudly for known invalid emotional states, or (b) map only to the nearest valid state with an explicit mapping table, not to `neutral` as a catch-all. -2. **Extend `configs/taxonomy.yaml` `emotional_states` list** to cover all emotional states that the LLM plausibly generates for each intensity level. Run the debug_run_1 script through the pipeline, collect all unique emotional states, and add any missing ones to the taxonomy before removing the fallback. +2. **Extend `synthbanshee/data/taxonomy.yaml` `emotional_states` list** to cover all emotional states that the LLM plausibly generates for each intensity level. Run the debug_run_1 script through the pipeline, collect all unique emotional states, and add any missing ones to the taxonomy before removing the fallback. 3. **Add a QA check:** `qa.py` should flag any clip where the number of turns with emotional_state = `neutral` in the JSONL exceeds the number of turns with intensity ≤ 2 in the script. If I4 and I5 turns are being labeled as `neutral`, the normalization fallback has been triggered. diff --git a/docs/implementation_plan.md b/docs/implementation_plan.md index 363a40e..075df04 100644 --- a/docs/implementation_plan.md +++ b/docs/implementation_plan.md @@ -65,7 +65,6 @@ SynthBanshee/ │ ├── speakers/ # Speaker persona YAMLs │ ├── acoustic_scenes/ # Room/device YAML configs │ ├── run_configs/ # Full generation run definitions (RunConfig YAML) -│ ├── taxonomy.yaml # Label taxonomy — single source of truth │ └── examples/ # Worked examples for each config type ├── assets/ # Source assets (gitignored; populated at runtime) │ ├── speech/ # TTS utterance cache (SHA-256 keyed) @@ -239,7 +238,7 @@ Each template defines: **Acceptance criteria:** 50 scripts per project generated; zero transcript redundancy violations; 10% manual spot-check passes plausibility review. -**✓ Complete** — `ScriptGenerator` in `synthbanshee/script/generator.py`; SHA-256 generation cache in `assets/scripts/`; output validated as `list[DialogueTurn]`; wired end-to-end into `_run_generate_pipeline()` in `cli.py`. `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps (typology, intensity) → taxonomy codes, validated against `configs/taxonomy.yaml` at import time. +**✓ Complete** — `ScriptGenerator` in `synthbanshee/script/generator.py`; SHA-256 generation cache in `assets/scripts/`; output validated as `list[DialogueTurn]`; wired end-to-end into `_run_generate_pipeline()` in `cli.py`. `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps (typology, intensity) → taxonomy codes, validated against `synthbanshee/data/taxonomy.yaml` at import time. ### 1.4 Tier A Dataset Generation Run diff --git a/llms.txt b/llms.txt index 590dd94..3eee27d 100644 --- a/llms.txt +++ b/llms.txt @@ -54,7 +54,7 @@ Synthetic Hebrew audio dataset generator for two DataHack AI safety projects: - synthbanshee/config/taxonomy.py — taxonomy loader helpers ## Config files (YAML) -- configs/taxonomy.yaml — label taxonomy (source of truth for all label strings) +- synthbanshee/data/taxonomy.yaml — label taxonomy (source of truth for all label strings) - configs/examples/scene_*.yaml — worked scene examples - configs/examples/speaker_*.yaml — worked speaker persona examples (with rms_target_dbfs) - synthbanshee/config/acoustic_config.py — room + device validity/source of truth (currently hardcoded sets; configs/acoustic_scenes/ is user-created if needed) diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index 1022857..80cd3d0 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -45,7 +45,7 @@ class DiscoveredScene(NamedTuple): # Semantic mapping: (violence_typology) → ordered list of (max_intensity, tier1, tier2). # Each entry matches intensities ≤ max_intensity; the last entry is the catch-all. -# Validated against configs/taxonomy.yaml at import time (see _validate_event_type_codes). +# Validated against synthbanshee/data/taxonomy.yaml at import time (see _validate_event_type_codes). _TYPOLOGY_INTENSITY_MAP: dict[str, list[tuple[int, str, str]]] = { "NEU": [(5, "NONE", "NONE_AMBIENT")], "NEG": [(5, "NONE", "NONE_ARGU")], @@ -65,7 +65,7 @@ class DiscoveredScene(NamedTuple): def _validate_event_type_codes() -> None: - """Assert every code in _TYPOLOGY_INTENSITY_MAP is valid per configs/taxonomy.yaml.""" + """Assert every code in _TYPOLOGY_INTENSITY_MAP is valid per synthbanshee/data/taxonomy.yaml.""" from synthbanshee.config.taxonomy import tier1_category_codes, tier2_subtype_codes valid_tier1 = tier1_category_codes() @@ -187,7 +187,7 @@ def _normalize_emotion(state: str) -> tuple[str, bool]: return canonical, True raise ValueError( f"Unknown emotional_state {state!r}. " - "Add it to configs/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py." + "Map it in _EMOTION_ALIASES in cli.py, or add it to taxonomy.yaml and reinstall." ) diff --git a/synthbanshee/config/taxonomy.py b/synthbanshee/config/taxonomy.py index a8e2741..eb8f5ae 100644 --- a/synthbanshee/config/taxonomy.py +++ b/synthbanshee/config/taxonomy.py @@ -1,23 +1,23 @@ """Taxonomy loader — single source of truth for all AVDP label codes. -All label codes must be loaded from configs/taxonomy.yaml, never hardcoded. +All label codes must be loaded from synthbanshee/data/taxonomy.yaml, never hardcoded. """ from __future__ import annotations from functools import lru_cache -from pathlib import Path +from importlib import resources import yaml -_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "configs" / "taxonomy.yaml" +import synthbanshee.data as _data_pkg @lru_cache(maxsize=1) def load_taxonomy() -> dict: """Load and cache the taxonomy YAML. Call this instead of hardcoding codes.""" - with _TAXONOMY_PATH.open("r", encoding="utf-8") as fh: - return yaml.safe_load(fh) + ref = resources.files(_data_pkg).joinpath("taxonomy.yaml") + return yaml.safe_load(ref.read_text(encoding="utf-8")) def violence_typology_codes() -> frozenset[str]: diff --git a/synthbanshee/data/__init__.py b/synthbanshee/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/taxonomy.yaml b/synthbanshee/data/taxonomy.yaml similarity index 100% rename from configs/taxonomy.yaml rename to synthbanshee/data/taxonomy.yaml diff --git a/synthbanshee/labels/iaa.py b/synthbanshee/labels/iaa.py index f845d8a..b5cebbe 100644 --- a/synthbanshee/labels/iaa.py +++ b/synthbanshee/labels/iaa.py @@ -314,7 +314,7 @@ def summary(self) -> str: # Validate prefixes against taxonomy at import time so a rename in -# configs/taxonomy.yaml causes an immediate, loud failure here. +# synthbanshee/data/taxonomy.yaml causes an immediate, loud failure here. def _validate_category_prefixes() -> None: from synthbanshee.config.taxonomy import tier1_category_codes diff --git a/synthbanshee/package/dataset_card.py b/synthbanshee/package/dataset_card.py index 2c5e52a..3158ecb 100644 --- a/synthbanshee/package/dataset_card.py +++ b/synthbanshee/package/dataset_card.py @@ -139,7 +139,7 @@ def _pct(n: int) -> str: ### Label Taxonomy -Labels use a three-level hierarchy (defined in `configs/taxonomy.yaml`): +Labels use a three-level hierarchy (defined in `synthbanshee/data/taxonomy.yaml`): - **Violence typology** (scene-level): `SV`, `IT`, `NEG`, `NEU` - **Tier 1 category** (event-level): `PHYS`, `VERB`, `DIST`, `ACOU`, `EMOT`, `NONE`