From fc89be79ef1581ed4130768dfaa80c4c266a044c Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sat, 2 May 2026 13:47:54 +0300 Subject: [PATCH 1/3] chore: bundle taxonomy.yaml and templates in wheel (#4) Move configs/taxonomy.yaml into synthbanshee/data/ and load it via importlib.resources so the CLI works when installed from a wheel. Add Hatch build artifacts config for *.yaml, *.j2, and *.txt files inside the package tree. Closes #4 Co-Authored-By: Claude Opus 4.6 --- .gitignore | 2 ++ AGENTS.md | 3 ++- README.md | 2 +- pyproject.toml | 7 +++++++ synthbanshee/cli.py | 6 +++--- synthbanshee/config/taxonomy.py | 10 ++++------ synthbanshee/data/__init__.py | 0 {configs => synthbanshee/data}/taxonomy.yaml | 0 synthbanshee/labels/iaa.py | 2 +- synthbanshee/package/dataset_card.py | 2 +- 10 files changed, 21 insertions(+), 13 deletions(-) create mode 100644 synthbanshee/data/__init__.py rename {configs => synthbanshee/data}/taxonomy.yaml (100%) diff --git a/.gitignore b/.gitignore index c16502e..e951f71 100644 --- a/.gitignore +++ b/.gitignore @@ -215,6 +215,8 @@ temp/* # Data and asset cache live in avdp-synth-corpus, not here assets/ data/ +# But include bundled package data +!synthbanshee/data/ # Splendor — durable: wiki/, planning/, splendor.yaml, state/manifests/ # Transient operational state (not tracked) diff --git a/AGENTS.md b/AGENTS.md index a3d36cf..bf54d8b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,6 +7,7 @@ ``` synthbanshee/ ← main package config/ ← Pydantic models (scene, speaker, acoustic, run) + data/ ← bundled data files (taxonomy.yaml) script/ ← LLM script generation + Jinja2 templates tts/ ← TTS rendering (Azure/Google) + SceneMixer augment/ ← preprocessing, room IR, device profiles, noise @@ -56,7 +57,7 @@ SFX onset/offset times come **only** from the Stage 3b augmentation log. Speech- ## Label taxonomy -- **Source of truth: `configs/taxonomy.yaml`** — never hardcode label strings in application code +- **Source of truth: `synthbanshee/data/taxonomy.yaml`** — never hardcode label strings in application code - Three levels: `violence_typology` → `tier1_category` → `tier2_subtype` - `has_violence` is a **derived convenience field**; the full taxonomy columns are ground truth - `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps `(typology, intensity)` → `(tier1, tier2)`. If you add a typology to the taxonomy, add a row to the map — missing entries fall through to `("NONE", "NONE_AMBIENT")` silently diff --git a/README.md b/README.md index ef915e6..d9c6170 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Labels use a three-level hierarchy. `has_violence` (in clip metadata and manifes 2. **Tier 1 category** (event): `PHYS` · `VERB` · `DIST` · `ACOU` · `EMOT` · `NONE` 3. **Tier 2 subtype** (event): e.g. `PHYS_HARD` · `VERB_THREAT` · `DIST_SCREAM` -Full taxonomy: `configs/taxonomy.yaml`. +Full taxonomy: `synthbanshee/data/taxonomy.yaml`. --- diff --git a/pyproject.toml b/pyproject.toml index c914b49..c16c845 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,6 +72,13 @@ dev = [ [project.scripts] synthbanshee = "synthbanshee.cli:cli" +[tool.hatch.build] +artifacts = [ + "synthbanshee/**/*.yaml", + "synthbanshee/**/*.j2", + "synthbanshee/**/*.txt", +] + [tool.hatch.build.targets.wheel] packages = ["synthbanshee"] diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index 1022857..4ff77f9 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -45,7 +45,7 @@ class DiscoveredScene(NamedTuple): # Semantic mapping: (violence_typology) → ordered list of (max_intensity, tier1, tier2). # Each entry matches intensities ≤ max_intensity; the last entry is the catch-all. -# Validated against configs/taxonomy.yaml at import time (see _validate_event_type_codes). +# Validated against synthbanshee/data/taxonomy.yaml at import time (see _validate_event_type_codes). _TYPOLOGY_INTENSITY_MAP: dict[str, list[tuple[int, str, str]]] = { "NEU": [(5, "NONE", "NONE_AMBIENT")], "NEG": [(5, "NONE", "NONE_ARGU")], @@ -65,7 +65,7 @@ class DiscoveredScene(NamedTuple): def _validate_event_type_codes() -> None: - """Assert every code in _TYPOLOGY_INTENSITY_MAP is valid per configs/taxonomy.yaml.""" + """Assert every code in _TYPOLOGY_INTENSITY_MAP is valid per synthbanshee/data/taxonomy.yaml.""" from synthbanshee.config.taxonomy import tier1_category_codes, tier2_subtype_codes valid_tier1 = tier1_category_codes() @@ -187,7 +187,7 @@ def _normalize_emotion(state: str) -> tuple[str, bool]: return canonical, True raise ValueError( f"Unknown emotional_state {state!r}. " - "Add it to configs/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py." + "Add it to synthbanshee/data/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py." ) diff --git a/synthbanshee/config/taxonomy.py b/synthbanshee/config/taxonomy.py index a8e2741..e487998 100644 --- a/synthbanshee/config/taxonomy.py +++ b/synthbanshee/config/taxonomy.py @@ -1,23 +1,21 @@ """Taxonomy loader — single source of truth for all AVDP label codes. -All label codes must be loaded from configs/taxonomy.yaml, never hardcoded. +All label codes must be loaded from synthbanshee/data/taxonomy.yaml, never hardcoded. """ from __future__ import annotations from functools import lru_cache -from pathlib import Path +from importlib import resources import yaml -_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "configs" / "taxonomy.yaml" - @lru_cache(maxsize=1) def load_taxonomy() -> dict: """Load and cache the taxonomy YAML. Call this instead of hardcoding codes.""" - with _TAXONOMY_PATH.open("r", encoding="utf-8") as fh: - return yaml.safe_load(fh) + ref = resources.files("synthbanshee.data").joinpath("taxonomy.yaml") + return yaml.safe_load(ref.read_text(encoding="utf-8")) def violence_typology_codes() -> frozenset[str]: diff --git a/synthbanshee/data/__init__.py b/synthbanshee/data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/taxonomy.yaml b/synthbanshee/data/taxonomy.yaml similarity index 100% rename from configs/taxonomy.yaml rename to synthbanshee/data/taxonomy.yaml diff --git a/synthbanshee/labels/iaa.py b/synthbanshee/labels/iaa.py index f845d8a..b5cebbe 100644 --- a/synthbanshee/labels/iaa.py +++ b/synthbanshee/labels/iaa.py @@ -314,7 +314,7 @@ def summary(self) -> str: # Validate prefixes against taxonomy at import time so a rename in -# configs/taxonomy.yaml causes an immediate, loud failure here. +# synthbanshee/data/taxonomy.yaml causes an immediate, loud failure here. def _validate_category_prefixes() -> None: from synthbanshee.config.taxonomy import tier1_category_codes diff --git a/synthbanshee/package/dataset_card.py b/synthbanshee/package/dataset_card.py index 2c5e52a..3158ecb 100644 --- a/synthbanshee/package/dataset_card.py +++ b/synthbanshee/package/dataset_card.py @@ -139,7 +139,7 @@ def _pct(n: int) -> str: ### Label Taxonomy -Labels use a three-level hierarchy (defined in `configs/taxonomy.yaml`): +Labels use a three-level hierarchy (defined in `synthbanshee/data/taxonomy.yaml`): - **Violence typology** (scene-level): `SV`, `IT`, `NEG`, `NEU` - **Tier 1 category** (event-level): `PHYS`, `VERB`, `DIST`, `ACOU`, `EMOT`, `NONE` From 4541f9076693f3dcb8b471d2e98a0ce0f69299cc Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sat, 2 May 2026 14:28:37 +0300 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20address=20Copilot=20review=20?= =?UTF-8?q?=E2=80=94=20use=20module=20import=20for=20importlib.resources,?= =?UTF-8?q?=20fix=20error=20message?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Pass `synthbanshee.data` module object to `resources.files()` instead of a hardcoded string (COPILOT-2). - Rephrase error message to direct users to `_EMOTION_ALIASES` or reinstall, since bundled YAML isn't user-editable in site-packages (COPILOT-1). Co-Authored-By: Claude Opus 4.6 --- synthbanshee/cli.py | 2 +- synthbanshee/config/taxonomy.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py index 4ff77f9..80cd3d0 100644 --- a/synthbanshee/cli.py +++ b/synthbanshee/cli.py @@ -187,7 +187,7 @@ def _normalize_emotion(state: str) -> tuple[str, bool]: return canonical, True raise ValueError( f"Unknown emotional_state {state!r}. " - "Add it to synthbanshee/data/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py." + "Map it in _EMOTION_ALIASES in cli.py, or add it to taxonomy.yaml and reinstall." ) diff --git a/synthbanshee/config/taxonomy.py b/synthbanshee/config/taxonomy.py index e487998..eb8f5ae 100644 --- a/synthbanshee/config/taxonomy.py +++ b/synthbanshee/config/taxonomy.py @@ -10,11 +10,13 @@ import yaml +import synthbanshee.data as _data_pkg + @lru_cache(maxsize=1) def load_taxonomy() -> dict: """Load and cache the taxonomy YAML. Call this instead of hardcoding codes.""" - ref = resources.files("synthbanshee.data").joinpath("taxonomy.yaml") + ref = resources.files(_data_pkg).joinpath("taxonomy.yaml") return yaml.safe_load(ref.read_text(encoding="utf-8")) From 8a22f7640e5f8baade5b13d5819e002566aebd3c Mon Sep 17 00:00:00 2001 From: Shay Palachy Date: Sat, 2 May 2026 14:47:06 +0300 Subject: [PATCH 3/3] fix: remove unnecessary artifacts config, update stale doc references MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove `[tool.hatch.build] artifacts` — Hatch already includes non-py files inside declared packages by default (verified by inspecting built wheel). - Update remaining `configs/taxonomy.yaml` references in docs/implementation_plan.md, docs/audio_generation_v3_design.md, and llms.txt. Co-Authored-By: Claude Opus 4.6 --- docs/audio_generation_v3_design.md | 4 ++-- docs/implementation_plan.md | 3 +-- llms.txt | 2 +- pyproject.toml | 7 ------- 4 files changed, 4 insertions(+), 12 deletions(-) diff --git a/docs/audio_generation_v3_design.md b/docs/audio_generation_v3_design.md index b3f6b83..1edeb94 100644 --- a/docs/audio_generation_v3_design.md +++ b/docs/audio_generation_v3_design.md @@ -99,7 +99,7 @@ The scripts, label taxonomy, pipeline stage decomposition, and cache system are ## 3. What Does NOT Need to Change - **Script generation** (LLM + Jinja2 templates) — produces structurally correct dialogue; the problem is in rendering, not generation -- **Label taxonomy** (`configs/taxonomy.yaml`) — hierarchical PHYS/VERB/DIST/EMOT/ACOU/NONE is appropriate +- **Label taxonomy** (`synthbanshee/data/taxonomy.yaml`) — hierarchical PHYS/VERB/DIST/EMOT/ACOU/NONE is appropriate - **Pipeline stage decomposition** — the five stages are correctly separated - **TTS cache** (SHA-256 of full SSML string) — correct; will naturally miss on any SSML change - **Acoustic augmentation** (Tier B) — still needed and architecturally correct; runs after V3 audio assembly @@ -386,7 +386,7 @@ This is not just a data cleanliness issue — an emotion-conditioned model that 1. **Remove the silent downgrade.** `_normalize_emotion()` should log a warning and either (a) fail loudly for known invalid emotional states, or (b) map only to the nearest valid state with an explicit mapping table, not to `neutral` as a catch-all. -2. **Extend `configs/taxonomy.yaml` `emotional_states` list** to cover all emotional states that the LLM plausibly generates for each intensity level. Run the debug_run_1 script through the pipeline, collect all unique emotional states, and add any missing ones to the taxonomy before removing the fallback. +2. **Extend `synthbanshee/data/taxonomy.yaml` `emotional_states` list** to cover all emotional states that the LLM plausibly generates for each intensity level. Run the debug_run_1 script through the pipeline, collect all unique emotional states, and add any missing ones to the taxonomy before removing the fallback. 3. **Add a QA check:** `qa.py` should flag any clip where the number of turns with emotional_state = `neutral` in the JSONL exceeds the number of turns with intensity ≤ 2 in the script. If I4 and I5 turns are being labeled as `neutral`, the normalization fallback has been triggered. diff --git a/docs/implementation_plan.md b/docs/implementation_plan.md index 363a40e..075df04 100644 --- a/docs/implementation_plan.md +++ b/docs/implementation_plan.md @@ -65,7 +65,6 @@ SynthBanshee/ │ ├── speakers/ # Speaker persona YAMLs │ ├── acoustic_scenes/ # Room/device YAML configs │ ├── run_configs/ # Full generation run definitions (RunConfig YAML) -│ ├── taxonomy.yaml # Label taxonomy — single source of truth │ └── examples/ # Worked examples for each config type ├── assets/ # Source assets (gitignored; populated at runtime) │ ├── speech/ # TTS utterance cache (SHA-256 keyed) @@ -239,7 +238,7 @@ Each template defines: **Acceptance criteria:** 50 scripts per project generated; zero transcript redundancy violations; 10% manual spot-check passes plausibility review. -**✓ Complete** — `ScriptGenerator` in `synthbanshee/script/generator.py`; SHA-256 generation cache in `assets/scripts/`; output validated as `list[DialogueTurn]`; wired end-to-end into `_run_generate_pipeline()` in `cli.py`. `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps (typology, intensity) → taxonomy codes, validated against `configs/taxonomy.yaml` at import time. +**✓ Complete** — `ScriptGenerator` in `synthbanshee/script/generator.py`; SHA-256 generation cache in `assets/scripts/`; output validated as `list[DialogueTurn]`; wired end-to-end into `_run_generate_pipeline()` in `cli.py`. `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps (typology, intensity) → taxonomy codes, validated against `synthbanshee/data/taxonomy.yaml` at import time. ### 1.4 Tier A Dataset Generation Run diff --git a/llms.txt b/llms.txt index 590dd94..3eee27d 100644 --- a/llms.txt +++ b/llms.txt @@ -54,7 +54,7 @@ Synthetic Hebrew audio dataset generator for two DataHack AI safety projects: - synthbanshee/config/taxonomy.py — taxonomy loader helpers ## Config files (YAML) -- configs/taxonomy.yaml — label taxonomy (source of truth for all label strings) +- synthbanshee/data/taxonomy.yaml — label taxonomy (source of truth for all label strings) - configs/examples/scene_*.yaml — worked scene examples - configs/examples/speaker_*.yaml — worked speaker persona examples (with rms_target_dbfs) - synthbanshee/config/acoustic_config.py — room + device validity/source of truth (currently hardcoded sets; configs/acoustic_scenes/ is user-created if needed) diff --git a/pyproject.toml b/pyproject.toml index c16c845..c914b49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -72,13 +72,6 @@ dev = [ [project.scripts] synthbanshee = "synthbanshee.cli:cli" -[tool.hatch.build] -artifacts = [ - "synthbanshee/**/*.yaml", - "synthbanshee/**/*.j2", - "synthbanshee/**/*.txt", -] - [tool.hatch.build.targets.wheel] packages = ["synthbanshee"]