From fc89be79ef1581ed4130768dfaa80c4c266a044c Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Sat, 2 May 2026 13:47:54 +0300
Subject: [PATCH 1/3] chore: bundle taxonomy.yaml and templates in wheel (#4)

Move configs/taxonomy.yaml into synthbanshee/data/ and load it via
importlib.resources so the CLI works when installed from a wheel.
Add Hatch build artifacts config for *.yaml, *.j2, and *.txt files
inside the package tree.

Closes #4

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .gitignore                                   |  2 ++
 AGENTS.md                                    |  3 ++-
 README.md                                    |  2 +-
 pyproject.toml                               |  7 +++++++
 synthbanshee/cli.py                          |  6 +++---
 synthbanshee/config/taxonomy.py              | 10 ++++------
 synthbanshee/data/__init__.py                |  0
 {configs => synthbanshee/data}/taxonomy.yaml |  0
 synthbanshee/labels/iaa.py                   |  2 +-
 synthbanshee/package/dataset_card.py         |  2 +-
 10 files changed, 21 insertions(+), 13 deletions(-)
 create mode 100644 synthbanshee/data/__init__.py
 rename {configs => synthbanshee/data}/taxonomy.yaml (100%)

diff --git a/.gitignore b/.gitignore
index c16502e..e951f71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -215,6 +215,8 @@ temp/*
 # Data and asset cache live in avdp-synth-corpus, not here
 assets/
 data/
+# But include bundled package data
+!synthbanshee/data/
 
 # Splendor — durable: wiki/, planning/, splendor.yaml, state/manifests/
 # Transient operational state (not tracked)
diff --git a/AGENTS.md b/AGENTS.md
index a3d36cf..bf54d8b 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -7,6 +7,7 @@
 ```
 synthbanshee/       ← main package
   config/           ← Pydantic models (scene, speaker, acoustic, run)
+  data/             ← bundled data files (taxonomy.yaml)
   script/           ← LLM script generation + Jinja2 templates
   tts/              ← TTS rendering (Azure/Google) + SceneMixer
   augment/          ← preprocessing, room IR, device profiles, noise
@@ -56,7 +57,7 @@ SFX onset/offset times come **only** from the Stage 3b augmentation log. Speech-
 
 ## Label taxonomy
 
-- **Source of truth: `configs/taxonomy.yaml`** — never hardcode label strings in application code
+- **Source of truth: `synthbanshee/data/taxonomy.yaml`** — never hardcode label strings in application code
 - Three levels: `violence_typology` → `tier1_category` → `tier2_subtype`
 - `has_violence` is a **derived convenience field**; the full taxonomy columns are ground truth
 - `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps `(typology, intensity)` → `(tier1, tier2)`. If you add a typology to the taxonomy, add a row to the map — missing entries fall through to `("NONE", "NONE_AMBIENT")` silently
diff --git a/README.md b/README.md
index ef915e6..d9c6170 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ Labels use a three-level hierarchy. `has_violence` (in clip metadata and manifes
 2. **Tier 1 category** (event): `PHYS` · `VERB` · `DIST` · `ACOU` · `EMOT` · `NONE`
 3. **Tier 2 subtype** (event): e.g. `PHYS_HARD` · `VERB_THREAT` · `DIST_SCREAM`
 
-Full taxonomy: `configs/taxonomy.yaml`.
+Full taxonomy: `synthbanshee/data/taxonomy.yaml`.
 
 ---
 
diff --git a/pyproject.toml b/pyproject.toml
index c914b49..c16c845 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,6 +72,13 @@ dev = [
 [project.scripts]
 synthbanshee = "synthbanshee.cli:cli"
 
+[tool.hatch.build]
+artifacts = [
+    "synthbanshee/**/*.yaml",
+    "synthbanshee/**/*.j2",
+    "synthbanshee/**/*.txt",
+]
+
 [tool.hatch.build.targets.wheel]
 packages = ["synthbanshee"]
 
diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py
index 1022857..4ff77f9 100644
--- a/synthbanshee/cli.py
+++ b/synthbanshee/cli.py
@@ -45,7 +45,7 @@ class DiscoveredScene(NamedTuple):
 
 # Semantic mapping: (violence_typology) → ordered list of (max_intensity, tier1, tier2).
 # Each entry matches intensities ≤ max_intensity; the last entry is the catch-all.
-# Validated against configs/taxonomy.yaml at import time (see _validate_event_type_codes).
+# Validated against synthbanshee/data/taxonomy.yaml at import time (see _validate_event_type_codes).
 _TYPOLOGY_INTENSITY_MAP: dict[str, list[tuple[int, str, str]]] = {
     "NEU": [(5, "NONE", "NONE_AMBIENT")],
     "NEG": [(5, "NONE", "NONE_ARGU")],
@@ -65,7 +65,7 @@ class DiscoveredScene(NamedTuple):
 
 
 def _validate_event_type_codes() -> None:
-    """Assert every code in _TYPOLOGY_INTENSITY_MAP is valid per configs/taxonomy.yaml."""
+    """Assert every code in _TYPOLOGY_INTENSITY_MAP is valid per synthbanshee/data/taxonomy.yaml."""
     from synthbanshee.config.taxonomy import tier1_category_codes, tier2_subtype_codes
 
     valid_tier1 = tier1_category_codes()
@@ -187,7 +187,7 @@ def _normalize_emotion(state: str) -> tuple[str, bool]:
         return canonical, True
     raise ValueError(
         f"Unknown emotional_state {state!r}. "
-        "Add it to configs/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py."
+        "Add it to synthbanshee/data/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py."
     )
 
 
diff --git a/synthbanshee/config/taxonomy.py b/synthbanshee/config/taxonomy.py
index a8e2741..e487998 100644
--- a/synthbanshee/config/taxonomy.py
+++ b/synthbanshee/config/taxonomy.py
@@ -1,23 +1,21 @@
 """Taxonomy loader — single source of truth for all AVDP label codes.
 
-All label codes must be loaded from configs/taxonomy.yaml, never hardcoded.
+All label codes must be loaded from synthbanshee/data/taxonomy.yaml, never hardcoded.
 """
 
 from __future__ import annotations
 
 from functools import lru_cache
-from pathlib import Path
+from importlib import resources
 
 import yaml
 
-_TAXONOMY_PATH = Path(__file__).parent.parent.parent / "configs" / "taxonomy.yaml"
-
 
 @lru_cache(maxsize=1)
 def load_taxonomy() -> dict:
     """Load and cache the taxonomy YAML. Call this instead of hardcoding codes."""
-    with _TAXONOMY_PATH.open("r", encoding="utf-8") as fh:
-        return yaml.safe_load(fh)
+    ref = resources.files("synthbanshee.data").joinpath("taxonomy.yaml")
+    return yaml.safe_load(ref.read_text(encoding="utf-8"))
 
 
 def violence_typology_codes() -> frozenset[str]:
diff --git a/synthbanshee/data/__init__.py b/synthbanshee/data/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/configs/taxonomy.yaml b/synthbanshee/data/taxonomy.yaml
similarity index 100%
rename from configs/taxonomy.yaml
rename to synthbanshee/data/taxonomy.yaml
diff --git a/synthbanshee/labels/iaa.py b/synthbanshee/labels/iaa.py
index f845d8a..b5cebbe 100644
--- a/synthbanshee/labels/iaa.py
+++ b/synthbanshee/labels/iaa.py
@@ -314,7 +314,7 @@ def summary(self) -> str:
 
 
 # Validate prefixes against taxonomy at import time so a rename in
-# configs/taxonomy.yaml causes an immediate, loud failure here.
+# synthbanshee/data/taxonomy.yaml causes an immediate, loud failure here.
 def _validate_category_prefixes() -> None:
     from synthbanshee.config.taxonomy import tier1_category_codes
 
diff --git a/synthbanshee/package/dataset_card.py b/synthbanshee/package/dataset_card.py
index 2c5e52a..3158ecb 100644
--- a/synthbanshee/package/dataset_card.py
+++ b/synthbanshee/package/dataset_card.py
@@ -139,7 +139,7 @@ def _pct(n: int) -> str:
 
 ### Label Taxonomy
 
-Labels use a three-level hierarchy (defined in `configs/taxonomy.yaml`):
+Labels use a three-level hierarchy (defined in `synthbanshee/data/taxonomy.yaml`):
 
 - **Violence typology** (scene-level): `SV`, `IT`, `NEG`, `NEU`
 - **Tier 1 category** (event-level): `PHYS`, `VERB`, `DIST`, `ACOU`, `EMOT`, `NONE`

From 4541f9076693f3dcb8b471d2e98a0ce0f69299cc Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Sat, 2 May 2026 14:28:37 +0300
Subject: [PATCH 2/3] =?UTF-8?q?fix:=20address=20Copilot=20review=20?=
 =?UTF-8?q?=E2=80=94=20use=20module=20import=20for=20importlib.resources,?=
 =?UTF-8?q?=20fix=20error=20message?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Pass `synthbanshee.data` module object to `resources.files()` instead
  of a hardcoded string (COPILOT-2).
- Rephrase the unknown-emotional-state error message: point users at
  `_EMOTION_ALIASES` first, or at adding the state to taxonomy.yaml and
  reinstalling, since the bundled YAML isn't user-editable in
  site-packages (COPILOT-1).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 synthbanshee/cli.py             | 2 +-
 synthbanshee/config/taxonomy.py | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/synthbanshee/cli.py b/synthbanshee/cli.py
index 4ff77f9..80cd3d0 100644
--- a/synthbanshee/cli.py
+++ b/synthbanshee/cli.py
@@ -187,7 +187,7 @@ def _normalize_emotion(state: str) -> tuple[str, bool]:
         return canonical, True
     raise ValueError(
         f"Unknown emotional_state {state!r}. "
-        "Add it to synthbanshee/data/taxonomy.yaml or map it in _EMOTION_ALIASES in cli.py."
+        "Map it in _EMOTION_ALIASES in cli.py, or add it to taxonomy.yaml and reinstall."
     )
 
 
diff --git a/synthbanshee/config/taxonomy.py b/synthbanshee/config/taxonomy.py
index e487998..eb8f5ae 100644
--- a/synthbanshee/config/taxonomy.py
+++ b/synthbanshee/config/taxonomy.py
@@ -10,11 +10,13 @@
 
 import yaml
 
+import synthbanshee.data as _data_pkg
+
 
 @lru_cache(maxsize=1)
 def load_taxonomy() -> dict:
     """Load and cache the taxonomy YAML. Call this instead of hardcoding codes."""
-    ref = resources.files("synthbanshee.data").joinpath("taxonomy.yaml")
+    ref = resources.files(_data_pkg).joinpath("taxonomy.yaml")
     return yaml.safe_load(ref.read_text(encoding="utf-8"))
 
 

From 8a22f7640e5f8baade5b13d5819e002566aebd3c Mon Sep 17 00:00:00 2001
From: Shay Palachy <shaypal5@users.noreply.github.com>
Date: Sat, 2 May 2026 14:47:06 +0300
Subject: [PATCH 3/3] fix: remove unnecessary artifacts config, update stale
 doc references
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

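- Remove `[tool.hatch.build] artifacts` — Hatch already includes
  non-py files inside declared packages by default (verified by
  inspecting built wheel). `artifacts` is only needed for files that
  are VCS-ignored, and .gitignore re-includes synthbanshee/data/.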
- Update remaining `configs/taxonomy.yaml` references in
  docs/implementation_plan.md, docs/audio_generation_v3_design.md,
  and llms.txt.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 docs/audio_generation_v3_design.md | 4 ++--
 docs/implementation_plan.md        | 3 +--
 llms.txt                           | 2 +-
 pyproject.toml                     | 7 -------
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/docs/audio_generation_v3_design.md b/docs/audio_generation_v3_design.md
index b3f6b83..1edeb94 100644
--- a/docs/audio_generation_v3_design.md
+++ b/docs/audio_generation_v3_design.md
@@ -99,7 +99,7 @@ The scripts, label taxonomy, pipeline stage decomposition, and cache system are
 ## 3. What Does NOT Need to Change
 
 - **Script generation** (LLM + Jinja2 templates) — produces structurally correct dialogue; the problem is in rendering, not generation
-- **Label taxonomy** (`configs/taxonomy.yaml`) — hierarchical PHYS/VERB/DIST/EMOT/ACOU/NONE is appropriate
+- **Label taxonomy** (`synthbanshee/data/taxonomy.yaml`) — hierarchical PHYS/VERB/DIST/EMOT/ACOU/NONE is appropriate
 - **Pipeline stage decomposition** — the five stages are correctly separated
 - **TTS cache** (SHA-256 of full SSML string) — correct; will naturally miss on any SSML change
 - **Acoustic augmentation** (Tier B) — still needed and architecturally correct; runs after V3 audio assembly
@@ -386,7 +386,7 @@ This is not just a data cleanliness issue — an emotion-conditioned model that
 
 1. **Remove the silent downgrade.** `_normalize_emotion()` should log a warning and either (a) fail loudly for known invalid emotional states, or (b) map only to the nearest valid state with an explicit mapping table, not to `neutral` as a catch-all.
 
-2. **Extend `configs/taxonomy.yaml` `emotional_states` list** to cover all emotional states that the LLM plausibly generates for each intensity level. Run the debug_run_1 script through the pipeline, collect all unique emotional states, and add any missing ones to the taxonomy before removing the fallback.
+2. **Extend `synthbanshee/data/taxonomy.yaml` `emotional_states` list** to cover all emotional states that the LLM plausibly generates for each intensity level. Run the debug_run_1 script through the pipeline, collect all unique emotional states, and add any missing ones to the taxonomy before removing the fallback.
 
 3. **Add a QA check:** `qa.py` should flag any clip where the number of turns with emotional_state = `neutral` in the JSONL exceeds the number of turns with intensity ≤ 2 in the script. If I4 and I5 turns are being labeled as `neutral`, the normalization fallback has been triggered.
 
diff --git a/docs/implementation_plan.md b/docs/implementation_plan.md
index 363a40e..075df04 100644
--- a/docs/implementation_plan.md
+++ b/docs/implementation_plan.md
@@ -65,7 +65,6 @@ SynthBanshee/
 │   ├── speakers/                 # Speaker persona YAMLs
 │   ├── acoustic_scenes/          # Room/device YAML configs
 │   ├── run_configs/              # Full generation run definitions (RunConfig YAML)
-│   ├── taxonomy.yaml             # Label taxonomy — single source of truth
 │   └── examples/                 # Worked examples for each config type
 ├── assets/                       # Source assets (gitignored; populated at runtime)
 │   ├── speech/                   # TTS utterance cache (SHA-256 keyed)
@@ -239,7 +238,7 @@ Each template defines:
 
 **Acceptance criteria:** 50 scripts per project generated; zero transcript redundancy violations; 10% manual spot-check passes plausibility review.
 
-**✓ Complete** — `ScriptGenerator` in `synthbanshee/script/generator.py`; SHA-256 generation cache in `assets/scripts/`; output validated as `list[DialogueTurn]`; wired end-to-end into `_run_generate_pipeline()` in `cli.py`. `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps (typology, intensity) → taxonomy codes, validated against `configs/taxonomy.yaml` at import time.
+**✓ Complete** — `ScriptGenerator` in `synthbanshee/script/generator.py`; SHA-256 generation cache in `assets/scripts/`; output validated as `list[DialogueTurn]`; wired end-to-end into `_run_generate_pipeline()` in `cli.py`. `_TYPOLOGY_INTENSITY_MAP` in `cli.py` maps (typology, intensity) → taxonomy codes, validated against `synthbanshee/data/taxonomy.yaml` at import time.
 
 ### 1.4 Tier A Dataset Generation Run
 
diff --git a/llms.txt b/llms.txt
index 590dd94..3eee27d 100644
--- a/llms.txt
+++ b/llms.txt
@@ -54,7 +54,7 @@ Synthetic Hebrew audio dataset generator for two DataHack AI safety projects:
 - synthbanshee/config/taxonomy.py          — taxonomy loader helpers
 
 ## Config files (YAML)
-- configs/taxonomy.yaml                    — label taxonomy (source of truth for all label strings)
+- synthbanshee/data/taxonomy.yaml          — label taxonomy (source of truth for all label strings)
 - configs/examples/scene_*.yaml            — worked scene examples
 - configs/examples/speaker_*.yaml          — worked speaker persona examples (with rms_target_dbfs)
 - synthbanshee/config/acoustic_config.py   — room + device validity/source of truth (currently hardcoded sets; configs/acoustic_scenes/ is user-created if needed)
diff --git a/pyproject.toml b/pyproject.toml
index c16c845..c914b49 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -72,13 +72,6 @@ dev = [
 [project.scripts]
 synthbanshee = "synthbanshee.cli:cli"
 
-[tool.hatch.build]
-artifacts = [
-    "synthbanshee/**/*.yaml",
-    "synthbanshee/**/*.j2",
-    "synthbanshee/**/*.txt",
-]
-
 [tool.hatch.build.targets.wheel]
 packages = ["synthbanshee"]