Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/m17_phase_a_validation_report.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# M17 Phase A Validation Report — Whisper + UTMOS on Hebrew Clips

This report validates the Phase A acceptance criteria from the M17 design doc
(`docs/automated_eval_design.md` lines 418–422) before the E1/E2 module skeletons land.
(`docs/automated_eval_design.md` §"Acceptance Criteria") before the E1/E2 module skeletons land.

Generated: 2026-05-05. Raw data: `state/spikes/m17_phase_a/results.json` (gitignored). Auto-tables: `state/spikes/m17_phase_a/report_auto.md`. Listening samples: `state/spikes/m17_phase_a/<clip_id>__<degradation>.wav`. SHA-256 values shown in the manifest are truncated to their first 8 characters; the full hashes are in `results.json`.

Expand Down
25 changes: 17 additions & 8 deletions scripts/m17_phase_a_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,16 +411,21 @@ def write_auto_report(results: dict) -> None:
md.append(f"| {cid} | {score:.3f} |")
md.append("")
md.append(
f"clean mean: **{results['utmos']['clean_mean']:.3f}** (n={len(results['utmos']['clean'])})"
f"clean mean (all): **{results['utmos']['clean_mean']:.3f}** "
f"(n={len(results['utmos']['clean'])}) · "
f"clean mean (degradation sample, n={len(results['utmos']['degradation_sample_clip_ids'])}): "
f"**{results['utmos']['sample_clean_mean']:.3f}**"
)
md.append("")
md.append("## UTMOS — degradation sweep")
md.append("")
md.append("| Degradation | mean UTMOS | clean − deg | direction |")
md.append("|---|---|---|---|")
clean_mean = results["utmos"]["clean_mean"]
# Paired comparison — degraded means cover only the degradation-sample clips,
# so we subtract from the sample-matched clean mean, not the all-clip mean.
sample_clean_mean = results["utmos"]["sample_clean_mean"]
for d in results["utmos"]["degradations"]:
diff = clean_mean - d["mean_utmos"]
diff = sample_clean_mean - d["mean_utmos"]
direction = "↓ as expected" if diff > 0 else "↑ INVERTED"
md.append(
f"| `{d['id']}` ({d['kind']}) | {d['mean_utmos']:.3f} | {diff:+.3f} | {direction} |"
Expand Down Expand Up @@ -557,14 +562,16 @@ def main() -> None:
}
)

# Gate: original gate uses the white-noise -10 dB SNR baseline (the
# design's implicit reference). We also report whether ANY degradation in
# the sweep meets the 0.5 separation threshold.
# Gate: paired comparison between the SAME 5 clips clean and degraded.
# `clean_mean` (n=10) is kept in the JSON as a population baseline, but the
# gate must use `sample_clean_mean` to avoid mixing populations — degraded
# means are over `sample_clips` only.
sample_clean_mean = float(np.mean([clean_scores[c.clip_id] for c in sample_clips]))
primary_deg = next(d for d in degradation_results if d["id"] == "wn_snr_-10db")
primary_separation = clean_mean - primary_deg["mean_utmos"]
primary_separation = sample_clean_mean - primary_deg["mean_utmos"]
primary_gate = "PASS" if primary_separation >= UTMOS_SEPARATION_GATE else "FAIL"
any_passes = any(
(clean_mean - d["mean_utmos"]) >= UTMOS_SEPARATION_GATE for d in degradation_results
(sample_clean_mean - d["mean_utmos"]) >= UTMOS_SEPARATION_GATE for d in degradation_results
)
monotonic_in_severity = is_monotonic_with_severity(
[d for d in degradation_results if d["kind"] == "white_noise"]
Expand Down Expand Up @@ -599,6 +606,8 @@ def main() -> None:
"utmos": {
"clean": clean_scores,
"clean_mean": clean_mean,
"sample_clean_mean": sample_clean_mean,
"degradation_sample_clip_ids": [c.clip_id for c in sample_clips],
"degradations": degradation_results,
"primary_separation_db_-10_white_noise": primary_separation,
"primary_gate": primary_gate,
Expand Down
Loading