# Baseline vs. Enhanced Decoder Metrics

This notebook visualises word error rate (WER) and character accuracy for multiple decoding pipelines. Populate the metric files from `eval_competition.py` or your Weights & Biases exports, then re-run the cells below to refresh the plots.

## Expected inputs

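* `metrics/baseline_metrics.json` — JSON object with the keys `wer` and `char_accuracy`, each a number given as a fraction (e.g. `0.29`, not `29`); the accuracy axis is capped just above 1.0.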
* `metrics/enhanced_metrics.json` — same structure for the LM-augmented or curriculum-trained model.

If a file is missing, or lacks one of the two keys, the notebook falls back to illustrative placeholder values so the plotting code still runs; these placeholders are not real results. Update `METRIC_SPECS` if you log additional systems.

In [None]:
import json
from pathlib import Path
from typing import Dict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
def load_metrics(path: Path, fallback: Dict[str, float]) -> Dict[str, float]:
    """Read WER and character accuracy from a JSON metrics file.

    Args:
        path: Location of the JSON file, expected to hold an object with the
            keys ``wer`` and ``char_accuracy``.
        fallback: Values used when the file does not exist, or for any single
            key the file omits. Must contain both keys. Never modified.

    Returns:
        A new dict. When the file exists it holds exactly the keys ``wer`` and
        ``char_accuracy`` as floats; otherwise it is a shallow copy of
        ``fallback``.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        ValueError, TypeError: If a stored value cannot be converted by
            ``float()``.
    """
    if not path.exists():
        # Return a copy: callers add their own keys to the result, and handing
        # back the caller's dict itself would let those writes leak into it.
        return dict(fallback)

    with path.open("r", encoding="utf-8") as handle:
        data = json.load(handle)

    # Fall back per key, so a file that logs only one metric is still usable.
    return {
        key: float(data.get(key, fallback[key]))
        for key in ("wer", "char_accuracy")
    }

# Display label -> (metrics file, fallback values). The fallback numbers are
# illustrative placeholders, used only when the file or one of its keys is
# missing.
METRIC_SPECS = {
    "Baseline": (Path("metrics/baseline_metrics.json"), {"wer": 0.285, "char_accuracy": 0.86}),
    "Enhanced (Curriculum + LM)": (Path("metrics/enhanced_metrics.json"), {"wer": 0.192, "char_accuracy": 0.91}),
}

# One row per system. Each row is a freshly built dict, so the fallback dicts
# held in METRIC_SPECS are never modified, whatever load_metrics returns.
records = []
for label, (metric_path, fallback) in METRIC_SPECS.items():
    metrics = {**load_metrics(metric_path, fallback), "system": label}
    records.append(metrics)

df = pd.DataFrame.from_records(records)
df

In [None]:
# Grouped bar chart: WER on the left axis, character accuracy on the right.
# The style is registered as "seaborn-v0_8" in newer Matplotlib releases and as
# "seaborn" in older ones; use whichever this installation provides.
for candidate in ("seaborn-v0_8", "seaborn"):
    if candidate in plt.style.available:
        plt.style.use(candidate)
        break

fig, ax1 = plt.subplots(figsize=(8, 5))
x = np.arange(len(df))
width = 0.35

wer_bars = ax1.bar(x - width/2, df["wer"], width, label="WER", color="tab:red")
ax1.set_ylabel("Word Error Rate")
# Leave 20% headroom above the tallest bar. If every WER is 0 (or missing) the
# limits would collapse to (0, 0), so use a unit-height axis in that case.
wer_max = float(df["wer"].max())
ax1.set_ylim(0, wer_max * 1.2 if wer_max > 0 else 1.0)

ax2 = ax1.twinx()
acc_bars = ax2.bar(x + width/2, df["char_accuracy"], width, label="Char Accuracy", color="tab:blue")
ax2.set_ylabel("Character Accuracy")
ax2.set_ylim(0, 1.05)
# The twin axis sits on top of ax1; its grid lines would otherwise be drawn
# across the WER bars and would not line up with the left-hand ticks.
ax2.grid(False)

ax1.set_xticks(x)
ax1.set_xticklabels(df["system"], rotation=15, ha="right")
ax1.set_title("Decoder Performance Comparison")

fig.tight_layout()
# Figure-level legend gathers the labelled bars from both axes.
fig.legend(loc="upper center", ncol=2, bbox_to_anchor=(0.5, 1.08))
plt.show()