# 05 — Dothraki Matching Engine

This notebook explores the core matching algorithm that maps Whisper's
phoneme output to actual Dothraki words from the lexicon.

## Contents
1. [The Matching Problem](#1-the-matching-problem) — Why this is hard
2. [IPA Edit Distance](#2-ipa-edit-distance) — Our primary matching approach
3. [Match Quality Analysis](#3-match-quality-analysis) — How good are the matches?
4. [End-to-End Examples](#4-end-to-end-examples) — Full pipeline walkthroughs
5. [Failure Analysis](#5-failure-analysis) — Where matching breaks down

In [None]:
import json
import sys
from pathlib import Path
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

# The project root is taken to be the parent of the current working directory;
# putting it first on sys.path makes the `pipeline` package importable.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

# Directory holding the evaluation result JSON files.
RESULTS_DIR = PROJECT_ROOT.joinpath('data', 'results')

# Plot defaults shared by every figure in this notebook.
plt.style.use('dark_background')
plt.rcParams.update({
    'figure.figsize': (14, 6),
    'font.size': 12,
})

from pipeline.dothraki.matcher import DothrakiMatcher, _normalize_ipa, _levenshtein
from pipeline.dothraki.phonemizer import phonemize_text, whisper_lang_to_gruut
from pipeline.dothraki.translator import Translator

# Build the matcher and translator once; the analysis cells below reuse them.
matcher = DothrakiMatcher()
translator = Translator()

# JSON is UTF-8 by specification. Decode explicitly so the file is read the
# same way regardless of the platform's locale default encoding.
eval_small = json.loads(
    (RESULTS_DIR / 'batch_eval_small.json').read_text(encoding='utf-8')
)
results = eval_small['results']

# Keep only clips that produced a non-blank transcription and carry no
# 'error' key. `or ''` tolerates an explicit null `whisper_text`.
nonempty = [
    r for r in results
    if (r.get('whisper_text') or '').strip() and 'error' not in r
]

print(f'Matcher loaded: {len(matcher._entries)} lexicon entries')
print(f'Non-empty transcriptions to analyze: {len(nonempty)}')

---
## 1. The Matching Problem

The challenge: Whisper outputs text in a **wrong language** (usually English).
We must figure out which Dothraki word the speaker actually said.

```
Audio: [Dothraki speech] → Whisper: "to throw key" (English hallucination)
                                         ↓
                              Phonemize: /tə θɹoʊ ki/
                                         ↓
                              Match vs lexicon: "dothrakhqoyi" (/doθɾaxqoji/) ?
```

The key insight: even though the *words* are wrong, the *sounds* partially overlap.

In [None]:
# Match a handful of English words against the lexicon and print, for each,
# its own IPA next to the best-scoring lexicon candidate.
test_words = ['hello', 'throw', 'dragon', 'king', 'blood', 'horse', 'fight']

header = f'{"English Word":15s} {"IPA":15s} {"Top Match":15s} {"Match IPA":15s} {"Score":>6s} {"English"}'
print(header)
print('-' * 90)

for english_word in test_words:
    hits = matcher.match_word(english_word, lang='en-us', top_k=1)
    if not hits:
        # Words without any candidate are left out of the table.
        continue

    top = hits[0]

    # Phonemize the English word ourselves to show the IPA it was matched on;
    # fall back to '?' when no IPA is available for the first token.
    phonemized = phonemize_text(english_word, lang='en-us')
    if phonemized and phonemized[0][1]:
        source_ipa = phonemized[0][1]
    else:
        source_ipa = '?'

    print(f'{english_word:15s} {source_ipa:15s} {top.word:15s} {top.ipa:15s} {top.score:6.3f} {top.english[:30]}')

---
## 2. IPA Edit Distance

We use **normalized Levenshtein distance** on IPA strings.
This measures how many character insertions, deletions, or substitutions
are needed to transform one IPA string into another.

Score = `1 - (edit_distance / max_length)`, where `max_length` is the length of the longer of the two normalized IPA strings.

A score of 1.0 = perfect match, 0.0 = completely different.

In [None]:
# Tabulate the edit distance and normalized score for hand-picked IPA pairs.
pairs = [
    ('doθɾak', 'doθɾaki'),    # near-identical
    ('khal', 'xal'),           # Dothraki kh vs x
    # English "throw" is /θɹoʊ/ (voiceless θ, approximant ɹ), matching the
    # phonemization shown in the pipeline diagram in section 1.
    ('θɹoʊ', 'doθɾak'),       # English "throw" vs Dothraki
    ('hɛloʊ', 'hajo'),        # hello vs hajo
    ('blaːd', 'qoj'),         # unrelated
]

print(f'{"IPA A":15s} {"IPA B":15s} {"Edit Dist":>10s} {"Score":>8s}')
print('-' * 55)
for a, b in pairs:
    # Both strings go through the matcher's normalization before comparing.
    na, nb = _normalize_ipa(a), _normalize_ipa(b)
    dist = _levenshtein(na, nb)
    # Divide by the longer length; the trailing 1 avoids dividing by zero
    # when both normalized strings are empty.
    score = 1 - dist / max(len(na), len(nb), 1)
    print(f'{a:15s} {b:15s} {dist:10d} {score:8.3f}')

In [None]:
# Distribution of top-1 match scores across all evaluated clips
# `all_scores`, `all_top_words` and `above_threshold` are reused by later cells.
all_scores = []
all_top_words = []

for r in nonempty:
    lang = whisper_lang_to_gruut(r['whisper_lang'])
    match_results = matcher.match_text(
        r['whisper_text'], lang=lang, top_k=1, whisper_lang=r['whisper_lang']
    )
    # One result per transcribed word; words without candidates are skipped.
    for word_result in match_results:
        if word_result['matches']:
            best = word_result['matches'][0]
            all_scores.append(best.score)
            all_top_words.append(best.word)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

# Score distribution
ax1.hist(all_scores, bins=50, color='#4ecdc4', edgecolor='#1a1a2e', alpha=0.8)
ax1.axvline(0.4, color='#ff6b6b', linestyle='--', label='Confidence threshold (0.4)')
ax1.set_xlabel('Match Score')
ax1.set_ylabel('Count')
ax1.set_title('Distribution of Top-1 Match Scores')
ax1.legend()

# Most frequently matched Dothraki words
word_freq = Counter(all_top_words).most_common(15)
if word_freq:
    words, freqs = zip(*word_freq)
    ax2.barh(range(len(words)), freqs, color='#ff6b6b', edgecolor='#1a1a2e', alpha=0.8)
    ax2.set_yticks(range(len(words)))
    ax2.set_yticklabels(words)
    ax2.invert_yaxis()
else:
    # zip(*[]) cannot be unpacked into two names; leave the panel empty.
    words, freqs = (), ()
ax2.set_xlabel('Frequency')
ax2.set_title('Most Frequently Matched Dothraki Words')

plt.tight_layout()
plt.show()

above_threshold = sum(1 for s in all_scores if s >= 0.4)
if all_scores:
    print(f'\nMatches above 0.4 threshold: {above_threshold}/{len(all_scores)} ({above_threshold/len(all_scores)*100:.1f}%)')
    print(f'Mean score: {np.mean(all_scores):.3f}, Median: {np.median(all_scores):.3f}')
else:
    # Avoid ZeroDivisionError / NaN statistics when nothing was matched.
    print('\nNo word matches were produced; no score statistics to report.')

---
## 3. Match Quality Analysis

Examining how well the matcher finds the correct Dothraki word
when we know the ground truth.

In [None]:
# For each clip, check if ANY of the ground truth Dothraki words
# appear in the top-k matches
top_k_values = [1, 3, 5, 10]
hit_rates = {k: 0 for k in top_k_values}
total_checked = 0

# Removes the sentence punctuation (! ? . ,) from ground-truth text in one pass.
punct_table = str.maketrans('', '', '!?.,')

for r in nonempty:
    # A missing or null ground truth is treated like an empty one and skipped.
    gt_text = r.get('gt_dothraki') or ''
    gt_words = set(gt_text.lower().translate(punct_table).split())
    if not gt_words:
        continue

    # Match once at the largest k; smaller k values are prefixes of each list.
    lang = whisper_lang_to_gruut(r['whisper_lang'])
    match_results = matcher.match_text(
        r['whisper_text'], lang=lang, top_k=max(top_k_values), whisper_lang=r['whisper_lang']
    )

    # A clip counts as a hit at k if any ground-truth word appears among the
    # top-k candidates of any transcribed word.
    for k in top_k_values:
        matched_words = {
            m.word.lower()
            for word_result in match_results
            for m in word_result['matches'][:k]
        }
        if gt_words & matched_words:
            hit_rates[k] += 1

    total_checked += 1

if total_checked:
    fig, ax = plt.subplots(figsize=(8, 5))
    k_labels = [f'Top-{k}' for k in top_k_values]
    rates = [hit_rates[k] / total_checked * 100 for k in top_k_values]

    bars = ax.bar(k_labels, rates, color='#4ecdc4', edgecolor='#1a1a2e', alpha=0.8)
    for bar, rate in zip(bars, rates):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 1,
                f'{rate:.1f}%', ha='center', fontsize=12)

    ax.set_ylabel('Hit Rate (%)')
    ax.set_title('Ground Truth Word Found in Top-K Matches')
    ax.set_ylim(0, 100)

    plt.tight_layout()
    plt.show()

    print(f'Clips checked: {total_checked}')
    for k in top_k_values:
        print(f'  Top-{k}: {hit_rates[k]}/{total_checked} ({hit_rates[k]/total_checked*100:.1f}%)')
else:
    # Nothing to divide by: report it instead of raising ZeroDivisionError.
    print('Clips checked: 0 (no clips with ground-truth Dothraki text)')

---
## 4. End-to-End Examples

Walking through the full pipeline for specific clips to show
exactly how matching and translation work together.

In [None]:
# Walk the first five clips through matching and translation, printing the
# ground truth, the Whisper text, and the best candidate for the leading words.
rule = '=' * 70

for example_no, clip in enumerate(nonempty[:5], start=1):
    print(f'\n{rule}')
    print(f'Example {example_no}')
    print(rule)
    print(f'Ground Truth (Dothraki):  {clip["gt_dothraki"]}')
    print(f'Ground Truth (English):   {clip["gt_english"]}')

    # Only the first 80 characters are displayed AND matched, so a word that
    # straddles the cut is matched as a fragment.
    detected = clip['whisper_lang']
    snippet = clip['whisper_text'][:80]
    print(f'Whisper Output ({detected}):     {snippet}')

    match_results = matcher.match_text(
        snippet,
        lang=whisper_lang_to_gruut(detected),
        top_k=3,
        whisper_lang=detected,
    )

    print('\n  Per-word matching:')
    for entry in match_results[:6]:  # limit the listing to the first 6 words
        candidates = entry['matches']
        if not candidates:
            continue
        token = entry['word']
        token_ipa = entry['ipa'] or '(no IPA)'
        top = candidates[0]
        print(f'    "{token}" [{token_ipa}] → {top.word} [{top.ipa}] ({top.score:.3f}) = "{top.english}"')

    # The translator receives every match result, not just the 6 listed above.
    translation = translator.translate(match_results)
    print(f'\n  Pipeline Translation: {translation.translation}')

---
## 5. Failure Analysis

Understanding where and why the matching algorithm fails.

In [None]:
# Categorize match quality
# Buckets the top-1 scores collected in `all_scores` into three bands.
score_buckets = {'High (>0.6)': 0, 'Medium (0.4-0.6)': 0, 'Low (<0.4)': 0}
for s in all_scores:
    if s > 0.6:
        score_buckets['High (>0.6)'] += 1
    elif s >= 0.4:
        score_buckets['Medium (0.4-0.6)'] += 1
    else:
        score_buckets['Low (<0.4)'] += 1

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart of score quality
colors_pie = ['#4ecdc4', '#ffd93d', '#ff6b6b']
# Pass concrete lists: dict views are not sequences NumPy can convert.
bucket_labels = list(score_buckets.keys())
bucket_counts = list(score_buckets.values())
if sum(bucket_counts):
    ax1.pie(bucket_counts, labels=bucket_labels, autopct='%1.1f%%',
            colors=colors_pie, textprops={'fontsize': 12})
else:
    # An all-zero pie cannot be normalized; show a placeholder instead.
    ax1.text(0.5, 0.5, 'No matches to summarise', ha='center', va='center',
             transform=ax1.transAxes)
ax1.set_title('Match Score Quality Distribution')

# Score by detected language
# NOTE: only the first 50 characters of each transcription are matched here,
# so these scores are not the same population as `all_scores`.
lang_scores = {}
for r in nonempty:
    lang = r['whisper_lang']
    match_results = matcher.match_text(
        r['whisper_text'][:50], lang=whisper_lang_to_gruut(lang),
        top_k=1, whisper_lang=lang
    )
    scores = [wr['matches'][0].score for wr in match_results if wr['matches']]
    if scores:
        lang_scores.setdefault(lang, []).extend(scores)

# Languages ordered by descending mean score.
langs_sorted = sorted(lang_scores, key=lambda code: -np.mean(lang_scores[code]))
box_data = [lang_scores[code] for code in langs_sorted]
if box_data:
    # The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9;
    # setting the tick labels afterwards works on old and new versions alike.
    bp = ax2.boxplot(box_data, patch_artist=True)
    ax2.set_xticklabels(langs_sorted)
    for patch in bp['boxes']:
        patch.set_facecolor('#4ecdc4')
        patch.set_alpha(0.7)
ax2.set_ylabel('Match Score')
ax2.set_title('Match Score by Whisper Detected Language')
ax2.axhline(0.4, color='#ff6b6b', linestyle='--', alpha=0.7, label='Threshold')
ax2.legend()

plt.tight_layout()
plt.show()

print('Mean match score by detected language:')
for lang in langs_sorted:
    print(f'  {lang}: {np.mean(lang_scores[lang]):.3f} (n={len(lang_scores[lang])})')

In [None]:
# Summary
# Relies on `all_scores` and `above_threshold` computed by the score
# distribution cell earlier in the notebook.
n_matches = len(all_scores)
lexicon_size = len(matcher._entries)
# Guard the ratio and the mean so an empty run prints a summary, not a traceback.
pct_above = above_threshold / n_matches * 100 if n_matches else 0.0
mean_score = np.mean(all_scores) if n_matches else float('nan')

print('='*60)
print('MATCHING ENGINE SUMMARY')
print('='*60)
print('Algorithm:               IPA Edit Distance (Levenshtein)')
print(f'Lexicon size:            {lexicon_size} entries')
print(f'Clips analyzed:          {len(nonempty)}')
print(f'Total word matches:      {n_matches}')
print(f'Mean match score:        {mean_score:.3f}')
print(f'Above threshold (0.4):   {above_threshold}/{n_matches} ({pct_above:.1f}%)')
print('\nKey limitations:')
print('  - Only uses IPA edit distance (no articulatory features)')
print('  - Equal substitution cost for all phoneme pairs')
print('  - No context window (matches word by word independently)')
# Report the loaded lexicon size instead of a hard-coded figure that can drift.
print(f'  - Lexicon coverage limited to {lexicon_size:,} known Dothraki lexicon entries')
print('='*60)