# 04 — Phoneme Analysis

This notebook analyzes how Whisper's output maps to Dothraki phonemes.
We compare the IPA phonemes extracted from Whisper's transcription
against the known Dothraki IPA ground truth.

## Contents
1. [Phonemization Pipeline](#1-phonemization-pipeline) — How words become phonemes
2. [Phoneme Distributions](#2-phoneme-distributions) — Ground truth vs extracted
3. [Phoneme Confusion](#3-phoneme-confusion) — Which sounds get mixed up?
4. [Articulatory Analysis](#4-articulatory-analysis) — Where in the mouth do errors happen?

In [None]:
import json
import sys
from pathlib import Path
from collections import Counter

import numpy as np
import matplotlib.pyplot as plt

# Project root is taken to be the parent of the current working directory.
# NOTE(review): this assumes the notebook is run from a direct sub-folder of
# the project — confirm against how the notebooks are launched.
PROJECT_ROOT = Path.cwd().parent
# Put the project root first on sys.path so project-local packages import.
sys.path.insert(0, str(PROJECT_ROOT))

# Input locations, all relative to the project root.
RESULTS_DIR = PROJECT_ROOT / 'data' / 'results'
LEXICON_PATH = PROJECT_ROOT / 'data' / 'lexicon' / 'dothraki_lexicon.json'
DIALOGUE_PATH = PROJECT_ROOT / 'data' / 'dialogue' / 'dothraki_dialogue.json'

# Plot defaults shared by every figure in this notebook.
plt.style.use('dark_background')
plt.rcParams['figure.figsize'] = (14, 6)
plt.rcParams['font.size'] = 12

# These files contain IPA symbols, so decode them explicitly as UTF-8.
# Without an explicit encoding, read_text() falls back to the locale's
# default, which can fail on or garble the non-ASCII phoneme characters.
JSON_ENCODING = 'utf-8'
eval_small = json.loads(
    (RESULTS_DIR / 'batch_eval_small.json').read_text(encoding=JSON_ENCODING)
)
lexicon = json.loads(LEXICON_PATH.read_text(encoding=JSON_ENCODING))
dialogue = json.loads(DIALOGUE_PATH.read_text(encoding=JSON_ENCODING))

print(f'Loaded: {eval_small["num_clips"]} evaluated clips, {len(lexicon)} lexicon entries')

---
## 1. Phonemization Pipeline

The phonemization step converts Whisper's text output into IPA phonemes.
Since Whisper thinks it's hearing English (or other languages), we use
gruut/espeak-ng to extract the IPA for whatever language was detected.

```
Whisper output (text) → Language detection → gruut/espeak-ng → IPA phonemes
```

In [None]:
from pipeline.dothraki.phonemizer import phonemize_text, whisper_lang_to_gruut

# Number of transcriptions shown in the demo table.
N_DEMO_CLIPS = 8
# Width of the language column; sized to its header so that the header row
# and the data rows line up (a narrower width lets the header overflow).
LANG_COL_WIDTH = len('Detected Lang')

# Keep only clips without an error whose Whisper text is non-blank.
# `or ''` tolerates records where 'whisper_text' is present but null.
results = eval_small['results']
nonempty = [
    r for r in results
    if (r.get('whisper_text') or '').strip() and 'error' not in r
]

print(f'Phonemizing {min(N_DEMO_CLIPS, len(nonempty))} sample transcriptions...\n')
print(f'{"Whisper Output":30s} {"Detected Lang":>{LANG_COL_WIDTH}s}  {"Extracted IPA"}')
print('-' * 80)

for r in nonempty[:N_DEMO_CLIPS]:
    lang = whisper_lang_to_gruut(r['whisper_lang'])
    # Only the first 50 characters are phonemized to keep the demo quick.
    phonemes = phonemize_text(r['whisper_text'][:50], lang=lang, whisper_lang=r['whisper_lang'])
    # The result is iterated as pairs; the second item of each pair is the
    # IPA string, and empty ones are dropped from the display.
    ipa_str = ' '.join(ipa for _, ipa in phonemes if ipa)
    print(f'{r["whisper_text"][:30]:30s} {r["whisper_lang"]:>{LANG_COL_WIDTH}s}  {ipa_str[:40]}')

---
## 2. Phoneme Distributions

Comparing the phoneme frequency distribution from:
- **Ground truth:** The known Dothraki IPA transcriptions from the dialogue scripts
- **Whisper-derived:** IPA extracted from Whisper's output via phonemizer

In [None]:
# Characters that count as a phoneme when an IPA string is scanned one
# character at a time; anything else (stress marks, spaces, ...) is skipped.
PHONEME_CHARS = set('abcdefghijklmnopqrstuvwxyzθðʃʒɾŋɣɔɛʔæɪʊɹɑə')

# Upper bound on the number of clips run through the phonemizer (for speed).
MAX_PHONEMIZED_CLIPS = 100


def _phoneme_chars(ipa_text):
    """Return the lower-cased characters of ``ipa_text`` found in PHONEME_CHARS.

    A missing/empty value (``None`` or ``''``) yields an empty list.
    """
    return [c for c in (ipa_text or '').lower() if c in PHONEME_CHARS]


# Both distributions are built from the SAME clips. Collecting ground truth
# from every result while phonemizing only a subset would make a phoneme look
# "missed by Whisper" merely because it did not occur in the smaller sample.
# `nonempty` already excludes records carrying an 'error' key.
analysis_clips = nonempty[:MAX_PHONEMIZED_CLIPS]

gt_phonemes = []
whisper_phonemes = []
for r in analysis_clips:
    # Ground truth: the known IPA stored on the evaluation record.
    gt_phonemes.extend(_phoneme_chars(r.get('gt_ipa', '')))

    # Whisper-derived: phonemize the transcription in the detected language.
    lang = whisper_lang_to_gruut(r['whisper_lang'])
    phonemes = phonemize_text(r['whisper_text'], lang=lang, whisper_lang=r['whisper_lang'])
    for _, ipa in phonemes:
        whisper_phonemes.extend(_phoneme_chars(ipa))

# The 25 most frequent phonemes on each side, as (phoneme, count) pairs.
gt_freq = Counter(gt_phonemes).most_common(25)
whisper_freq = Counter(whisper_phonemes).most_common(25)

print(f'Ground truth phoneme tokens: {len(gt_phonemes)}')
print(f'Whisper-derived phoneme tokens: {len(whisper_phonemes)}')

In [None]:
# Side-by-side phoneme frequency comparison


def _plot_phoneme_freq(ax, freq, color, title):
    """Draw one bar per (phoneme, count) pair of ``freq`` on ``ax``.

    Does nothing when ``freq`` is empty, leaving the axes untouched.
    """
    if not freq:
        return
    labels, counts = zip(*freq)
    positions = range(len(labels))
    ax.bar(positions, counts, color=color, edgecolor='#1a1a2e', alpha=0.8)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, fontsize=13)
    ax.set_ylabel('Frequency')
    ax.set_title(title)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))

_plot_phoneme_freq(ax1, gt_freq, '#4ecdc4', 'Ground Truth Dothraki Phonemes')
# The phonemizer is run in whatever language Whisper detected for each clip,
# so the panel title must not claim an English-only phonemizer.
_plot_phoneme_freq(
    ax2, whisper_freq, '#ff6b6b',
    'Whisper-Derived Phonemes (via detected-language phonemizer)',
)

fig.suptitle('Phoneme Frequency: Ground Truth vs Whisper Output', fontsize=15, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Inventory comparison: which phoneme characters occur on only one side
# (ground truth vs Whisper-derived) and how many occur on both.
gt_set, whisper_set = set(gt_phonemes), set(whisper_phonemes)

shared = gt_set.intersection(whisper_set)
gt_only = gt_set.difference(whisper_set)
whisper_only = whisper_set.difference(gt_set)

# Assemble the report first, then emit it with a single print call.
report_lines = [
    f'Phonemes in ground truth only:    {sorted(gt_only)}',
    f'Phonemes in Whisper output only:  {sorted(whisper_only)}',
    f'Shared phonemes:                  {len(shared)}',
    '\nDothraki phonemes Whisper misses: These are the sounds Whisper cannot',
    "recognize because they don't exist in the language it thinks it's hearing.",
]
print('\n'.join(report_lines))

---
## 3. Phoneme Confusion

Analyzing which Dothraki phonemes get confused with phonemes of the language Whisper detected.
This is the core challenge: when Whisper hears /θ/ (Dothraki "th"),
does it map it to English /θ/ ("think") or something else entirely?

In [None]:
# Compare the aggregate phoneme proportions of the ground truth against the
# Whisper-derived phonemes. This is a distribution-level comparison: the
# phonemes are pooled over clips and no per-clip alignment is performed.

# Aggregate phoneme counts
gt_counter = Counter(gt_phonemes)
whisper_counter = Counter(whisper_phonemes)

# Every phoneme seen on either side, in a stable (sorted) order
all_phonemes_union = sorted(gt_set | whisper_set)
gt_total = sum(gt_counter.values())
whisper_total = sum(whisper_counter.values())


def _proportions(counter, total, phonemes):
    """Return each phoneme's share of ``total``, in the order of ``phonemes``.

    When ``total`` is zero (no phonemes were collected on that side) every
    share is reported as 0.0 instead of raising ZeroDivisionError.
    """
    if not total:
        return [0.0 for _ in phonemes]
    return [counter.get(p, 0) / total for p in phonemes]


gt_props = _proportions(gt_counter, gt_total, all_phonemes_union)
whisper_props = _proportions(whisper_counter, whisper_total, all_phonemes_union)

# Grouped bar chart: one pair of bars per phoneme
x = np.arange(len(all_phonemes_union))
width = 0.35

fig, ax = plt.subplots(figsize=(18, 6))
bars1 = ax.bar(x - width/2, gt_props, width, label='Ground Truth (Dothraki)', color='#4ecdc4', alpha=0.8)
bars2 = ax.bar(x + width/2, whisper_props, width, label='Whisper-Derived', color='#ff6b6b', alpha=0.8)

ax.set_xlabel('Phoneme')
ax.set_ylabel('Proportion')
ax.set_title('Phoneme Proportion: Ground Truth vs Whisper-Derived')
ax.set_xticks(x)
ax.set_xticklabels(all_phonemes_union, fontsize=12)
ax.legend()

plt.tight_layout()
plt.show()

# Largest gaps first. A positive difference means the phoneme makes up a
# bigger share of the ground truth than of Whisper's output.
divergences = [
    (p, gt_share - whisper_share)
    for p, gt_share, whisper_share in zip(all_phonemes_union, gt_props, whisper_props)
]
divergences.sort(key=lambda item: abs(item[1]), reverse=True)

print('\nBiggest phoneme divergences (GT proportion - Whisper proportion):')
for p, diff in divergences[:10]:
    direction = 'under-represented in Whisper' if diff > 0 else 'over-represented in Whisper'
    print(f'  /{p}/: {diff:+.4f} ({direction})')

---
## 4. Articulatory Analysis

Dothraki phonemes categorized by articulatory features.
This helps understand which *types* of sounds are hardest for Whisper.

In [None]:
# Categorize phonemes by manner of articulation.
# Each symbol must appear at most once per category: the per-category totals
# below are sums over these lists, so a repeated symbol would be counted twice.
MANNER_CATEGORIES = {
    'Plosives': list('pbtdkgq'),
    'Fricatives': list('fvsz') + ['θ', 'ð', 'ʃ', 'ʒ', 'x', 'ɣ', 'h'],
    'Nasals': list('mn') + ['ŋ'],
    'Approximants': list('wjlr') + ['ɾ', 'ɹ'],
    'Vowels': list('aeiou') + ['ɔ', 'ɛ', 'æ', 'ɪ', 'ʊ', 'ɑ', 'ə'],
}

# Token counts per category for both sources
gt_by_manner = {}
whisper_by_manner = {}
for category, phones in MANNER_CATEGORIES.items():
    gt_by_manner[category] = sum(gt_counter.get(p, 0) for p in phones)
    whisper_by_manner[category] = sum(whisper_counter.get(p, 0) for p in phones)


def _percent(count, total):
    """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is zero."""
    return count / total * 100 if total else 0.0


fig, ax = plt.subplots(figsize=(10, 6))
categories = list(MANNER_CATEGORIES.keys())
x = np.arange(len(categories))
width = 0.35

# Percentages are relative to ALL counted phoneme tokens; symbols that belong
# to no category above are part of the total, so bars need not sum to 100.
gt_vals = [_percent(gt_by_manner[c], gt_total) for c in categories]
w_vals = [_percent(whisper_by_manner[c], whisper_total) for c in categories]

ax.bar(x - width/2, gt_vals, width, label='Ground Truth', color='#4ecdc4', alpha=0.8)
ax.bar(x + width/2, w_vals, width, label='Whisper-Derived', color='#ff6b6b', alpha=0.8)

ax.set_xlabel('Manner of Articulation')
ax.set_ylabel('Percentage of Total Phonemes')
ax.set_title('Phoneme Category Distribution: GT vs Whisper')
ax.set_xticks(x)
ax.set_xticklabels(categories)
ax.legend()

plt.tight_layout()
plt.show()

print('\nInsight: Differences in category distribution reveal systematic biases.')
print('E.g., if Whisper over-represents vowels, it may be hallucinating')
print('vowel-heavy English words to fill gaps in Dothraki consonant clusters.')

In [None]:
# Dothraki phoneme inventory from the lexicon: pool the 'ipa' field of every
# entry that has one and keep the characters listed in PHONEME_CHARS.
all_lex_ipa = ''.join(e['ipa'] for e in lexicon if e.get('ipa'))
lex_phonemes = [c for c in all_lex_ipa.lower() if c in PHONEME_CHARS]
lex_unique = sorted(set(lex_phonemes))

print('Dothraki Phoneme Inventory (from lexicon):')
print(f'  Total unique: {len(lex_unique)}')

# Single source of truth for the vowel/consonant split below.
VOWEL_CHARS = 'aeiouɔɛæɪʊɑə'
consonants = [p for p in lex_unique if p not in VOWEL_CHARS]
vowels = [p for p in lex_unique if p in VOWEL_CHARS]
print(f'  Consonants ({len(consonants)}): {" ".join(consonants)}')
print(f'  Vowels ({len(vowels)}): {" ".join(vowels)}')

# Which Dothraki sounds don't exist in English?
# The set is character-level, like the rest of this notebook. 'i' and 'o' are
# listed because English has /i/ and the /oʊ/ diphthong, in the same way 'a'
# and 'e' are listed for /aɪ/ and /eɪ/; leaving them out would wrongly report
# those two vowels as non-English.
english_phonemes = set('pbmfvθðtdnszʃʒlɹjkgŋhwaeiouɪɛæəɑɔʊ')
dothraki_non_english = set(lex_unique) - english_phonemes
print(f'\n  Non-English Dothraki sounds: {sorted(dothraki_non_english)}')
print(f'  These are the hardest for Whisper to handle since it expects English.')