# 01 — Data Collection & Exploration

This notebook documents the data sources, collection process, and exploratory analysis
for the Dothraki ASR project.

## Contents
1. [Dothraki Lexicon](#1-dothraki-lexicon) — 1,234 words from the official dictionary
2. [Master Dialogue Scripts](#2-master-dialogue-scripts) — 1,712 lines from David Peterson's GoT scripts
3. [Audio Data](#3-audio-data) — Synthetic (espeak-ng) + real (YouTube) clips
4. [Exploratory Statistics](#4-exploratory-statistics) — Vocabulary distribution, phoneme frequency, etc.

In [None]:
import json
import sys
from pathlib import Path
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np

# Project layout: the parent of the current working directory is treated as
# the project root and is put first on sys.path.
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

# Input files all live under <root>/data
DATA_DIR = PROJECT_ROOT / 'data'
LEXICON_PATH = DATA_DIR / 'lexicon' / 'dothraki_lexicon.json'
DIALOGUE_PATH = DATA_DIR / 'dialogue' / 'dothraki_dialogue.json'
MANIFEST_PATH = DATA_DIR / 'synthetic' / 'manifest.json'

# Plot defaults shared by every figure in the notebook
plt.style.use('dark_background')
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 12,
})

---
## 1. Dothraki Lexicon

**Source:** [The Dothraki Language Dictionary (ver 3.11)](https://docs.dothraki.org/Dothraki.pdf)  
**Author:** Richard Littauer (Lajaki), based on David J. Peterson's work  
**Extraction:** `scripts/build_lexicon.py` using pymupdf (fitz)  

The lexicon was extracted from the official 25-page PDF dictionary. Each entry contains:
- Dothraki word
- IPA pronunciation
- Part of speech
- English translation

In [None]:
# Load the lexicon. The file holds IPA text, so decode explicitly as UTF-8
# instead of relying on the platform's locale encoding (which is not UTF-8
# on every system and would fail or mis-decode the non-ASCII characters).
lexicon = json.loads(LEXICON_PATH.read_text(encoding='utf-8'))
print(f'Total lexicon entries: {len(lexicon)}')

# Show the first few entries as a quick sanity check of the four fields
print('\nSample entries:')
for entry in lexicon[:5]:
    print(f"  {entry['word']:20s} [{entry['ipa']:20s}] {entry['part_of_speech']:20s} → {entry['english'][:50]}")

In [None]:
# Part of speech distribution: one horizontal bar per tag, most frequent on top
pos_counts = Counter(entry['part_of_speech'] for entry in lexicon)
pos_sorted = pos_counts.most_common()

fig, ax = plt.subplots(figsize=(12, 5))
labels, counts = zip(*pos_sorted)
n_labels = len(labels)
positions = range(n_labels)
colors = plt.cm.viridis(np.linspace(0.2, 0.8, n_labels))

bars = ax.barh(positions, counts, color=colors)
ax.set_yticks(positions)
ax.set_yticklabels(labels)
ax.set(xlabel='Count', title='Dothraki Lexicon — Part of Speech Distribution')
# barh draws the first item at the bottom; flip so the largest group is on top
ax.invert_yaxis()

# Annotate each bar with its count, just past the bar's right edge
for bar, count in zip(bars, counts):
    label_x = bar.get_width() + 3
    label_y = bar.get_y() + bar.get_height() / 2
    ax.text(label_x, label_y, str(count), va='center', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:
# Word length distribution (left panel) and IPA character frequency (right panel)
word_lengths = [len(e['word']) for e in lexicon]
mean_word_length = np.mean(word_lengths)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Lengths are integers, so give every length its own unit-wide bin. The edges
# must run one past the maximum: the final histogram bin is closed on both
# sides, so stopping at max would merge the two longest lengths into one bar.
# align='left' centres each bar on its integer so the mean line reads correctly.
length_bins = range(1, max(word_lengths) + 2)
ax1.hist(word_lengths, bins=length_bins, align='left',
         color='#4ecdc4', edgecolor='#1a1a2e', alpha=0.8)
ax1.set_xlabel('Word Length (characters)')
ax1.set_ylabel('Count')
ax1.set_title('Dothraki Word Length Distribution')
ax1.axvline(mean_word_length, color='#ff6b6b', linestyle='--', label=f'Mean: {mean_word_length:.1f}')
ax1.legend()

# IPA phoneme frequency
all_ipa = ''.join(e['ipa'] for e in lexicon)
# Keep phoneme characters only (skip spaces, dots, stress and length marks).
# The stress/length marks are Unicode modifier *letters*, so str.isalpha()
# alone accepts them; they have to be excluded explicitly.
# Counting is per IPA character, so a sound written with several characters
# contributes one count per character.
non_phoneme_marks = 'ˈˌːˑ'
phonemes = [
    c for c in all_ipa
    if (c.isalpha() or c in 'θðʃʒɾŋɣɔɛʔ') and c not in non_phoneme_marks
]
phoneme_counts = Counter(phonemes).most_common(20)

labels_p, counts_p = zip(*phoneme_counts)
ax2.bar(range(len(labels_p)), counts_p, color='#ff6b6b', edgecolor='#1a1a2e', alpha=0.8)
ax2.set_xticks(range(len(labels_p)))
ax2.set_xticklabels(labels_p, fontsize=14)
ax2.set_xlabel('Phoneme')
ax2.set_ylabel('Frequency')
ax2.set_title('Top 20 Phonemes in Dothraki Lexicon (IPA)')

plt.tight_layout()
plt.show()

print(f'Unique phonemes in lexicon: {len(set(phonemes))}')
print(f'Total phoneme tokens: {len(phonemes)}')

---
## 2. Master Dialogue Scripts

**Source:** David J. Peterson's official master dialogue documents  
- [Seasons 1-2](https://dedalvs.com/work/game-of-thrones/game_of_thrones_master_dialogue_s1s2.pdf) (197 pages)  
- [Seasons 3-8](https://dedalvs.com/work/game-of-thrones/game_of_thrones_master_dialogue.pdf) (393 pages)  

**Extraction:** `scripts/parse_dialogue.py` using pymupdf  

Each entry contains the exact Dothraki text, IPA transcription, interlinear gloss,
English translation, and scene/speaker metadata. This serves as our **ground truth**
for evaluating the ASR pipeline.

In [None]:
# Load the parsed dialogue. Decode explicitly as UTF-8: the entries carry IPA
# text, and the platform's default locale encoding is not UTF-8 everywhere.
dialogue = json.loads(DIALOGUE_PATH.read_text(encoding='utf-8'))
print(f'Total dialogue entries: {len(dialogue)}')

# Split by source
by_source = Counter(e['source'] for e in dialogue)
print('\nBy source:')
for source, count in by_source.items():
    print(f'  {source}: {count} entries')

# Print one entry field by field, truncating long values to 80 characters.
# NOTE(review): index 4 looks like an arbitrary pick — confirm it is meant
# to be a representative entry.
print('\nSample dialogue entry:')
sample = dialogue[4]
for key, val in sample.items():
    print(f'  {key:12s}: {str(val)[:80]}')

In [None]:
# Character-length histograms for each dialogue line and its translation
dothraki_lengths = [len(line['dothraki']) for line in dialogue]
english_lengths = [len(line['english']) for line in dialogue]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# (axis, data, title, bar colour, mean-line colour) — the two panels swap colours
panels = [
    (ax1, dothraki_lengths, 'Dothraki Dialogue Length Distribution', '#4ecdc4', '#ff6b6b'),
    (ax2, english_lengths, 'English Translation Length Distribution', '#ff6b6b', '#4ecdc4'),
]
for panel_ax, lengths, title, fill_color, mean_color in panels:
    mean_length = np.mean(lengths)
    panel_ax.hist(lengths, bins=50, color=fill_color, edgecolor='#1a1a2e', alpha=0.8)
    panel_ax.set_xlabel('Length (characters)')
    panel_ax.set_ylabel('Count')
    panel_ax.set_title(title)
    panel_ax.axvline(mean_length, color=mean_color, linestyle='--',
                     label=f'Mean: {mean_length:.0f} chars')
    panel_ax.legend()

plt.tight_layout()
plt.show()

# Per-line Dothraki/English length ratio; an empty translation counts as
# length 1 so the division is always defined
ratios = [
    dothraki_len / max(english_len, 1)
    for dothraki_len, english_len in zip(dothraki_lengths, english_lengths)
]
mean_ratio = np.mean(ratios)
tendency = 'longer' if mean_ratio > 1 else 'shorter'
print(f'Avg Dothraki/English length ratio: {mean_ratio:.2f}')
print(f'(Dothraki tends to be {tendency} than its English translation)')

In [None]:
# Most common Dothraki words across all dialogue.
# Tokenization is deliberately simple: lowercase, drop the four sentence
# punctuation marks ! ? . , and split on whitespace.
strip_punctuation = str.maketrans('', '', '!?.,')
all_words = [
    token
    for line in dialogue
    for token in line['dothraki'].lower().translate(strip_punctuation).split()
]

word_freq = Counter(all_words).most_common(25)

fig, ax = plt.subplots(figsize=(14, 6))
words, freqs = zip(*word_freq)
tick_positions = range(len(words))
ax.bar(tick_positions, freqs, color='#4ecdc4', edgecolor='#1a1a2e', alpha=0.8)
ax.set_xticks(tick_positions)
ax.set_xticklabels(words, rotation=45, ha='right', fontsize=11)
ax.set(ylabel='Frequency', title='Top 25 Most Common Words in Dothraki Dialogue')
plt.tight_layout()
plt.show()

print(f'Total word tokens in dialogue: {len(all_words)}')
print(f'Unique words: {len(set(all_words))}')

---
## 3. Audio Data

We have two types of audio:

### Synthetic Audio (espeak-ng)
Generated from the IPA transcriptions in the master dialogue using espeak-ng.
Each clip is a single dialogue line, perfectly aligned to its ground truth.

### Real Audio (YouTube)
Two clips from Game of Thrones:
1. **Khal Drogo's gift to Rhaego** — the famous Iron Throne speech (S1E07)
2. **Dothraki short clip** — shorter dialogue excerpt

In [None]:
# Load the synthetic-audio manifest (decoded explicitly as UTF-8 rather than
# with the platform's locale encoding).
manifest = json.loads(MANIFEST_PATH.read_text(encoding='utf-8'))
print(f'Synthetic audio clips: {len(manifest)}')

# Check audio file sizes for every manifest entry whose WAV exists on disk
synthetic_dir = PROJECT_ROOT / 'data' / 'synthetic'
sizes = []
for entry in manifest:
    wav_path = synthetic_dir / entry['audio_file']
    if wav_path.exists():
        sizes.append(wav_path.stat().st_size / 1024)  # KB

if sizes:
    # Surface a partially generated dataset instead of silently reporting
    # statistics over only the files that happen to be present.
    missing_count = len(manifest) - len(sizes)
    if missing_count:
        print(f'WARNING: {missing_count} of {len(manifest)} manifest entries have no WAV file on disk')

    print(f'Audio file sizes: min={min(sizes):.0f}KB, max={max(sizes):.0f}KB, mean={np.mean(sizes):.0f}KB')
    print(f'Total synthetic audio: {sum(sizes)/1024:.1f}MB')

    fig, ax = plt.subplots(figsize=(12, 4))
    ax.hist(sizes, bins=50, color='#4ecdc4', edgecolor='#1a1a2e', alpha=0.8)
    ax.set_xlabel('File Size (KB)')
    ax.set_ylabel('Count')
    ax.set_title('Synthetic Audio File Size Distribution')
    plt.tight_layout()
    plt.show()
else:
    print('(Synthetic WAV files not found — run scripts/synthesize_audio.py to generate)')

# Real clips: list every WAV under data/raw with its size
raw_dir = PROJECT_ROOT / 'data' / 'raw'
print('\nReal audio clips:')
for wav in sorted(raw_dir.glob('*.wav')):
    size_mb = wav.stat().st_size / (1024 * 1024)
    print(f'  {wav.name}: {size_mb:.1f}MB')

---
## 4. Exploratory Statistics

Key properties of the Dothraki language relevant to ASR:

In [None]:
# Dothraki phoneme inventory, derived from the IPA field of the dialogue entries
all_ipa_dialogue = ''.join(e['ipa'] for e in dialogue)
# Stress (ˈ ˌ) and length (ː ˑ) marks are Unicode modifier *letters*, so
# str.isalpha() accepts them; exclude them explicitly or they are counted as
# phonemes and end up in the consonant list below.
# The inventory is per IPA character: a sound written with several characters
# contributes each character separately.
non_phoneme_marks = 'ˈˌːˑ'
dialogue_phonemes = [
    c for c in all_ipa_dialogue
    if (c.isalpha() or c in 'θðʃʒɾŋɣɔɛʔ') and c not in non_phoneme_marks
]
unique_phonemes = sorted(set(dialogue_phonemes))

# Anything outside this fixed vowel set is reported as a consonant
vowel_symbols = "aeiouɔɛ"

print('Dothraki Phoneme Inventory (from dialogue IPA):')
print(f'  Total unique phonemes: {len(unique_phonemes)}')
print(f'  Consonants: {[p for p in unique_phonemes if p not in vowel_symbols]}')
print(f'  Vowels: {[p for p in unique_phonemes if p in vowel_symbols]}')

# Compare with languages Whisper knows well
print('\nFor context:')
print('  English has ~44 phonemes')
print('  Spanish has ~25 phonemes')
print('  Arabic has ~28 consonants + 6 vowels')
print(f'  Dothraki has {len(unique_phonemes)} (as represented in the dialogue scripts)')

In [None]:
# Summary statistics. Relies on names produced by earlier cells:
# lexicon, dialogue, manifest, raw_dir, all_words and unique_phonemes.

# Distinct headwords, not the entry count: a word listed under several
# entries is counted once. Lowercased to match how the dialogue tokens
# behind "Unique words (dialogue)" were normalised.
unique_lexicon_words = {e['word'].lower() for e in lexicon}

print('='*60)
print('DATA COLLECTION SUMMARY')
print('='*60)
print(f'Lexicon entries:        {len(lexicon):,}')
print(f'Dialogue entries:       {len(dialogue):,}')
print(f'Synthetic audio clips:  {len(manifest):,}')
print(f'Real audio clips:       {len(list(raw_dir.glob("*.wav")))}')
print(f'Unique words (lexicon): {len(unique_lexicon_words):,}')
print(f'Unique words (dialogue):{len(set(all_words)):,}')
print(f'Total word tokens:      {len(all_words):,}')
print(f'Unique phonemes:        {len(unique_phonemes)}')
print('='*60)