# 02 — Audio Preprocessing

This notebook walks through the audio preprocessing pipeline:
vocal isolation with Demucs, normalization, and synthetic audio generation.

## Contents
1. [Synthetic Audio Overview](#1-synthetic-audio) — espeak-ng generated clips
2. [Audio Quality Analysis](#2-audio-quality-analysis) — Waveforms, spectrograms
3. [Real Audio Processing](#3-real-audio-processing) — Demucs vocal isolation before/after
4. [Comparison](#4-comparison-synthetic-vs-real) — Synthetic vs real audio characteristics

In [None]:
import json
import sys
from pathlib import Path

import numpy as np
import matplotlib.pyplot as plt

# Project layout: every data directory is resolved relative to the parent of
# the current working directory.
# NOTE(review): this assumes the kernel is started from a direct child of the
# project root (e.g. a notebooks/ folder) — confirm how the notebook is launched.
PROJECT_ROOT = Path.cwd().parent
# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

SYNTHETIC_DIR = PROJECT_ROOT / 'data' / 'synthetic'
RAW_DIR = PROJECT_ROOT / 'data' / 'raw'
PROCESSED_DIR = PROJECT_ROOT / 'data' / 'processed'

# Global matplotlib look-and-feel used by every figure in this notebook.
plt.style.use('dark_background')
plt.rcParams['figure.figsize'] = (14, 5)
plt.rcParams['font.size'] = 12

# The manifest carries non-ASCII text (accented Dothraki, IPA symbols), so the
# encoding is pinned to UTF-8 instead of relying on the platform default.
manifest = json.loads((SYNTHETIC_DIR / 'manifest.json').read_text(encoding='utf-8'))
print(f'Loaded manifest: {len(manifest)} synthetic clips')

---
## 1. Synthetic Audio

All 1,712 dialogue entries were synthesized from IPA transcriptions using espeak-ng.
Each clip is a single Dothraki line, rendered as a 16 kHz mono WAV file.

In [None]:
# Catalog synthetic audio files and their properties
import soundfile as sf

# Parallel lists, one element per manifest entry whose WAV exists on disk.
durations = []      # clip length in seconds, as reported by sf.info
file_sizes = []     # on-disk size in KB
sample_rates = []   # sample rate in Hz, as reported by sf.info

for entry in manifest:
    wav_path = SYNTHETIC_DIR / entry['audio_file']
    if not wav_path.exists():
        # Entries without a rendered file are skipped and counted below.
        continue
    info = sf.info(str(wav_path))
    durations.append(info.duration)
    file_sizes.append(wav_path.stat().st_size / 1024)
    sample_rates.append(info.samplerate)

print(f'Clips analyzed: {len(durations)}')

# Surface silently skipped entries so a partial dataset is visible.
skipped = len(manifest) - len(durations)
if skipped:
    print(f'Clips missing on disk: {skipped}')

if durations:
    print(f'Sample rate: {set(sample_rates)}')
    print(f'Duration: min={min(durations):.2f}s, max={max(durations):.2f}s, mean={np.mean(durations):.2f}s')
    print(f'Total audio: {sum(durations)/60:.1f} minutes')
    print(f'Total size: {sum(file_sizes)/1024:.1f} MB')
else:
    # min()/max() raise ValueError on an empty list, so report instead of crashing.
    print(f'No synthetic WAV files found under {SYNTHETIC_DIR}.')

In [None]:
# Side-by-side histograms of clip duration (left) and file size (right),
# each with a dashed vertical line marking the mean.
fig, (duration_ax, size_ax) = plt.subplots(1, 2, figsize=(14, 5))

# (axes, values, bar colour, mean-line colour, x label, title, mean legend label)
panels = [
    (duration_ax, durations, '#4ecdc4', '#ff6b6b',
     'Duration (seconds)', 'Synthetic Clip Duration Distribution',
     f'Mean: {np.mean(durations):.2f}s'),
    (size_ax, file_sizes, '#ff6b6b', '#4ecdc4',
     'File Size (KB)', 'Synthetic Clip File Size Distribution',
     f'Mean: {np.mean(file_sizes):.0f}KB'),
]

for panel, values, bar_color, mean_color, x_label, title, mean_label in panels:
    panel.hist(values, bins=50, color=bar_color, edgecolor='#1a1a2e', alpha=0.8)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Count')
    panel.set_title(title)
    panel.axvline(np.mean(values), color=mean_color, linestyle='--',
                  label=mean_label)
    panel.legend()

plt.tight_layout()
plt.show()

In [None]:
# Visualize a sample synthetic waveform and spectrogram
SAMPLE_INDEX = 4  # "Khal vezhvén! M'athchomaroón!"

# Resolve the sample defensively: the manifest may be shorter than expected
# and the clip may not have been rendered yet. Other cells in this notebook
# print a message in that situation instead of raising, so do the same here.
sample_entry = manifest[SAMPLE_INDEX] if len(manifest) > SAMPLE_INDEX else None
sample_path = (SYNTHETIC_DIR / sample_entry['audio_file']) if sample_entry else None

if sample_path is not None and sample_path.exists():
    data, sr = sf.read(str(sample_path))
    # Per-sample timestamps in seconds for the x axis.
    time_axis = np.arange(len(data)) / sr

    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(14, 8))

    # Waveform
    ax1.plot(time_axis, data, color='#4ecdc4', alpha=0.8, linewidth=0.5)
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Amplitude')
    ax1.set_title(f'Waveform: "{sample_entry["dothraki"]}"')
    ax1.set_xlim(0, time_axis[-1])

    # Spectrogram
    ax2.specgram(data, NFFT=512, Fs=sr, noverlap=256, cmap='magma')
    ax2.set_xlabel('Time (s)')
    ax2.set_ylabel('Frequency (Hz)')
    ax2.set_title('Spectrogram')
    ax2.set_ylim(0, 8000)

    plt.tight_layout()
    plt.show()

    print(f'Ground truth: {sample_entry["dothraki"]}')
    print(f'IPA: {sample_entry["ipa"]}')
    print(f'English: {sample_entry["english"]}')
    print(f'Duration: {len(data)/sr:.2f}s, samples: {len(data)}')
else:
    print(f'Sample clip (manifest index {SAMPLE_INDEX}) not available; skipping waveform/spectrogram.')

---
## 2. Audio Quality Analysis

Comparing spectral characteristics across multiple clips to understand
the consistency and quality of synthesized Dothraki speech.

In [None]:
# Compare waveforms of 6 different clips side by side
fig, axes = plt.subplots(3, 2, figsize=(16, 10))
sample_indices = [0, 4, 10, 25, 50, 100]

for panel, clip_idx in zip(axes.flat, sample_indices):
    # Indices past the end of the manifest leave their panel empty.
    if clip_idx >= len(manifest):
        continue

    clip = manifest[clip_idx]
    clip_path = SYNTHETIC_DIR / clip['audio_file']
    # Clips that were never rendered also leave their panel empty.
    if not clip_path.exists():
        continue

    samples, rate = sf.read(str(clip_path))
    seconds = np.arange(len(samples)) / rate
    panel.plot(seconds, samples, color='#4ecdc4', alpha=0.8, linewidth=0.5)

    # Titles longer than 40 characters are truncated with a trailing ellipsis.
    text = clip['dothraki']
    shown = f'{text[:40]}...' if len(text) > 40 else text
    panel.set_title(f'd{clip_idx:04d}: "{shown}"', fontsize=10)

    panel.set_xlim(0, seconds[-1])
    panel.set_ylim(-1, 1)

# Only the bottom row gets x labels and only the left column gets y labels.
for bottom_ax in axes[-1]:
    bottom_ax.set_xlabel('Time (s)')
for left_ax in axes[:, 0]:
    left_ax.set_ylabel('Amplitude')

fig.suptitle('Sample Synthetic Waveforms', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Duration vs ground truth text length (characters)
# Keep only manifest entries whose WAV file is present on disk.
candidates = [(item, SYNTHETIC_DIR / item['audio_file']) for item in manifest]
rendered = [(item, path) for item, path in candidates if path.exists()]

text_lengths = np.array([len(item['dothraki']) for item, _ in rendered])
audio_seconds = np.array([sf.info(str(path)).duration for _, path in rendered])

fig, ax = plt.subplots(figsize=(10, 7))
ax.scatter(text_lengths, audio_seconds, alpha=0.3, c='#4ecdc4', s=20, edgecolors='none')
ax.set_xlabel('Ground Truth Text Length (characters)')
ax.set_ylabel('Audio Duration (seconds)')
ax.set_title('Text Length vs Synthesized Audio Duration')

# Fit line: first-degree least-squares polynomial, labelled with Pearson r.
trend = np.poly1d(np.polyfit(text_lengths, audio_seconds, 1))
pearson_r = np.corrcoef(text_lengths, audio_seconds)[0, 1]
fit_x = np.linspace(text_lengths.min(), text_lengths.max(), 100)
ax.plot(fit_x, trend(fit_x), 'r--', alpha=0.7, label=f'Linear fit (r={pearson_r:.3f})')
ax.legend()

plt.tight_layout()
plt.show()

# Mean of the per-clip characters-per-second ratios.
chars_per_second = text_lengths / audio_seconds
print(f'Speaking rate: ~{np.mean(chars_per_second):.1f} chars/sec')

---
## 3. Real Audio Processing

Real Dothraki clips from Game of Thrones contain music, sound effects,
and crowd noise. We use Demucs (htdemucs) to isolate vocal tracks.

**Pipeline:** Raw clip → Demucs vocal separation → 16kHz mono WAV

In [None]:
# List available real and processed audio
# Each (heading, directory) pair is printed the same way: the heading, then one
# line per *.wav file (sorted by path) with duration, sample rate and channels.
listings = (
    ('Raw audio clips:', RAW_DIR),
    ('\nProcessed (Demucs isolated) clips:', PROCESSED_DIR),
)

for heading, folder in listings:
    print(heading)
    for wav_file in sorted(folder.glob('*.wav')):
        meta = sf.info(str(wav_file))
        print(f'  {wav_file.name}: {meta.duration:.1f}s, {meta.samplerate}Hz, {meta.channels}ch')

In [None]:
# Compare raw vs isolated vocals for the Drogo speech
raw_path = RAW_DIR / 'drogo_rhaego_speech.wav'
clean_path = PROCESSED_DIR / 'drogo_rhaego_speech_vocals.wav'

if raw_path.exists() and clean_path.exists():
    raw_data, raw_sr = sf.read(str(raw_path))
    clean_data, clean_sr = sf.read(str(clean_path))

    # Take just the first 30 seconds for visualization. Slicing past the end
    # of an array is safe, so no explicit length check is needed.
    raw_30s = raw_data[:raw_sr * 30]
    clean_30s = clean_data[:clean_sr * 30]

    # Downmix BOTH signals to mono when they are multi-channel. specgram needs
    # 1-D input, and plotting a 2-D array would draw one trace per channel.
    raw_mono = raw_30s.mean(axis=1) if raw_30s.ndim > 1 else raw_30s
    clean_mono = clean_30s.mean(axis=1) if clean_30s.ndim > 1 else clean_30s

    fig, axes = plt.subplots(2, 2, figsize=(16, 10))

    # Raw waveform
    raw_t = np.arange(len(raw_mono)) / raw_sr
    axes[0, 0].plot(raw_t, raw_mono, color='#ff6b6b', alpha=0.7, linewidth=0.3)
    axes[0, 0].set_title('Raw Audio — Waveform (first 30s)')
    axes[0, 0].set_ylabel('Amplitude')

    # Raw spectrogram
    axes[0, 1].specgram(raw_mono, NFFT=1024, Fs=raw_sr, noverlap=512, cmap='magma')
    axes[0, 1].set_title('Raw Audio — Spectrogram')
    axes[0, 1].set_ylabel('Frequency (Hz)')
    axes[0, 1].set_ylim(0, 8000)

    # Clean waveform
    clean_t = np.arange(len(clean_mono)) / clean_sr
    axes[1, 0].plot(clean_t, clean_mono, color='#4ecdc4', alpha=0.7, linewidth=0.3)
    axes[1, 0].set_title('Isolated Vocals — Waveform (first 30s)')
    axes[1, 0].set_ylabel('Amplitude')
    axes[1, 0].set_xlabel('Time (s)')

    # Clean spectrogram
    axes[1, 1].specgram(clean_mono, NFFT=1024, Fs=clean_sr, noverlap=512, cmap='magma')
    axes[1, 1].set_title('Isolated Vocals — Spectrogram')
    axes[1, 1].set_ylabel('Frequency (Hz)')
    axes[1, 1].set_xlabel('Time (s)')
    axes[1, 1].set_ylim(0, 8000)

    fig.suptitle('Demucs Vocal Isolation: Before vs After', fontsize=15, y=1.02)
    plt.tight_layout()
    plt.show()

    print(f'Raw: {raw_data.shape}, {raw_sr}Hz')
    print(f'Clean: {clean_data.shape}, {clean_sr}Hz')
    # RMS is computed over the same mono 30-second windows that were plotted.
    print(f'RMS reduction: {np.sqrt(np.mean(raw_mono**2)):.4f} → {np.sqrt(np.mean(clean_mono**2)):.4f}')
else:
    print('Raw or processed files not found. Run the pipeline on real clips first.')

---
## 4. Comparison: Synthetic vs Real

Comparing the spectral characteristics of synthesized Dothraki (espeak-ng)
versus real actor performances from Game of Thrones.

In [None]:
# Side-by-side spectrograms: synthetic vs real (isolated vocals)
SYNTH_INDEX = 4

# Resolve the synthetic sample defensively so a short manifest falls through
# to the "need both files" message below instead of raising IndexError.
synth_entry = manifest[SYNTH_INDEX] if len(manifest) > SYNTH_INDEX else None
synth_path = (SYNTHETIC_DIR / synth_entry['audio_file']) if synth_entry else None

# Find a processed real clip (first match in sorted order)
real_vocals = sorted(PROCESSED_DIR.glob('*_vocals.wav'))
real_path = real_vocals[0] if real_vocals else None

have_synth = synth_path is not None and synth_path.exists()
have_real = real_path is not None and real_path.exists()

if have_synth and have_real:
    synth_data, synth_sr = sf.read(str(synth_path))
    real_data, real_sr = sf.read(str(real_path))

    # specgram needs 1-D input; downmix any multi-channel audio to mono.
    if synth_data.ndim > 1:
        synth_data = synth_data.mean(axis=1)
    if real_data.ndim > 1:
        real_data = real_data.mean(axis=1)

    # Take first 5 seconds of real audio
    real_5s = real_data[:real_sr * 5]

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))

    ax1.specgram(synth_data, NFFT=512, Fs=synth_sr, noverlap=256, cmap='magma')
    ax1.set_title(f'Synthetic (espeak-ng)\n"{synth_entry["dothraki"][:50]}"')
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('Frequency (Hz)')
    ax1.set_ylim(0, 8000)

    ax2.specgram(real_5s, NFFT=512, Fs=real_sr, noverlap=256, cmap='magma')
    ax2.set_title(f'Real Audio (GoT, isolated vocals)\n"{real_path.stem}"')
    ax2.set_xlabel('Time (s)')
    ax2.set_ylabel('Frequency (Hz)')
    ax2.set_ylim(0, 8000)

    fig.suptitle('Synthetic vs Real Dothraki Audio', fontsize=15, y=1.02)
    plt.tight_layout()
    plt.show()

    print('Key differences:')
    print('  Synthetic: clean, uniform energy, no background noise')
    print('  Real: dynamic range, emotional prosody, residual reverb/noise')
    print('  Synthetic lacks the natural pitch variation of actor performance')
else:
    print('Need both synthetic and real audio files for comparison.')

In [None]:
# Summary statistics table
# Gather the figures first, then emit the report one line at a time between
# two 60-character rules.
rule = '=' * 60
raw_clip_count = sum(1 for _ in RAW_DIR.glob("*.wav"))
processed_clip_count = sum(1 for _ in PROCESSED_DIR.glob("*.wav"))
total_minutes = sum(durations) / 60
mean_seconds = np.mean(durations)

report_lines = [
    rule,
    'AUDIO PREPROCESSING SUMMARY',
    rule,
    f'Synthetic clips:      {len(durations)}',
    f'Total synth duration: {total_minutes:.1f} minutes',
    f'Avg clip duration:    {mean_seconds:.2f}s',
    'Sample rate:          16kHz (mono)',
    f'Real clips (raw):     {raw_clip_count}',
    f'Real clips (processed): {processed_clip_count}',
    'Separation model:     Demucs htdemucs',
    'Synthesis engine:     espeak-ng (IPA mode)',
    rule,
]

for report_line in report_lines:
    print(report_line)