# Apple TTS Training Data Explorer

Interactive exploration of Apple's text-to-speech training dataset, covering dataset statistics, phoneme analysis, text analysis, and sample browsing.

## Setup and Data Loading

In [None]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np

# Plot defaults applied to the figures drawn below
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load the training data.
# The file is read as JSON Lines: every non-blank line is parsed as its own
# JSON document. The encoding is pinned to UTF-8 so that decoding does not
# depend on the platform's default locale.
with open('/mnt/user-data/uploads/metadata_data.json', 'r', encoding='utf-8') as f:
    samples = [json.loads(line) for line in f if line.strip()]

# Create DataFrame (one row per parsed record)
df = pd.DataFrame(samples)

print(f"Loaded {len(df)} training samples")
df.head()

## Dataset Statistics

In [None]:
# Basic statistics, emitted by one print call with one entry per line
print(
    "Dataset Overview",
    "=" * 50,
    f"Total Samples: {len(df)}",
    f"Categories: {df['script_title'].nunique()}",
    f"Average Duration: {df['sentence_estimated_duration'].mean():.2f}s",
    f"Total Duration: {df['sentence_estimated_duration'].sum():.2f}s",
    # The leading newline leaves a blank line before the category list
    f"\nCategories: {', '.join(df['script_title'].unique())}",
    sep="\n",
)

In [None]:
# Category breakdown: per-category sample count plus total and mean
# duration, rounded to two decimals. Named aggregation produces the final
# column labels directly.
category_stats = (
    df.groupby('script_title')
    .agg(**{
        'Count': ('utterance_name', 'count'),
        'Total Duration': ('sentence_estimated_duration', 'sum'),
        'Avg Duration': ('sentence_estimated_duration', 'mean'),
    })
    .round(2)
)

category_stats

## Visualizations

In [None]:
# Category distribution: sample counts (left) and duration histogram (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
count_ax, duration_ax = axes

# Left panel: number of samples in each category
df['script_title'].value_counts().plot.bar(ax=count_ax, color='steelblue')
count_ax.set_title('Samples by Category', fontsize=14, fontweight='bold')
count_ax.set(xlabel='Category', ylabel='Number of Samples')
count_ax.tick_params(axis='x', rotation=45)

# Right panel: histogram of durations with a dashed line at the mean
durations = df['sentence_estimated_duration']
durations.hist(bins=30, ax=duration_ax, color='coral', edgecolor='black')
duration_ax.set_title('Duration Distribution', fontsize=14, fontweight='bold')
duration_ax.set(xlabel='Duration (seconds)', ylabel='Frequency')
duration_ax.axvline(
    durations.mean(), color='red', linestyle='--', linewidth=2, label='Mean'
)
duration_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Box plot of durations by category
plt.figure(figsize=(12, 6))
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed version.
box_ax = sns.boxplot(
    x='script_title',
    y='sentence_estimated_duration',
    data=df,
    palette='Set2',
)
box_ax.set_title('Duration Distribution by Category', fontsize=14, fontweight='bold')
box_ax.set_xlabel('Category')
box_ax.set_ylabel('Duration (seconds)')
# Tilt the category labels so longer titles do not overlap
plt.setp(box_ax.get_xticklabels(), rotation=45)
plt.tight_layout()
plt.show()

## Phoneme Analysis

In [None]:
# Extract all phonemes: split every sequence on ' # ', trim each token and
# drop the ones that end up empty.
# NOTE(review): assumes ' # ' is the delimiter between phonemes in
# `phone_sequence` — confirm against the dataset's format.
all_phonemes = [
    token
    for seq in df['phone_sequence']
    for token in map(str.strip, seq.split(' # '))
    if token
]

# Count phoneme frequency. The top-20 ranking is computed once; the dict
# keeps the most-common-first order and is reused for the printout below.
phoneme_counts = Counter(all_phonemes)
top_phonemes = dict(phoneme_counts.most_common(20))

print(f"Total phoneme instances: {len(all_phonemes)}")
print(f"Unique phonemes: {len(phoneme_counts)}")
print("\nTop 20 most common phonemes:")
for phoneme, count in top_phonemes.items():
    print(f"  {phoneme}: {count}")

In [None]:
# Visualize top phonemes as a bar chart, most frequent first
plt.figure(figsize=(14, 6))
# Transpose the (phoneme, count) pairs into two parallel tuples
phonemes, counts = zip(*phoneme_counts.most_common(20))
bar_positions = range(len(phonemes))
plt.bar(bar_positions, counts, color='teal', edgecolor='black')
plt.title('Top 20 Most Common Phonemes', fontsize=14, fontweight='bold')
plt.xlabel('Phoneme')
plt.ylabel('Frequency')
# Put the phoneme itself under each bar instead of its numeric position
plt.xticks(bar_positions, phonemes, rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Phoneme count per utterance.
# Only non-empty tokens are counted, so an empty sequence yields 0 and a
# stray leading/trailing delimiter does not inflate the count. This matches
# how empty tokens are dropped when the overall phoneme tally is built.
df['phoneme_count'] = df['phone_sequence'].apply(
    lambda seq: sum(1 for token in seq.split(' # ') if token.strip())
)

print(f"Average phonemes per utterance: {df['phoneme_count'].mean():.1f}")
print(f"Min phonemes: {df['phoneme_count'].min()}")
print(f"Max phonemes: {df['phoneme_count'].max()}")

# Histogram of per-utterance counts with a dashed line at the mean
plt.figure(figsize=(12, 5))
df['phoneme_count'].hist(bins=30, color='purple', edgecolor='black', alpha=0.7)
plt.xlabel('Number of Phonemes')
plt.ylabel('Frequency')
plt.title('Phoneme Count Distribution', fontsize=14, fontweight='bold')
plt.axvline(df['phoneme_count'].mean(), color='red', linestyle='--',
            linewidth=2, label='Mean')
plt.legend()
plt.tight_layout()
plt.show()

## Text Analysis

In [None]:
# Word count analysis: number of whitespace-separated tokens in each sample
df['word_count'] = [len(text.split()) for text in df['words']]

word_counts = df['word_count']
print(f"Average words per sample: {word_counts.mean():.1f}")
print(f"Min words: {word_counts.min()}")
print(f"Max words: {word_counts.max()}")

# Correlation between word count and duration
correlation = word_counts.corr(df['sentence_estimated_duration'])
print(f"\nCorrelation (word count vs duration): {correlation:.3f}")

In [None]:
# Scatter plot: word count vs duration, coloured by category
title_categories = df['script_title'].astype('category')
category_names = title_categories.cat.categories

plt.figure(figsize=(12, 6))
points = plt.scatter(df['word_count'], df['sentence_estimated_duration'],
                     alpha=0.6, c=title_categories.cat.codes,
                     cmap='viridis', s=50)
plt.xlabel('Word Count')
plt.ylabel('Duration (seconds)')
plt.title('Word Count vs Duration', fontsize=14, fontweight='bold')

# One colorbar tick per category code, labelled with the category name so
# the colours can be read without knowing the integer codes.
colorbar = plt.colorbar(points, label='Category',
                        ticks=range(len(category_names)))
colorbar.set_ticklabels([str(name) for name in category_names])

# Add trend line: least-squares straight-line fit of duration on word count
z = np.polyfit(df['word_count'], df['sentence_estimated_duration'], 1)
p = np.poly1d(z)
plt.plot(df['word_count'], p(df['word_count']), "r--", alpha=0.8, linewidth=2)

plt.tight_layout()
plt.show()

## Sample Browser

In [None]:
# Function to search samples
def search_samples(query):
    """Return the samples whose text contains ``query``, ignoring case.

    The query is matched as a literal substring rather than a regular
    expression, so characters such as ``(`` or ``.`` are searched for as
    written. Rows whose ``words`` value is missing never match.

    Args:
        query: Text to look for in the ``words`` column of ``df``.

    Returns:
        DataFrame holding the ``utterance_name``, ``script_title``,
        ``words`` and ``sentence_estimated_duration`` columns of the
        matching rows.
    """
    query = query.lower()
    # regex=False: literal match; na=False: missing text counts as "no match"
    # instead of putting NaN into the boolean mask.
    matches = df['words'].str.lower().str.contains(query, regex=False, na=False)
    return df.loc[
        matches,
        ['utterance_name', 'script_title', 'words', 'sentence_estimated_duration'],
    ]

# Example search: list the samples whose text contains 'the'.
# The match is on substrings, so words such as 'other' count as hits too.
search_samples('the')

In [None]:
# Display random samples from each category (up to three per category)
for category in df['script_title'].unique():
    print(f"\n{'='*60}")
    print(f"Category: {category.upper()}")
    print('='*60)
    # Filter once, and keep the result under its own name so the raw
    # `samples` list built while loading the data is not overwritten.
    category_rows = df[df['script_title'] == category]
    picked = category_rows.sample(min(3, len(category_rows)))
    for _, sample in picked.iterrows():
        print(f"\nID: {sample['utterance_name']}")
        print(f"Text: {sample['words']}")
        print(f"Duration: {sample['sentence_estimated_duration']}s")
        # Add the ellipsis only when the sequence was actually cut short
        phones = sample['phone_sequence']
        preview = phones if len(phones) <= 80 else f"{phones[:80]}..."
        print(f"Phonemes: {preview}")

## Summary Statistics

In [None]:
# Create comprehensive statistics report: scalar metrics plus a
# per-category sample count
stats_report = dict(
    total_samples=len(df),
    categories=df['script_title'].nunique(),
    total_duration=df['sentence_estimated_duration'].sum(),
    avg_duration=df['sentence_estimated_duration'].mean(),
    avg_words=df['word_count'].mean(),
    avg_phonemes=df['phoneme_count'].mean(),
    unique_phonemes=len(phoneme_counts),
    category_breakdown=df.groupby('script_title').size().to_dict(),
)

print("Dataset Statistics Summary")
print("="*50)
for key, value in stats_report.items():
    # The nested breakdown is printed in its own section below
    if key == 'category_breakdown':
        continue
    label = key.replace('_', ' ').title()
    # Plain ints are shown as-is; everything else gets two decimals
    shown = value if isinstance(value, int) else f'{value:.2f}'
    print(f"{label}: {shown}")

print("\nCategory Breakdown:")
for cat, count in stats_report['category_breakdown'].items():
    print(f"  {cat}: {count}")

## Conclusion

This notebook provides a comprehensive analysis of Apple's TTS training data, including:
- Dataset statistics and distributions
- Phoneme frequency analysis
- Text-to-duration correlations
- Category comparisons
- Sample browsing and search capabilities