# Vocabulary & Beta Experiments Analysis

This notebook analyzes results across multiple vocabulary sizes and beta values:
- Beta values: [0.6, 0.8, 1.2, 1.8, 2.4, 3.0]
- Vocabulary sizes: [64, 128, 256]
- 6 repetitions per configuration

**Analysis:**
1. Loss curves for each vocab size
2. Comparison across beta values
3. Comparison across vocabulary sizes
4. Check convergence to the theoretical minimum (conditional entropy), using heatmaps and a summary table

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Notebook-wide matplotlib defaults: theme first, then size/font overrides.
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (14, 8),  # default figure size (inches)
    'font.size': 12,
})

## Configuration

In [None]:
# Sweep definition: the beta values and vocabulary sizes that were run.
beta_values = [0.6, 0.8, 1.2, 1.8, 2.4, 3.0]
vocab_sizes = [64, 128, 256]
# Repetitions per configuration, and the layer count used in result file names.
num_repetitions, num_layers = 6, 1

# Directory holding the per-run result files.
results_dir = Path('results') / 'vocab_beta_experiments'

# Echo the configuration so the notebook output records what was analysed.
for label, value in (
    ("Results directory", results_dir),
    ("Beta values", beta_values),
    ("Vocabulary sizes", vocab_sizes),
    ("Repetitions", num_repetitions),
):
    print(f"{label}: {value}")

## Load Results

In [None]:
def load_experiment_results(vocab_size, beta, rep):
    """Load the saved result object for one (vocab size, beta, repetition) run.

    The file is looked up in the module-level ``results_dir`` under the name
    ``rlm_v{vocab_size}_L{num_layers}_beta{beta}_rep{rep}.pt``.

    Args:
        vocab_size: Vocabulary size used in the file name.
        beta: Beta value used in the file name.
        rep: Repetition index used in the file name.

    Returns:
        The object stored in the file, or ``None`` (after printing a warning)
        when the file does not exist.
    """
    filename = results_dir / f'rlm_v{vocab_size}_L{num_layers}_beta{beta}_rep{rep}.pt'

    if not filename.exists():
        print(f"Warning: File not found: {filename}")
        return None

    # NOTE(security): weights_only=False unpickles arbitrary Python objects,
    # so only load result files produced by trusted runs.
    # map_location='cpu' lets tensors saved from a GPU run be loaded on a
    # machine that does not have that device.
    return torch.load(filename, map_location='cpu', weights_only=False)

# Collect every available run as all_results[vocab_size][beta] -> list of results.
all_results = {}

for vocab_size in vocab_sizes:
    per_beta = all_results.setdefault(vocab_size, {})

    for beta in beta_values:
        loaded = []
        for rep in range(1, num_repetitions + 1):
            result = load_experiment_results(vocab_size, beta, rep)
            if result is None:
                continue  # nothing was loaded for this repetition
            loaded.append(result)
        per_beta[beta] = loaded

        print(f"V={vocab_size}, β={beta}: Loaded {len(loaded)} results")

print("\nAll results loaded!")

## Extract Theoretical Minimums and Dynamics

In [None]:
def extract_dynamics(result_data):
    """Extract the recorded training curve from one loaded result.

    Args:
        result_data: Container loaded from a result file. The curve is read
            from ``result_data['output']['dynamics']``, a sequence of records
            each providing ``'t'``, ``'testloss'`` and ``'testacc'``.

    Returns:
        A dict with NumPy arrays under ``'steps'``, ``'test_loss'`` and
        ``'test_acc'`` (one entry per record), or ``None`` when the result has
        no ``'output'`` entry, no ``'dynamics'`` entry, or an empty one.
    """
    if 'output' not in result_data:
        return None

    output = result_data['output']
    if 'dynamics' not in output:
        return None

    dynamics = output['dynamics']
    # An empty curve would produce zero-length arrays, which break callers
    # that read the final value with [-1]; report it as "no dynamics".
    if len(dynamics) == 0:
        return None

    return {
        'steps': np.array([record['t'] for record in dynamics]),
        'test_loss': np.array([record['testloss'] for record in dynamics]),
        'test_acc': np.array([record['testacc'] for record in dynamics]),
    }

# Per-configuration containers, indexed as container[vocab_size][beta] -> list.
theoretical_minimums = {v: {b: [] for b in beta_values} for v in vocab_sizes}
marginal_entropies = {v: {b: [] for b in beta_values} for v in vocab_sizes}
all_dynamics = {v: {b: [] for b in beta_values} for v in vocab_sizes}

for vocab_size in vocab_sizes:
    for beta in beta_values:
        entropy_bucket = theoretical_minimums[vocab_size][beta]
        marginal_bucket = marginal_entropies[vocab_size][beta]
        dynamics_bucket = all_dynamics[vocab_size][beta]

        for result in all_results[vocab_size][beta]:
            # Skip anything that carries no 'output' payload.
            if result is None or 'output' not in result:
                continue

            # Pre-computed values stored alongside the run.
            output = result['output']
            entropy, marginal = output['entropy'], output['marginal']
            entropy_bucket.append(entropy)
            marginal_bucket.append(marginal)

            # Training curve for the same run, when one can be extracted.
            dynamics = extract_dynamics(result)
            if dynamics is not None:
                dynamics_bucket.append(dynamics)

print("Theoretical minimums and dynamics extracted!")
print("\nSummary:")
for vocab_size in vocab_sizes:
    print(f"\nVocabulary size: {vocab_size}")
    for beta in beta_values:
        entropies = theoretical_minimums[vocab_size][beta]
        if entropies:
            mean_ent = np.mean(entropies)
            print(f"  β={beta}: Conditional entropy = {mean_ent:.4f}")

## Plot 1: Loss Curves for Each Vocabulary Size

In [None]:
# One figure per vocabulary size with one panel per beta: mean test loss over
# repetitions, a ±1 std band, and the mean theoretical minimum as a reference.
# The grid is derived from the number of betas so that adding values to
# beta_values cannot index past the available axes.
n_cols = 3
n_rows = max(1, (len(beta_values) + n_cols - 1) // n_cols)  # ceil division

for vocab_size in vocab_sizes:
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(6 * n_cols, 5 * n_rows), squeeze=False)
    axes = axes.flatten()

    colors = plt.cm.tab10(np.linspace(0, 1, len(beta_values)))

    # Hide panels left over when the beta count is not a multiple of n_cols.
    for spare_ax in axes[len(beta_values):]:
        spare_ax.set_visible(False)

    for idx, beta in enumerate(beta_values):
        ax = axes[idx]
        runs = all_dynamics[vocab_size][beta]

        if len(runs) == 0:
            ax.text(0.5, 0.5, f'No data for β={beta}',
                    ha='center', va='center', transform=ax.transAxes)
            continue

        # The first run defines the step axis; runs with a different number
        # of recorded steps cannot be averaged with it and are left out.
        steps = runs[0]['steps']
        all_losses = [run['test_loss'] for run in runs
                      if len(run['steps']) == len(steps)]

        if len(all_losses) == 0:
            continue

        all_losses = np.array(all_losses)
        mean_loss = np.mean(all_losses, axis=0)
        std_loss = np.std(all_losses, axis=0)

        # Mean curve with a shaded ±1 std band.
        ax.plot(steps, mean_loss, linewidth=2, label='Mean test loss', color=colors[idx])
        ax.fill_between(steps, mean_loss - std_loss, mean_loss + std_loss,
                        alpha=0.3, color=colors[idx], label='± 1 std')

        # Horizontal reference line at the mean theoretical minimum.
        if len(theoretical_minimums[vocab_size][beta]) > 0:
            mean_theoretical = np.mean(theoretical_minimums[vocab_size][beta])
            ax.axhline(y=mean_theoretical, color='red', linestyle='--',
                       linewidth=2, label=f'Theoretical: {mean_theoretical:.3f}')

        ax.set_xlabel('Training Steps', fontsize=11)
        ax.set_ylabel('Test Loss', fontsize=11)
        ax.set_title(f'β = {beta}', fontsize=12, fontweight='bold')
        ax.legend(fontsize=9)
        ax.grid(True, alpha=0.3)
        ax.set_yscale('log')

    fig.suptitle(f'Vocabulary Size = {vocab_size}', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

## Plot 2: Compare All Betas for Each Vocabulary Size

In [None]:
# One figure per vocabulary size, overlaying the mean test-loss curve
# (with a ±1 std band) of every beta on a shared log-scaled axis.
for vocab_size in vocab_sizes:
    fig, ax = plt.subplots(figsize=(14, 8))

    palette = plt.cm.viridis(np.linspace(0, 1, len(beta_values)))

    for beta, curve_color in zip(beta_values, palette):
        runs = all_dynamics[vocab_size][beta]
        if not runs:
            continue

        # The first run fixes the step axis; only runs with the same number
        # of recorded steps are averaged.
        steps = runs[0]['steps']
        n_steps = len(steps)
        matching = [run['test_loss'] for run in runs if len(run['steps']) == n_steps]
        if not matching:
            continue

        stacked = np.array(matching)
        mean_loss = np.mean(stacked, axis=0)
        std_loss = np.std(stacked, axis=0)

        ax.plot(steps, mean_loss, linewidth=2.5, label=f'β={beta}', color=curve_color)
        ax.fill_between(steps, mean_loss - std_loss, mean_loss + std_loss,
                        alpha=0.2, color=curve_color)

    ax.set_xlabel('Training Steps', fontsize=14)
    ax.set_ylabel('Test Loss (Cross-Entropy)', fontsize=14)
    ax.set_title(f'Vocabulary Size = {vocab_size}: Comparison Across Beta Values',
                 fontsize=16, fontweight='bold')
    ax.legend(fontsize=12, loc='upper right')
    ax.grid(True, alpha=0.3)
    ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

## Plot 3: Compare Vocabulary Sizes for Each Beta

In [None]:
# One figure per beta, overlaying the mean test-loss curve (with a ±1 std
# band) of every vocabulary size on a shared log-scaled axis.
for beta in beta_values:
    fig, ax = plt.subplots(figsize=(14, 8))

    palette = plt.cm.plasma(np.linspace(0, 1, len(vocab_sizes)))

    for vocab_size, curve_color in zip(vocab_sizes, palette):
        runs = all_dynamics[vocab_size][beta]
        if not runs:
            continue

        # The first run fixes the step axis; only runs with the same number
        # of recorded steps are averaged.
        steps = runs[0]['steps']
        n_steps = len(steps)
        matching = [run['test_loss'] for run in runs if len(run['steps']) == n_steps]
        if not matching:
            continue

        stacked = np.array(matching)
        mean_loss = np.mean(stacked, axis=0)
        std_loss = np.std(stacked, axis=0)

        ax.plot(steps, mean_loss, linewidth=2.5, label=f'V={vocab_size}', color=curve_color)
        ax.fill_between(steps, mean_loss - std_loss, mean_loss + std_loss,
                        alpha=0.2, color=curve_color)

    ax.set_xlabel('Training Steps', fontsize=14)
    ax.set_ylabel('Test Loss (Cross-Entropy)', fontsize=14)
    ax.set_title(f'Beta = {beta}: Comparison Across Vocabulary Sizes',
                 fontsize=16, fontweight='bold')
    ax.legend(fontsize=12, loc='upper right')
    ax.grid(True, alpha=0.3)
    ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

## Plot 4: Final Loss vs Theoretical Minimum (Heatmap)

In [None]:
# Build (vocab size x beta) matrices of mean final test loss, mean theoretical
# minimum, and their difference, then show them as three heatmaps.
# Matrices start as NaN so a cell with no data is shown as missing, never as
# a misleading 0.
matrix_shape = (len(vocab_sizes), len(beta_values))
final_loss_matrix = np.full(matrix_shape, np.nan)
theoretical_matrix = np.full(matrix_shape, np.nan)
gap_matrix = np.full(matrix_shape, np.nan)

for i, vocab_size in enumerate(vocab_sizes):
    for j, beta in enumerate(beta_values):
        runs = all_dynamics[vocab_size][beta]
        if len(runs) == 0:
            continue  # cell stays NaN in all three matrices

        # Mean of the last recorded test loss of each run.
        final_losses = [run['test_loss'][-1] for run in runs]
        final_loss_matrix[i, j] = np.mean(final_losses)

        # Theoretical minimum and gap only where a reference value exists.
        reference_values = theoretical_minimums[vocab_size][beta]
        if len(reference_values) > 0:
            theoretical_matrix[i, j] = np.mean(reference_values)
            gap_matrix[i, j] = final_loss_matrix[i, j] - theoretical_matrix[i, j]

# (matrix, colormap, title) for each of the three panels, left to right.
heatmap_panels = [
    (final_loss_matrix, 'YlOrRd', 'Final Test Loss'),
    (theoretical_matrix, 'YlOrRd', 'Theoretical Minimum'),
    (gap_matrix, 'RdYlGn_r', 'Optimization Gap'),
]

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, (matrix, cmap_name, title) in zip(axes, heatmap_panels):
    image = ax.imshow(matrix, cmap=cmap_name, aspect='auto')
    ax.set_xticks(range(len(beta_values)))
    ax.set_xticklabels([f'{b}' for b in beta_values])
    ax.set_yticks(range(len(vocab_sizes)))
    ax.set_yticklabels([f'{v}' for v in vocab_sizes])
    ax.set_xlabel('Beta', fontsize=12)
    ax.set_ylabel('Vocabulary Size', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    plt.colorbar(image, ax=ax)

plt.tight_layout()
plt.show()

## Summary Statistics Table

In [None]:
# Print one table per vocabulary size: for each beta, the theoretical minimum,
# final test loss, their gap (absolute and relative), and final test accuracy,
# each as mean±std over the available runs. Every cell is rendered to a string
# first and padded to the header's column width so rows line up with headers.
print("\n" + "="*100)
print("SUMMARY: Vocabulary & Beta Experiments")
print("="*100)

for vocab_size in vocab_sizes:
    print(f"\n{'='*100}")
    print(f"Vocabulary Size: {vocab_size}")
    print(f"{'='*100}")
    print(f"{'Beta':<8} {'Theoretical':<15} {'Final Loss':<15} {'Gap':<12} {'% Above':<10} {'Final Acc':<12}")
    print("-" * 100)

    for beta in beta_values:
        runs = all_dynamics[vocab_size][beta]
        if len(runs) == 0:
            continue

        # Theoretical minimum; NaN when no reference value was collected, to
        # avoid numpy's empty-mean warning.
        reference_values = theoretical_minimums[vocab_size][beta]
        if len(reference_values) > 0:
            theo_mean = np.mean(reference_values)
            theo_std = np.std(reference_values)
        else:
            theo_mean = theo_std = np.nan

        # Last recorded test loss of each run.
        final_losses = [run['test_loss'][-1] for run in runs]
        final_mean = np.mean(final_losses)
        final_std = np.std(final_losses)

        # Gap to the theoretical minimum; the relative gap is undefined when
        # the theoretical minimum is exactly zero.
        gap = final_mean - theo_mean
        pct_above = (gap / theo_mean) * 100 if theo_mean != 0 else np.nan

        # Last recorded test accuracy of each run.
        final_accs = [run['test_acc'][-1] for run in runs]
        acc_mean = np.mean(final_accs)
        acc_std = np.std(final_accs)

        theo_cell = f"{theo_mean:.4f}±{theo_std:.4f}"
        loss_cell = f"{final_mean:.4f}±{final_std:.4f}"
        pct_cell = f"{pct_above:.2f}%"
        acc_cell = f"{acc_mean:.4f}±{acc_std:.4f}"

        print(f"{beta:<8.1f} {theo_cell:<15} {loss_cell:<15} {gap:<12.4f} {pct_cell:<10} {acc_cell:<12}")

print("\n" + "="*100)