# Entropy vs Beta Analysis

This notebook analyzes how conditional entropy (the theoretical minimum loss) varies with beta for different vocabulary sizes.

- **Y-axis**: Conditional Entropy $H(X_{t+1} \mid X_t)$
- **X-axis**: Beta (temperature parameter)
- **Comparison**: Different vocabulary sizes
- **Methods**: Both analytical and empirical estimation

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

import RLM_files.datasets as datasets
import RLM_files.measures as measures

# Global plot appearance: dark-grid theme, large default canvas, readable text
plt.style.use('seaborn-v0_8-darkgrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
})

## Configuration

In [None]:
# --- Sweep definition ---
vocab_sizes = [32, 64, 128, 256]             # vocabulary sizes compared against each other
beta_values = np.linspace(0.5, 5.0, num=20)  # 20 evenly spaced betas, both endpoints included
L = 1                                        # tree depth (2^L leaves)

# --- Seeds, fixed so that re-runs are reproducible ---
seed_rules = 12345678
seed_samples = 56781234

# --- Sample budgets for the two estimators ---
num_analytical_samples = 1 << 15   # 32768, used by the analytical estimation
num_empirical_samples = 50_000     # used by the empirical estimation

# --- Compute device: prefer the GPU when one is available, otherwise the CPU ---
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## Analytical Entropy Estimation

Uses the analytical estimators `measures.conditional_entropy()` and `measures.marginal()`, applied to the model's `M` attribute.

In [None]:
def compute_analytical_entropy(vocab_size, beta, L, seed_rules, num_samples):
    """
    Estimate conditional and marginal entropy of an RLM with the analytical method.

    A fresh ``datasets.RLM`` is built for the requested vocabulary size, depth
    and beta, and its ``M`` attribute is handed to the estimators in ``measures``.

    Note: ``seed_samples`` and ``device`` are not parameters of this function;
    they are read from the notebook's global configuration.

    Args:
        vocab_size: Vocabulary size; passed to the RLM as ``v`` and to both estimators.
        beta: Beta value forwarded to the RLM.
        L: Tree depth forwarded to the RLM.
        seed_rules: Seed forwarded to the RLM as ``seed_rules``.
        num_samples: Sample count forwarded to both ``measures`` estimators.

    Returns:
        Tuple ``(conditional_entropy, marginal_entropy)`` exactly as returned by
        ``measures.conditional_entropy`` and ``measures.marginal``.
    """
    # Model construction arguments; no dataset, probabilities or transform are supplied
    rlm_kwargs = dict(
        v=vocab_size,
        L=L,
        beta=beta,
        seed_rules=seed_rules,
        seed_samples=seed_samples,
        num_data=None,
        probs=None,
        transform=None,
        device=device,
    )
    model = datasets.RLM(**rlm_kwargs)

    # Both estimators take the same inputs
    h_conditional = measures.conditional_entropy(model.M, vocab_size, num_samples)
    h_marginal = measures.marginal(model.M, vocab_size, num_samples)
    return h_conditional, h_marginal


# Sweep every (vocabulary size, beta) pair with the analytical estimator
print("Computing analytical entropies...")
analytical_results = {}

for vocab_size in vocab_sizes:
    print(f"\nVocabulary size: {vocab_size}")

    # One (conditional, marginal) pair per beta, in beta_values order
    entropy_pairs = [
        compute_analytical_entropy(
            vocab_size, beta, L, seed_rules, num_analytical_samples
        )
        for beta in tqdm(beta_values, desc=f"V={vocab_size}")
    ]

    # Split the pairs into two curves aligned with beta_values
    analytical_results[vocab_size] = {
        'conditional': np.array([pair[0] for pair in entropy_pairs]),
        'marginal': np.array([pair[1] for pair in entropy_pairs])
    }

print("\nAnalytical computation complete!")

## Empirical Entropy Estimation

Uses the empirical bigram distribution of generated sequences, computed by `rlm.compute_all_empirical_statistics()`.

In [None]:
def compute_empirical_entropy(vocab_size, beta, L, seed_rules, num_samples):
    """
    Estimate conditional and marginal entropy of an RLM with the empirical method.

    A fresh ``datasets.RLM`` is built for the requested vocabulary size, depth
    and beta, and its ``compute_all_empirical_statistics`` method is asked for
    statistics based on ``num_samples`` samples.

    Note: ``seed_samples`` and ``device`` are not parameters of this function;
    they are read from the notebook's global configuration.

    Args:
        vocab_size: Vocabulary size; passed to the RLM as ``v``.
        beta: Beta value forwarded to the RLM.
        L: Tree depth forwarded to the RLM.
        seed_rules: Seed forwarded to the RLM as ``seed_rules``.
        num_samples: Sample count forwarded to ``compute_all_empirical_statistics``.

    Returns:
        Tuple of the ``'conditional_entropy'`` and ``'marginal_entropy'``
        entries of the statistics mapping returned by the model.
    """
    # Model construction arguments; no dataset, probabilities or transform are supplied
    rlm_kwargs = dict(
        v=vocab_size,
        L=L,
        beta=beta,
        seed_rules=seed_rules,
        seed_samples=seed_samples,
        num_data=None,
        probs=None,
        transform=None,
        device=device,
    )
    model = datasets.RLM(**rlm_kwargs)

    # Only two entries of the statistics mapping are needed here
    empirical_stats = model.compute_all_empirical_statistics(num_samples=num_samples)
    h_conditional = empirical_stats['conditional_entropy']
    h_marginal = empirical_stats['marginal_entropy']
    return h_conditional, h_marginal


# Sweep every (vocabulary size, beta) pair with the empirical estimator
print("Computing empirical entropies...")
empirical_results = {}

for vocab_size in vocab_sizes:
    print(f"\nVocabulary size: {vocab_size}")

    # One (conditional, marginal) pair per beta, in beta_values order
    entropy_pairs = [
        compute_empirical_entropy(
            vocab_size, beta, L, seed_rules, num_empirical_samples
        )
        for beta in tqdm(beta_values, desc=f"V={vocab_size}")
    ]

    # Split the pairs into two curves aligned with beta_values
    empirical_results[vocab_size] = {
        'conditional': np.array([pair[0] for pair in entropy_pairs]),
        'marginal': np.array([pair[1] for pair in entropy_pairs])
    }

print("\nEmpirical computation complete!")

## Visualization: Conditional Entropy vs Beta

In [None]:
# Conditional entropy vs beta: analytical (left panel) beside empirical (right panel)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Each panel differs only in its data source, marker and title
panel_specs = (
    (ax1, analytical_results, 'o', 'Analytical Conditional Entropy vs Beta'),
    (ax2, empirical_results, 's', 'Empirical Conditional Entropy vs Beta'),
)

for panel, per_vocab, marker_style, panel_title in panel_specs:
    # One curve per vocabulary size
    for vocab_size in vocab_sizes:
        panel.plot(
            beta_values,
            per_vocab[vocab_size]['conditional'],
            marker=marker_style,
            label=f'V={vocab_size}',
            linewidth=2,
            markersize=6
        )

    panel.set_xlabel('Beta (Temperature)', fontsize=14)
    panel.set_ylabel('Conditional Entropy H(X_{t+1} | X_t)', fontsize=14)
    panel.set_title(panel_title, fontsize=16, fontweight='bold')
    panel.legend(fontsize=12)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Comparison: Analytical vs Empirical

In [None]:
# Compare analytical and empirical estimates, one panel per vocabulary size.
# The grid is sized from len(vocab_sizes) instead of a fixed 2x2 layout, so
# changing the configured vocabulary sizes neither overflows the axes array
# nor leaves empty frames in the figure.
n_panels = len(vocab_sizes)
n_cols = 2
n_rows = max(1, (n_panels + n_cols - 1) // n_cols)  # ceiling division, at least one row

# squeeze=False always yields a 2-D axes array, so flatten() works for any grid shape.
# 6 inches per row reproduces the original 16x12 canvas when there are 4 panels.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for idx, vocab_size in enumerate(vocab_sizes):
    ax = axes[idx]

    # Plot both methods on the same axes; the empirical curve is slightly
    # transparent so the analytical one stays visible where they overlap
    ax.plot(
        beta_values,
        analytical_results[vocab_size]['conditional'],
        marker='o',
        label='Analytical',
        linewidth=2,
        markersize=6
    )
    ax.plot(
        beta_values,
        empirical_results[vocab_size]['conditional'],
        marker='s',
        label='Empirical',
        linewidth=2,
        markersize=6,
        alpha=0.7
    )

    ax.set_xlabel('Beta (Temperature)', fontsize=12)
    ax.set_ylabel('Conditional Entropy', fontsize=12)
    ax.set_title(f'Vocabulary Size = {vocab_size}', fontsize=14, fontweight='bold')
    ax.legend(fontsize=11)
    ax.grid(True, alpha=0.3)

# Hide grid cells that have no vocabulary size assigned (odd number of panels)
for unused_ax in axes[n_panels:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()

## Marginal Entropy Analysis

In [None]:
# Marginal entropy vs beta: analytical (left panel) beside empirical (right panel).
# Each curve gets a dashed log(V) reference line in the SAME colour as the curve.
# axhline does not follow the axes colour cycle, so without an explicit colour
# every reference line is drawn identically and cannot be matched to its curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Each panel differs only in its data source, marker and title
panel_specs = (
    (ax1, analytical_results, 'o', 'Analytical Marginal Entropy vs Beta'),
    (ax2, empirical_results, 's', 'Empirical Marginal Entropy vs Beta'),
)

for panel, per_vocab, marker_style, panel_title in panel_specs:
    for idx, vocab_size in enumerate(vocab_sizes):
        curve = panel.plot(
            beta_values,
            per_vocab[vocab_size]['marginal'],
            marker=marker_style,
            label=f'V={vocab_size}',
            linewidth=2,
            markersize=6
        )[0]

        # Theoretical maximum (log V), using the natural logarithm.
        # NOTE(review): assumes the plotted entropies are in nats — confirm against `measures`.
        # Only the first line carries a legend entry so the legend gets a single
        # generic "log(V)" item on both panels instead of one per vocabulary size.
        panel.axhline(
            y=np.log(vocab_size),
            color=curve.get_color(),
            linestyle='--',
            alpha=0.5,
            label='log(V) (maximum)' if idx == 0 else None
        )

    panel.set_xlabel('Beta (Temperature)', fontsize=14)
    panel.set_ylabel('Marginal Entropy H(X_{t+1})', fontsize=14)
    panel.set_title(panel_title, fontsize=16, fontweight='bold')
    panel.legend(fontsize=12)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Text summary: per vocabulary size, the extremes of each conditional-entropy
# curve, how far the two estimators disagree, and the log(V) ceiling.
banner = "=" * 80
print("\n" + banner)
print("SUMMARY: Conditional Entropy Range by Vocabulary Size")
print(banner)

# Heading and data source for each estimator, reported in this order
method_sections = (
    ("Analytical Conditional Entropy:", analytical_results),
    ("\nEmpirical Conditional Entropy:", empirical_results),
)

for vocab_size in vocab_sizes:
    print(f"\nVocabulary Size: {vocab_size}")
    print("-" * 40)

    curves = []
    for heading, per_vocab in method_sections:
        curve = per_vocab[vocab_size]['conditional']
        curves.append(curve)

        # Betas at which the curve attains its smallest / largest value
        beta_at_min = beta_values[np.argmin(curve)]
        beta_at_max = beta_values[np.argmax(curve)]

        print(heading)
        print(f"  Min (β={beta_at_min:.2f}): {curve.min():.4f}")
        print(f"  Max (β={beta_at_max:.2f}): {curve.max():.4f}")
        print(f"  Range: {curve.max() - curve.min():.4f}")

    # Point-wise disagreement between the two estimators
    analytical_curve, empirical_curve = curves
    abs_gap = np.abs(analytical_curve - empirical_curve)
    print("\nAnalytical vs Empirical:")
    print(f"  Mean absolute difference: {abs_gap.mean():.4f}")
    print(f"  Max absolute difference: {abs_gap.max():.4f}")

    # Theoretical maximum
    print(f"\nTheoretical Maximum (log V): {np.log(vocab_size):.4f}")

## Save Results

In [None]:
# Bundle the sweep axes, both result sets and the run configuration into one
# object and write it to disk for later analysis.
results = dict(
    beta_values=beta_values,
    vocab_sizes=vocab_sizes,
    analytical=analytical_results,
    empirical=empirical_results,
    config=dict(
        L=L,
        seed_rules=seed_rules,
        seed_samples=seed_samples,
        num_analytical_samples=num_analytical_samples,
        num_empirical_samples=num_empirical_samples,
    ),
)

# The payload contains NumPy arrays rather than tensors.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which may refuse such payloads — confirm, and load with weights_only=False if needed.
torch.save(results, 'entropy_vs_beta_results.pt')
print("\nResults saved to 'entropy_vs_beta_results.pt'")