# Tokenizer Comparison Analysis

This notebook provides interactive analysis and visualization of tokenizer comparison results.

In [None]:
import json
import sys
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Put the parent of the current working directory at the front of the import
# search path (index 0 takes precedence over installed packages).
# NOTE(review): presumably this is what lets the project-local `visualization`
# module imported below resolve when the kernel starts in the notebook's own
# folder — confirm against the repository layout.
sys.path.insert(0, str(Path.cwd().parent))

from visualization import generate_plots

## Load Results

Load the results saved by a previous experiment run from `../results/results.json` (the path is relative to the notebook's working directory).

In [None]:
# Load results
# Reads ../results/results.json (relative to the kernel's working directory)
# and turns the parsed JSON into the DataFrame `df` used by every later cell.
results_dir = Path('../results')
results_file = results_dir / 'results.json'

# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's codec (e.g. cp1252 on Windows), which fails or garbles any
# non-ASCII text stored in the results.
with open(results_file, encoding='utf-8') as f:
    results = json.load(f)

# NOTE(review): assumes the JSON is a list of per-evaluation records (one row
# each) — confirm against whatever writes results.json.
df = pd.DataFrame(results)
print(f"Loaded {len(df)} evaluation results")
df.head()

## Summary Statistics

In [None]:
# Per-tokenizer mean of each headline metric, printed as a plain-text table
metric_columns = [
    'tokens_per_1000_chars',
    'tokens_per_100_words',
    'reconstruction_similarity',
    'throughput_tokens_per_sec',
]

print("Average metrics by tokenizer:\n")
# One row per tokenizer, one column per metric listed above
per_tokenizer_means = df.groupby('tokenizer')[metric_columns].mean()
print(per_tokenizer_means)

## Visualizations

### Token Efficiency

In [None]:
# Horizontal bar chart of the mean tokens-per-1000-characters for each
# tokenizer, sorted ascending by that mean.
fig, ax = plt.subplots(figsize=(12, 6))
grouped = (
    df.groupby('tokenizer')['tokens_per_1000_chars']
    .mean()
    .sort_values()
)
grouped.plot(kind='barh', color='steelblue', ax=ax)
ax.set_xlabel('Tokens per 1000 Characters')
ax.set_ylabel('Tokenizer')
ax.set_title('Token Efficiency Comparison')
# Light vertical guide lines only; the bars run along the x axis
ax.grid(axis='x', alpha=0.3)
fig.tight_layout()
plt.show()

### Reconstruction Quality

In [None]:
# Plot reconstruction similarity: one box per tokenizer, in order of first
# appearance in `df`.
plt.figure(figsize=(12, 6))
tokenizers = df['tokenizer'].unique()
# Drop NaN scores within each group: a NaN makes the quartiles NaN, and the
# box for that tokenizer would silently not be drawn.
data = [
    df.loc[df['tokenizer'] == tok, 'reconstruction_similarity'].dropna().values
    for tok in tokenizers
]

# The boxes are named through plt.xticks below rather than boxplot's `labels=`
# keyword: that keyword is deprecated since Matplotlib 3.9 (renamed to
# `tick_labels`), while explicit tick labels work on old and new versions alike.
plt.boxplot(data, patch_artist=True)
plt.ylabel('Reconstruction Similarity')
plt.xlabel('Tokenizer')
plt.title('Reconstruction Fidelity')
# NOTE(review): the fixed range assumes similarity scores lie in [0, 1] — confirm.
plt.ylim([0, 1.05])
plt.grid(axis='y', alpha=0.3)
# boxplot places the i-th box at x = i + 1, so label positions 1..N
plt.xticks(range(1, len(tokenizers) + 1), tokenizers, rotation=45, ha='right')
plt.tight_layout()
plt.show()

### Throughput Comparison

In [None]:
# Vertical bar chart of the mean throughput for each tokenizer, highest first
fig, ax = plt.subplots(figsize=(12, 6))
grouped = (
    df.groupby('tokenizer')['throughput_tokens_per_sec']
    .mean()
    .sort_values(ascending=False)
)
grouped.plot(kind='bar', color='coral', ax=ax)
ax.set_ylabel('Throughput (tokens/sec)')
ax.set_xlabel('Tokenizer')
ax.set_title('Tokenization Speed')
# Slant the tokenizer names and anchor them on their right edge
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## Token Examples

If `sentence_examples.json` exists in the results directory, load it and print the first tokenization example.

In [None]:
# Load sentence examples if available
# The file is optional: when it is missing, a notice is printed instead of
# raising.
examples_file = results_dir / 'sentence_examples.json'
if examples_file.exists():
    # Decode explicitly as UTF-8 so non-ASCII text and tokens are read the same
    # way regardless of the platform's locale encoding.
    with open(examples_file, encoding='utf-8') as f:
        examples = json.load(f)

    # Show the first example. Pick the first key that actually has entries, so
    # an empty file ({}) or an empty leading list no longer raises IndexError.
    # NOTE(review): assumes `examples` maps a key to a list of dicts with
    # 'text', 'token_count' and 'tokens' fields — confirm against the writer.
    first_key = next((key for key, items in examples.items() if items), None)
    if first_key is None:
        print("Sentence examples file contains no examples")
    else:
        first_example = examples[first_key][0]

        print(f"Example from {first_key}:\n")
        print(f"Text: {first_example['text']}")
        # Only the first 20 tokens are shown to keep the output short
        print(f"\nTokens ({first_example['token_count']}): {first_example['tokens'][:20]}...")
else:
    print("No sentence examples found")

## Custom Analysis

Add your own analysis and visualizations below.

In [None]:
# Your custom analysis here