# Penn Treebank Vocabulary Builder

This notebook builds the vocabulary for the Penn Treebank dataset based on the analysis recommendations:
- **Recommended vocab size**: 30,000 words (frequency ≥ 3)
- **Coverage**: 99.1% of training tokens
- **Special tokens**: `<pad>`, `<unk>`, `<eos>`

In [None]:
import sys
import os
sys.path.append('../src')

import torch
from data_loader import Vocabulary, load_ptb_data
from collections import Counter
import pickle
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Dataset directory and the location of the pickled vocabulary inside it
data_dir = '../data/ptb'
vocab_file = os.path.join(data_dir, 'vocab.pkl')

# Echo both locations so the run log records which paths were used
print(f"Data directory: {data_dir}", f"Vocabulary file: {vocab_file}", sep="\n")

## Step 1: Load Raw Text Data

First, let's load the raw Penn Treebank text files and examine the data.

In [None]:
# Paths to the three raw Penn Treebank splits (reused by later cells)
train_file = os.path.join(data_dir, 'ptb.train.txt')
valid_file = os.path.join(data_dir, 'ptb.valid.txt')
test_file = os.path.join(data_dir, 'ptb.test.txt')

# Report sentence and word counts per split; a missing file is reported
# rather than raised so the remaining splits are still inspected.
for file_path, split_name in [(train_file, 'train'), (valid_file, 'valid'), (test_file, 'test')]:
    if not os.path.exists(file_path):
        print(f"{split_name.upper()}: File not found at {file_path}")
        continue

    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Count only non-blank lines: splitting an empty string on '\n' yields
    # [''], which would otherwise report one sentence for an empty file.
    # This also matches the word-frequency cell, which skips blank lines.
    sentence_count = sum(1 for line in content.split('\n') if line.strip())
    word_count = len(content.split())
    print(f"{split_name.upper()}: {sentence_count:,} sentences, {word_count:,} words")

## Step 2: Build Word Frequency Distribution

Let's analyze the word frequency distribution to determine the optimal vocabulary size.

In [None]:
# Read the whole training split into memory; `train_text` is reused later
# when the vocabulary is built.
with open(train_file, 'r', encoding='utf-8') as train_handle:
    train_text = train_handle.read().strip()

# Tally every whitespace-separated token, visiting lines in file order.
# Blank lines contribute no tokens because splitting them yields nothing.
word_counts = Counter(
    token
    for line in train_text.split('\n')
    for token in line.split()
)

print(f"Total unique words in training set: {len(word_counts):,}")
print(f"Total word tokens in training set: {sum(word_counts.values()):,}")

# Most frequent tokens, highest count first
print("\nTop 20 most frequent words:")
top_words = word_counts.most_common(20)
for word, count in top_words:
    print(f"  {word}: {count:,}")

In [None]:
# Word frequencies in rank order (most frequent first)
frequencies = sorted(word_counts.values(), reverse=True)

# Total number of training tokens; loop-invariant, so computed once here
train_token_total = sum(frequencies)

# Fraction of training tokens covered by the top-k ranked words.
# Kept under its own name so it is not confused with the per-threshold
# coverage values computed below.
cumulative_freq = np.cumsum(frequencies)
cumulative_coverage = cumulative_freq / cumulative_freq[-1]

# Vocabulary size and token coverage for several minimum-frequency cutoffs
min_freqs = [1, 2, 3, 4, 5, 10, 20]
vocab_sizes = []
coverages = []
for min_freq in min_freqs:
    kept_counts = [count for count in frequencies if count >= min_freq]
    vocab_sizes.append(len(kept_counts))
    coverages.append(sum(kept_counts) / train_token_total)

# One figure with three panels, using explicit Axes instead of pyplot state
freq_fig, (ax_rank, ax_cumulative, ax_threshold) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: full rank/frequency curve on a log y-axis
ax_rank.plot(frequencies)
ax_rank.set_yscale('log')
ax_rank.set_xlabel('Word Rank')
ax_rank.set_ylabel('Frequency (log scale)')
ax_rank.set_title('Word Frequency Distribution')
ax_rank.grid(True)

# Panel 2: cumulative token coverage for the first 10K ranked words
ax_cumulative.plot(cumulative_coverage[:10000])
ax_cumulative.set_xlabel('Vocabulary Size')
ax_cumulative.set_ylabel('Token Coverage')
ax_cumulative.set_title('Cumulative Token Coverage')
ax_cumulative.grid(True)

# Panel 3: one point per minimum-frequency threshold
for min_freq, size, covered_fraction in zip(min_freqs, vocab_sizes, coverages):
    ax_threshold.scatter(size, covered_fraction * 100, s=60,
                         label=f'min_freq={min_freq}')
ax_threshold.set_xlabel('Vocabulary Size')
ax_threshold.set_ylabel('Coverage (%)')
ax_threshold.set_title('Vocabulary Size vs Coverage')
ax_threshold.legend()
ax_threshold.grid(True)

freq_fig.tight_layout()
plt.show()

# Print statistics for different thresholds
print("\nVocabulary size and coverage for different minimum frequencies:")
print("Min Freq | Vocab Size | Coverage")
print("-" * 35)
for min_freq, size, covered_fraction in zip(min_freqs, vocab_sizes, coverages):
    print(f"{min_freq:8d} | {size:10,d} | {covered_fraction*100:7.2f}%")

## Step 3: Build Vocabulary with Optimal Parameters

Based on the analysis, we'll build the vocabulary with a minimum frequency of 3 to get ~30K words, as recommended.

In [None]:
# Build vocabulary with min_freq=3 (recommended setting)
MIN_FREQ = 3

print(f"Building vocabulary with min_freq={MIN_FREQ}...")

# Create the vocabulary object with the three special tokens
vocab = Vocabulary(special_tokens=['<pad>', '<unk>', '<eos>'])

# Build vocabulary from the training text loaded in the previous step
vocab.build_vocab([train_text], min_freq=MIN_FREQ)

print("Vocabulary built successfully!")
print(f"Vocabulary size: {len(vocab):,}")
print(f"Word count entries: {len(vocab.word_count):,}")

# Remove a stale vocabulary file only now that a replacement has been built.
# Deleting it before the build would leave no vocabulary on disk at all if
# construction failed part-way.
# NOTE(review): assumes Vocabulary/build_vocab do not read vocab_file — confirm.
if os.path.exists(vocab_file):
    os.remove(vocab_file)
    print(f"Removed existing vocabulary file: {vocab_file}")

# Save vocabulary
vocab.save(vocab_file)
print(f"Vocabulary saved to: {vocab_file}")

## Step 4: Analyze Built Vocabulary

In [None]:
# Headline numbers for the vocabulary that was just built
print("Vocabulary Analysis:")
print("=" * 50)
print(f"Total vocabulary size: {len(vocab):,}")
print(f"Special tokens: {vocab.special_tokens}")

# Index of each special token that is present in the word -> index mapping
for token in filter(lambda name: name in vocab.word2idx, vocab.special_tokens):
    print(f"  {token}: index {vocab.word2idx[token]}")

# Training-set coverage: share of counted tokens whose word is in the mapping
total_tokens = sum(vocab.word_count.values())
covered_tokens = sum(
    count for word, count in vocab.word_count.items() if word in vocab.word2idx
)

coverage = covered_tokens / total_tokens
print(f"\nTraining set coverage: {coverage*100:.2f}%")

# Frequencies of the regular (non-special) vocabulary entries
vocab_word_counts = []
for word in vocab.word2idx:
    if word in vocab.special_tokens:
        continue
    vocab_word_counts.append(vocab.word_count[word])

print(f"Average word frequency in vocab: {np.mean(vocab_word_counts):.2f}")
print(f"Median word frequency in vocab: {np.median(vocab_word_counts):.2f}")
print(f"Min word frequency in vocab: {min(vocab_word_counts)}")
print(f"Max word frequency in vocab: {max(vocab_word_counts)}")

In [None]:
# Round-trip a few hand-written sentences through encode/decode
print("Testing vocabulary encoding/decoding:")
print("=" * 50)

# Sample inputs; the last one already contains a literal <unk> token
test_sentences = [
    "the quick brown fox jumps over the lazy dog <eos>",
    "natural language processing is fascinating <eos>",
    "out-of-vocabulary words should become <unk> tokens <eos>"
]

for i, sentence in enumerate(test_sentences):
    print(f"\nTest {i+1}:")
    print(f"Original: {sentence}")

    # Encode, truncating the printed form to the first ten ids when longer
    encoded = vocab.encode(sentence)
    if len(encoded) > 10:
        print(f"Encoded:  {encoded[:10]}...")
    else:
        print(f"Encoded:  {encoded}")

    # Decode the ids back to text
    decoded = vocab.decode(encoded)
    print(f"Decoded:  {decoded}")

    # Count how many positions carry the <unk> index
    unk_index = vocab.word2idx['<unk>']
    unknown_count = encoded.count(unk_index)
    print(f"Unknown words: {unknown_count}")

## Step 5: Evaluate on Validation and Test Sets

In [None]:
# Evaluate vocabulary coverage on validation and test sets
def evaluate_coverage(text_file, vocab, split_name):
    """Report how much of a text file is covered by a vocabulary.

    Args:
        text_file: Path to a UTF-8 text file; tokens are whitespace-separated.
        vocab: Object exposing a ``word2idx`` mapping; a token counts as
            known when it is a key of that mapping.
        split_name: Label for the split, printed upper-cased in the report.

    Returns:
        A ``(coverage, oov_rate)`` tuple of fractions in [0, 1]. Both are 0
        when the file contains no tokens.
    """
    with open(text_file, 'r', encoding='utf-8') as handle:
        tokens = handle.read().strip().split()

    total_words = len(tokens)

    # Known tokens are those present in the mapping; the rest are OOV
    known_words = sum(1 for token in tokens if token in vocab.word2idx)
    unknown_words = total_words - known_words

    # Guard against an empty file so the ratios never divide by zero
    if total_words > 0:
        coverage = known_words / total_words
        oov_rate = unknown_words / total_words
    else:
        coverage = 0
        oov_rate = 0

    print(f"{split_name.upper()} Set Evaluation:")
    print(f"  Total words: {total_words:,}")
    print(f"  Known words: {known_words:,}")
    print(f"  Unknown words: {unknown_words:,}")
    print(f"  Coverage: {coverage*100:.2f}%")
    print(f"  OOV rate: {oov_rate*100:.2f}%")

    return coverage, oov_rate

# Run the coverage report for every split, separated by a blank line
print("Vocabulary Coverage Analysis:")
print("=" * 50)

# Training comes first (should be very high since vocab was built from it),
# followed by validation and test.
split_metrics = {}
for position, (split_label, split_path) in enumerate(
    [('train', train_file), ('valid', valid_file), ('test', test_file)]
):
    if position > 0:
        print()
    split_metrics[split_label] = evaluate_coverage(split_path, vocab, split_label)

# Unpack into the per-split names used by the following cells
train_coverage, train_oov = split_metrics['train']
valid_coverage, valid_oov = split_metrics['valid']
test_coverage, test_oov = split_metrics['test']

In [None]:
# Two side-by-side bar charts: coverage and OOV rate for each split
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Per-split values converted from fractions to percentages
splits = ['Train', 'Valid', 'Test']
coverages = [value * 100 for value in (train_coverage, valid_coverage, test_coverage)]
oov_rates = [value * 100 for value in (train_oov, valid_oov, test_oov)]


def _annotated_bars(ax, values, ylabel, title, label_offset):
    """Draw one bar per split on `ax` and write each value above its bar."""
    bars = ax.bar(splits, values, color=['skyblue', 'lightcoral', 'lightgreen'])
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    for bar, value in zip(bars, values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + label_offset,
                f'{value:.2f}%', ha='center', va='bottom')
    return bars


ax1, ax2 = axes

# Coverage comparison, with the y-axis limited to the 90-100% band
bars1 = _annotated_bars(ax1, coverages, 'Coverage (%)',
                        'Vocabulary Coverage by Split', 0.1)
ax1.set_ylim(90, 100)

# OOV rate comparison
bars2 = _annotated_bars(ax2, oov_rates, 'OOV Rate (%)',
                        'Out-of-Vocabulary Rate by Split', 0.05)

plt.tight_layout()
plt.show()

## Step 6: Final Summary

The vocabulary has been successfully built and saved. Here's a summary of the results:

In [None]:
# Assemble the closing report line by line, then emit it in one print
summary_lines = [
    "VOCABULARY BUILDING COMPLETE!",
    "=" * 60,
    f"Vocabulary file: {vocab_file}",
    f"Vocabulary size: {len(vocab):,} words",
    f"Minimum frequency threshold: {MIN_FREQ}",
    "",
    "Coverage Results:",
    f"  Training:   {train_coverage*100:.2f}% coverage, {train_oov*100:.2f}% OOV",
    f"  Validation: {valid_coverage*100:.2f}% coverage, {valid_oov*100:.2f}% OOV",
    f"  Test:       {test_coverage*100:.2f}% coverage, {test_oov*100:.2f}% OOV",
    "",
    "Special tokens:",
]

# One line per special token that is present in the word -> index mapping
summary_lines.extend(
    f"  {token}: index {vocab.word2idx[token]}"
    for token in vocab.special_tokens
    if token in vocab.word2idx
)

# Closing pointers for whoever picks the vocabulary up next
summary_lines += [
    "",
    "The vocabulary is ready for training language models!",
    "Next steps:",
    "1. Load vocabulary using: vocab = Vocabulary(); vocab.load('vocab.pkl')",
    "2. Use with data loaders for training",
    "3. Begin model training with LSTM or Transformer architectures",
]

print("\n".join(summary_lines))