# Notebook cell In [3]:

import requests
import time
# Catalogue of Project Gutenberg plain-text downloads for classic books.
# Each short key maps to the download 'url' and the display 'title'.
BOOKS = dict(
    alice=dict(
        url='https://www.gutenberg.org/files/11/11-0.txt',
        title="Alice's Adventures in Wonderland",
    ),
    pride=dict(
        url='https://www.gutenberg.org/files/1342/1342-0.txt',
        title='Pride and Prejudice',
    ),
    frankenstein=dict(
        url='https://www.gutenberg.org/files/84/84-0.txt',
        title='Frankenstein',
    ),
    tale=dict(
        url='https://www.gutenberg.org/files/98/98-0.txt',
        title='A Tale of Two Cities',
    ),
)

def download_book(url: str, title: str) -> str:
    """Fetch a book's text over HTTP, reporting progress on stdout.

    Args:
        url: Address of the plain-text file to fetch.
        title: Human-readable title, used only in the printed messages.

    Returns:
        The decoded response body, or an empty string when the request
        fails with any ``requests.RequestException`` (including the HTTP
        error raised by ``raise_for_status``).
    """
    print(f"Downloading '{title}'...")
    try:
        reply = requests.get(url, timeout=30)
        reply.raise_for_status()
        body = reply.text
        print(f"✓ Successfully downloaded '{title}' ({len(body)} characters)")
    except requests.RequestException as err:
        # Failure is reported to the caller as an empty string, not an exception.
        print(f"✗ Error downloading '{title}': {err}")
        return ""
    return body

def clean_gutenberg_text(text: str) -> str:
    """Strip the Project Gutenberg header and footer from a raw book text.

    The content is taken to begin on the line after the first recognised
    start marker and to end at the earliest recognised end marker that
    follows it. If no start marker is present the content starts at the
    beginning of ``text``; if no end marker is present it runs to the end.

    Args:
        text: Raw text of a Project Gutenberg file.

    Returns:
        The text between the markers with leading and trailing whitespace
        removed. May be empty.
    """
    # Start markers are tried in priority order; the first one present wins.
    start_markers = (
        "*** START OF THIS PROJECT GUTENBERG",
        "*** START OF THE PROJECT GUTENBERG",
        "*END*THE SMALL PRINT",
    )
    end_markers = (
        "*** END OF THIS PROJECT GUTENBERG",
        "*** END OF THE PROJECT GUTENBERG",
        "End of Project Gutenberg",
        "End of the Project Gutenberg",
    )

    start_idx = 0
    for marker in start_markers:
        marker_pos = text.find(marker)
        if marker_pos != -1:
            newline_pos = text.find('\n', marker_pos)
            # Skip the whole marker line. If the marker sits on the last line
            # (no newline after it) there is no content at all, so start at
            # the end instead of wrapping around to index 0.
            start_idx = len(text) if newline_pos == -1 else newline_pos + 1
            break

    # Search only after the header, and cut at the EARLIEST end marker: many
    # files carry an "End of the Project Gutenberg EBook ..." line just
    # before the "*** END OF ..." line, and both belong to the footer.
    end_positions = [
        pos
        for pos in (text.find(marker, start_idx) for marker in end_markers)
        if pos != -1
    ]
    end_idx = min(end_positions, default=len(text))

    return text[start_idx:end_idx].strip()

def train_model(book_key: str = 'alice') -> "TrigramModel":
    """Download a book, clean it, and fit a ``TrigramModel`` on its text.

    Progress, timing, and the model's statistics are printed to stdout.

    Args:
        book_key: Key into ``BOOKS`` selecting which book to train on.

    Returns:
        The fitted ``TrigramModel`` instance.

    Raises:
        ValueError: If ``book_key`` is not a key of ``BOOKS``.
        RuntimeError: If the download returned no text.
    """
    if book_key not in BOOKS:
        raise ValueError(f"Unknown book key: {book_key}. Choose from {list(BOOKS.keys())}")

    book_info = BOOKS[book_key]

    # Download the book; an empty string signals a failed download.
    raw_text = download_book(book_info['url'], book_info['title'])

    if not raw_text:
        raise RuntimeError("Failed to download book")

    # Clean Gutenberg metadata
    text = clean_gutenberg_text(raw_text)
    print(f"Cleaned text: {len(text)} characters")

    # Train the model
    print("\nTraining trigram model...")
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    model = TrigramModel()
    model.fit(text)

    training_time = time.perf_counter() - start_time
    print(f"✓ Training completed in {training_time:.2f} seconds")

    # Display model statistics
    stats = model.get_model_stats()
    print("\nModel Statistics:")
    print(f"  Vocabulary size: {stats['vocabulary_size']:,}")
    print(f"  Unique contexts: {stats['unique_contexts']:,}")
    print(f"  Total trigrams: {stats['total_trigrams']:,}")
    print(f"  Avg next words per context: {stats['avg_next_words_per_context']:.2f}")

    return model

def generate_samples(model: "TrigramModel", num_samples: int = 5, max_length: int = 50):
    """
    Generates sample texts from the trained model and prints them.

    The ``model`` annotation is a quoted forward reference so that defining
    this function does not raise ``NameError`` when ``TrigramModel`` has not
    been defined or imported yet.

    Args:
        model: Trained TrigramModel instance (any object with a compatible
            ``generate(max_length=..., seed=...)`` method works)
        num_samples: Number of samples to generate
        max_length: Maximum length for each sample, passed to ``model.generate``
    """
    print(f"\n{'='*70}")
    print(f"Generating {num_samples} sample texts:")
    print(f"{'='*70}\n")

    # The loop index doubles as the seed passed to the model, so each sample
    # is generated with a different seed (0, 1, 2, ...).
    for i in range(num_samples):
        text = model.generate(max_length=max_length, seed=i)
        print(f"Sample {i+1}:")
        print(f"{text}\n")
        print(f"{'-'*70}\n")

def main():
    """Run the demo: list the available books, train on one, print samples."""
    banner = "=" * 70

    print(banner)
    print("Trigram Language Model - Training Script")
    print(banner)

    print("\nAvailable books:")
    for book_key in BOOKS:
        print(f"  {book_key}: {BOOKS[book_key]['title']}")

    # Hard-coded selection; swap in 'pride', 'frankenstein', or 'tale'.
    book_choice = 'alice'

    print(f"\nTraining on: {BOOKS[book_choice]['title']}")
    print(f"{banner}\n")

    # Fit the model, then show a handful of generated texts.
    trained = train_model(book_choice)
    generate_samples(trained, num_samples=5, max_length=50)

    print("\n✓ All done!")

# Run the demo only when this code executes as the main module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

# Cell output: NameError: name 'TrigramModel' is not defined