# Chapter 5 Companion Notebook
**Build Your First LLM — Chapter 5: Your First Python Program**

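This notebook bundles the runnable code examples from Chapter 5. Run the cells top-to-bottom: later cells reuse names (such as `texts`, `vocab`, and `GPT2Tokenizer`) that earlier cells define.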

- Installs: transformers (for the GPT-2 generation demo and the GPT-2 tokenizer comparison)
- Data: tiny inline strings; no external files needed
- Runtime: CPU is fine; GPU just speeds the GPT-2 call


In [None]:
# Install transformers library (HuggingFace's LLM toolkit)
!pip install -q transformers==4.46.1

import warnings
warnings.filterwarnings('ignore')  # Suppress ALL Python warnings from here on (keeps notebook output clean)

# Load HuggingFace tools for GPT-2
# NOTE: `logging` below is transformers' own logging helper, not the standard-library `logging` module
from transformers import pipeline, logging
logging.set_verbosity_error()  # Only show real errors, not info messages

# Confirmation that the cell ran to completion
print('Setup complete')

## Quick win: GPT-2 text generation
Run a tiny GPT-2 generation to see an LLM in action.

In [None]:
# Load GPT-2 text generation model (124M parameters)
generator = pipeline('text-generation', model='gpt2')

# Generate text from a starting prompt
result = generator(
    'The secret to building AI is',  # Starting text
    max_new_tokens=20,                # Generate at most 20 new tokens (sub-word pieces, not whole words)
    do_sample=True,                   # Use randomness (not just most likely tokens), so every run differs
    # GPT-2 defines no padding token; reuse its end-of-text token (ID 50256)
    # as padding, read from the tokenizer instead of hard-coding the number.
    # Passing it explicitly avoids a warning.
    pad_token_id=generator.tokenizer.eos_token_id
)

# The pipeline returns a list of results; take the first and print its text
print(result[0]['generated_text'])

## Strings and basic ops
Working with text, lengths, and slices.

In [None]:
# A prompt and an example of the full text a generation model returned for it
prompt = 'The secret to building AI is'
output = 'The secret to building AI is understanding how machines learn from data'

# Two basic facts about the string
print(len(output))   # Number of characters
print(type(output))  # <class 'str'>

# Break the text into a list of words (splits on whitespace)
words = output.split()
print(words)

# Same again, but normalized to lowercase first
lowercased = output.lower()
words = lowercased.split()
print(words)

# Everything after the prompt is the part the model generated
prompt_length = len(prompt)
generated = output[prompt_length:]
print(f'Generated: {generated.strip()}')  # .strip() trims whitespace from both ends
print(f'Word count: {len(generated.split())}')

## Numbers and formatting
Basic numeric values and f-strings.

In [None]:
# Underscores inside a numeric literal are ignored by Python; they only help humans read it
vocab_size = 50_257            # GPT-2's vocabulary size
learning_rate = 1e-4           # Small step size for training (same value as 0.0001)
num_parameters = 124_000_000   # 124 million (GPT-2 size)

# The ',' format spec inserts thousands separators
print(f'GPT-2 has {num_parameters:,} parameters')

## Build a tiny vocabulary and tokenizer
From toy sentences to a word-level tokenizer.

In [None]:
# Toy training corpus (stand-in for the text you'd train a model on)
texts = [
    'The secret to building AI is',
    'The key to machine learning is data',
    'AI systems learn from examples',
]

# Gather every word of every text, lowercased, into one flat list
all_words = []
for text in texts:
    all_words += text.lower().split()

print(all_words)

# Indexing and slicing: first item, last item, first three items
first_word = all_words[0]
last_word = all_words[-1]
first_three = all_words[:3]
print(first_word, last_word, first_three)

# Vocabulary maps word -> integer ID; IDs 0 and 1 are reserved for special tokens
vocab = {'<PAD>': 0, '<UNK>': 1}

for word in all_words:
    if word in vocab:
        continue  # Already has an ID; keep the first one assigned
    vocab[word] = len(vocab)  # Current size is the next unused ID

print(f'Vocabulary size: {len(vocab)}')
print(vocab)

# Dictionary lookup: word -> ID
print(vocab['the'], vocab['ai'])

## Compare with GPT-2 tokenizer
Show how a production tokenizer differs.

In [None]:
# Load the tokenizer that ships with GPT-2
from transformers import GPT2Tokenizer
real_tok = GPT2Tokenizer.from_pretrained('gpt2')

# Vocabulary sizes side by side
print(f'Our vocab: {len(vocab)} words')
print(f'GPT-2 vocab: {len(real_tok)} tokens')

# A word missing from our vocabulary falls back to the <UNK> ID (1)
word = 'neural'
print(f"'{word}' → {vocab.get(word, vocab['<UNK>'])}")

# Tokenize a sentence with our vocabulary, one word at a time
sentence = 'The neural network learns'
token_ids = []
for w in sentence.lower().split():
    token_ids.append(vocab.get(w, vocab['<UNK>']))
print(f'Token IDs: {token_ids}')  # Only "the" is known; "neural", "network", "learns" all become <UNK>

## Tokenize and detokenize helpers
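Round-trip a sentence: text → token IDs → text. The decoded text comes back lowercased, because the tokenizer lowercases words before looking them up.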

In [None]:
# Function: Text → Token IDs (encoding)
def tokenize(text, vocab):
    """Encode text as a list of token IDs using a word-level vocabulary.

    The text is lowercased and split on whitespace. Each word is looked up
    in ``vocab``; words that are missing get the ID stored under ``'<UNK>'``.
    """
    ids = []
    for token in text.lower().split():
        ids.append(vocab.get(token, vocab['<UNK>']))
    return ids

# Function: Token IDs → Text (decoding)
def detokenize(ids, vocab):
    """Decode a sequence of token IDs back into a space-separated string.

    ``vocab`` maps word -> ID; it is inverted here to look words up by ID.
    IDs that are not in the vocabulary are rendered as ``'<UNK>'``.
    """
    # Invert the mapping: ID -> word
    reverse_vocab = dict(zip(vocab.values(), vocab.keys()))
    decoded_words = [reverse_vocab.get(token_id, '<UNK>') for token_id in ids]
    return ' '.join(decoded_words)

# Round-trip check: text → IDs → text (decoded text is lowercase, since tokenize lowercases)
text = 'The secret to AI'
ids = tokenize(text, vocab)
print(f'Encoded: {ids}')
print(f'Decoded: {detokenize(ids, vocab)}')

# Same text through both tokenizers, side by side
print(f'Our tokens:   {tokenize(text, vocab)}')
print(f'GPT-2 tokens: {real_tok.encode(text)}')  # Different! GPT-2 uses subwords, not whole words

## A minimal tokenizer class
Stateful, word-level tokenizer with fit/encode/decode.

In [None]:
# Object-oriented tokenizer (class bundles data + methods)
class SimpleTokenizer:
    """Stateful word-level tokenizer with fit / encode / decode.

    Text is lowercased and split on whitespace; each distinct word gets the
    next free integer ID. IDs 0 and 1 are reserved for the special tokens
    ``<PAD>`` and ``<UNK>``.
    """

    # Special tokens, named once so encode/decode/__init__ cannot disagree
    PAD_TOKEN = '<PAD>'
    UNK_TOKEN = '<UNK>'

    def __init__(self):
        # Initialize vocabulary with special tokens (reserved IDs 0 and 1)
        self.word_to_id = {self.PAD_TOKEN: 0, self.UNK_TOKEN: 1}
        # Derive the reverse mapping from the forward one so they stay in sync
        self.id_to_word = {idx: word for word, idx in self.word_to_id.items()}

    def fit(self, texts):
        """Build vocabulary from training texts.

        ``texts`` is an iterable of strings. A single string is treated as
        one text; without this guard it would be iterated character by
        character and produce a vocabulary of letters. Calling ``fit`` again
        extends the existing vocabulary; IDs already assigned never change.
        """
        if isinstance(texts, str):
            texts = [texts]
        for text in texts:
            for word in text.lower().split():
                if word not in self.word_to_id:
                    idx = len(self.word_to_id)   # Next unused ID
                    self.word_to_id[word] = idx  # Add new word
                    self.id_to_word[idx] = word  # Reverse mapping

    def encode(self, text):
        """Convert text to a list of token IDs; unknown words map to <UNK>."""
        unk_id = self.word_to_id[self.UNK_TOKEN]
        return [self.word_to_id.get(w, unk_id) for w in text.lower().split()]

    def decode(self, ids):
        """Convert token IDs back to space-separated text; unknown IDs become '<UNK>'."""
        return ' '.join(self.id_to_word.get(i, self.UNK_TOKEN) for i in ids)

    def __len__(self):
        """Return vocabulary size, special tokens included (enables len(tok))."""
        return len(self.word_to_id)

# Build a tokenizer and let it learn the vocabulary from our training texts
tok = SimpleTokenizer()
tok.fit(texts)

print(f'Vocabulary size: {len(tok)}')

# Encode a sentence, then decode the IDs back to text
text = 'The secret to AI'
ids = tok.encode(text)
print(f'Encoded: {ids}')
print(f'Decoded: {tok.decode(ids)}')

# One last comparison against GPT-2 on the same text
gpt2_tok = GPT2Tokenizer.from_pretrained('gpt2')
print(f'Your tokenizer:  {tok.encode(text)}')
print(f'GPT-2 tokenizer: {gpt2_tok.encode(text)}')  # GPT-2 uses Byte-Pair Encoding (BPE)