# Chapter 5 Companion Notebook
**Build Your First LLM — Chapter 5: Your First Python Program**

This notebook bundles the runnable code examples from Chapter 5. Run cells top-to-bottom.

- Installs: transformers (for the GPT-2 generation demo and the GPT-2 tokenizer comparison)
- Data: tiny inline strings; no external files needed
- Runtime: CPU is fine; a GPU only speeds up the GPT-2 generation call


In [None]:
# Install a pinned transformers release; -q reduces pip's console output.
!pip install -q transformers==4.46.1
import warnings
# Suppress every Python warning from here on so the cell outputs stay short.
warnings.filterwarnings('ignore')
# NOTE: `logging` below is transformers' logging utility, not the stdlib module.
from transformers import pipeline, logging
# Only let transformers report messages at error level.
logging.set_verbosity_error()
print('Setup complete')


## Quick win: GPT-2 text generation
Run a tiny GPT-2 generation to see an LLM in action.

In [None]:
# Build a text-generation pipeline backed by the pretrained 'gpt2' checkpoint.
generator = pipeline('text-generation', model='gpt2')

# Sample up to 20 new tokens after the prompt. Because do_sample=True the
# continuation differs from run to run.
result = generator(
    'The secret to building AI is',
    max_new_tokens=20,
    do_sample=True,
    # Pad with the tokenizer's own end-of-text token id instead of repeating
    # the literal 50256 here, so the value cannot drift from the loaded model.
    pad_token_id=generator.tokenizer.eos_token_id,
)

# The pipeline returns a list of dicts; the text is under 'generated_text'.
print(result[0]['generated_text'])


## Strings and basic ops
Working with text, lengths, and slices.

In [None]:
# A sample model output and the prompt that it starts with.
output = 'The secret to building AI is understanding how machines learn from data'
prompt = 'The secret to building AI is'

# Length and type of the full string, one per line.
print(len(output), type(output), sep='\n')

# Whitespace-split the text twice: as written, then lower-cased.
# `words` ends up holding the lower-cased list.
for variant in (output, output.lower()):
    words = variant.split()
    print(words)

# Everything after the prompt's length is the continuation.
prompt_length = len(prompt)
generated = output[prompt_length:]
generated_words = generated.split()
print(f'Generated: {generated.strip()}')
print(f'Word count: {len(generated_words)}')


## Numbers and formatting
Basic numeric values and f-strings.

In [None]:
# Underscores inside numeric literals are ignored by Python; they only make
# long numbers easier to read.
num_parameters = 124_000_000
learning_rate = 1e-4  # scientific notation for 0.0001
vocab_size = 50_257

# The ',' format spec inserts thousands separators.
print(f'GPT-2 has {num_parameters:,} parameters')


## Build a tiny vocabulary and tokenizer
From toy sentences to a word-level tokenizer.

In [None]:
# Toy corpus used to build the vocabulary.
texts = [
    'The secret to building AI is',
    'The key to machine learning is data',
    'AI systems learn from examples',
]

# Flatten the corpus into one list of lower-cased, whitespace-separated words.
all_words = [word for text in texts for word in text.lower().split()]
print(all_words)
print(all_words[0], all_words[-1], all_words[:3])

# Ids 0 and 1 are reserved for the special tokens. setdefault only inserts a
# word the first time it is seen, giving it the next unused id.
vocab = {'<PAD>': 0, '<UNK>': 1}
for word in all_words:
    vocab.setdefault(word, len(vocab))

print(f'Vocabulary size: {len(vocab)}')
print(vocab)
print(vocab['the'], vocab['ai'])


## Compare with GPT-2 tokenizer
Show how a production tokenizer differs.

In [None]:
from transformers import GPT2Tokenizer

# Load the pretrained GPT-2 tokenizer and compare vocabulary sizes.
real_tok = GPT2Tokenizer.from_pretrained('gpt2')
print(f'Our vocab: {len(vocab)} words')
print(f'GPT-2 vocab: {len(real_tok)} tokens')

# Any word missing from the toy vocabulary falls back to the <UNK> id.
unk_id = vocab['<UNK>']
word = 'neural'
print(f"'{word}' → {vocab.get(word, unk_id)}")

# Encode a whole sentence word by word with the same fallback.
sentence = 'The neural network learns'
lowered_words = sentence.lower().split()
token_ids = [vocab.get(piece, unk_id) for piece in lowered_words]
print(f'Token IDs: {token_ids}')


## Tokenize and detokenize helpers
Round-trip a sentence.

In [None]:
def tokenize(text, vocab):
    """Convert text into a list of vocabulary ids.

    The text is lower-cased and split on whitespace. Each word is looked up
    in ``vocab``; a word that is not present maps to ``vocab['<UNK>']``.
    """
    lowered = text.lower()
    return [vocab.get(token, vocab['<UNK>']) for token in lowered.split()]

def detokenize(ids, vocab):
    """Convert vocabulary ids back into a space-separated string.

    ``vocab`` maps word -> id, so it is inverted first. An id with no entry
    in the inverted mapping is rendered as the literal '<UNK>'.
    """
    id_to_word = dict(zip(vocab.values(), vocab.keys()))
    words = [id_to_word.get(token_id, '<UNK>') for token_id in ids]
    return ' '.join(words)

# Round-trip one sentence through the helper functions.
text = 'The secret to AI'
ids = tokenize(text, vocab)
print(f'Encoded: {ids}')
print(f'Decoded: {detokenize(ids, vocab)}')

# The same sentence encoded by the toy vocabulary and by GPT-2's tokenizer.
print(f'Our tokens:   {tokenize(text, vocab)}')
print(f'GPT-2 tokens: {real_tok.encode(text)}')


## A minimal tokenizer class
Stateful, word-level tokenizer with fit/encode/decode.

In [None]:
class SimpleTokenizer:
    """Word-level tokenizer that grows its vocabulary from example texts.

    Text is lower-cased and split on whitespace. Ids 0 and 1 are reserved for
    the '<PAD>' and '<UNK>' tokens; each new word seen by ``fit`` takes the
    next unused id.
    """

    def __init__(self):
        """Start with only the two special tokens in both lookup tables."""
        self.word_to_id = {'<PAD>': 0, '<UNK>': 1}
        # Reverse table, kept in sync with word_to_id by fit().
        self.id_to_word = {
            index: token for token, index in self.word_to_id.items()
        }

    def fit(self, texts):
        """Add every unseen word from ``texts`` to the vocabulary."""
        for text in texts:
            for word in text.lower().split():
                if word in self.word_to_id:
                    continue
                # The current vocabulary size is the next free id.
                next_id = len(self.word_to_id)
                self.word_to_id[word] = next_id
                self.id_to_word[next_id] = word

    def encode(self, text):
        """Return the id of each word in ``text``; unseen words become 1."""
        lowered_words = text.lower().split()
        return [self.word_to_id.get(token, 1) for token in lowered_words]

    def decode(self, ids):
        """Join the words for ``ids`` with spaces; unknown ids become '<UNK>'."""
        words = [self.id_to_word.get(token_id, '<UNK>') for token_id in ids]
        return ' '.join(words)

    def __len__(self):
        """Return the vocabulary size, special tokens included."""
        return len(self.word_to_id)

# Fit the tokenizer on the toy corpus and report the vocabulary size.
tok = SimpleTokenizer()
tok.fit(texts)
print(f'Vocabulary size: {len(tok)}')

# Round-trip one sentence through encode/decode.
text = 'The secret to AI'
ids = tok.encode(text)
print(f'Encoded: {ids}')
print(f'Decoded: {tok.decode(ids)}')

# Encode the same sentence with both tokenizers for comparison.
gpt2_tok = GPT2Tokenizer.from_pretrained('gpt2')
print(f'Your tokenizer:  {tok.encode(text)}')
print(f'GPT-2 tokenizer: {gpt2_tok.encode(text)}')
