In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from get_device import get_device

# Select the compute device through the project-local `get_device` helper.
# NOTE(review): the helper's selection order is not visible in this notebook
# (the recorded run picked "mps") — presumably CUDA/MPS with a CPU fallback; confirm.
device = get_device()
print(f"Using device: {device}")

Using device: mps


In [2]:
from pathlib import Path

# Decode the corpus as UTF-8 explicitly so the result does not depend on the
# platform's default locale encoding.
text = Path('../../data/tiny-shakespeare.txt').read_text(encoding='utf-8')

In [3]:
print(text[0:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [4]:

class CharTokenizer:
  """Character-level tokenizer: every distinct character is one token.

  Token ids are the positions of the characters in the `vocabulary`
  sequence handed to the constructor.
  """

  def __init__(self, vocabulary):
    # Forward (char -> id) and reverse (id -> char) lookup tables.
    self.token_id_for_char = {symbol: index for index, symbol in enumerate(vocabulary)}
    self.char_for_token_id = dict(enumerate(vocabulary))

  @staticmethod
  def train_from_text(text):
    """Build a tokenizer whose vocabulary is the sorted set of characters in `text`."""
    return CharTokenizer(sorted(set(text)))

  def encode(self, text):
    """Map `text` to a 1-D `torch.long` tensor of token ids.

    A character that is not in the vocabulary raises `KeyError`.
    """
    return torch.tensor(
        [self.token_id_for_char[symbol] for symbol in text],
        dtype=torch.long,
    )

  def decode(self, token_ids):
    """Map a tensor of token ids back to the string they represent."""
    return ''.join(self.char_for_token_id[index] for index in token_ids.tolist())

  def vocabulary_size(self):
    """Return the number of distinct tokens known to the tokenizer."""
    return len(self.token_id_for_char)

In [5]:
tokenizer = CharTokenizer.train_from_text(text)

In [6]:
print(tokenizer.encode("Hello world"))
print(tokenizer.decode(tokenizer.encode("Hello world")))

tensor([20, 43, 50, 50, 53,  1, 61, 53, 56, 50, 42])
Hello world


In [7]:
print(f"Vocabulary size: {tokenizer.vocabulary_size()}")

Vocabulary size: 65


In [8]:
from torch.utils.data import Dataset

class TokenIdsDataset(Dataset):
  """Sliding-window dataset over a 1-D sequence of token ids.

  Item `pos` is the pair `(x, y)` where `x` is the `block_size` tokens
  starting at `pos` and `y` is the same window shifted one token to the
  right, i.e. the next-token targets for `x`.

  Args:
      data: Indexable sequence of token ids (e.g. the tensor returned by
          `CharTokenizer.encode`).
      block_size: Number of tokens in every `x` / `y` window.
  """

  def __init__(self, data, block_size):
    self.data = data
    self.block_size = block_size

  def __len__(self):
    # Clamp at zero: `len()` must never be negative, which would otherwise
    # happen when the data is shorter than one block.
    return max(0, len(self.data) - self.block_size)

  def __getitem__(self, pos):
    """Return the `(x, y)` window pair at `pos`.

    Negative positions count from the end, like regular sequences.

    Raises:
        IndexError: If `pos` is outside the dataset. Raising `IndexError`
            (rather than using `assert`) lets plain iteration over the
            dataset terminate and keeps the check active under `python -O`.
    """
    size = len(self)
    if pos < 0:
      pos += size
    if not 0 <= pos < size:
      raise IndexError(f"position {pos} is out of range for dataset of length {size}")

    x = self.data[pos:pos + self.block_size]
    y = self.data[pos + 1:pos + 1 + self.block_size]
    return x, y

In [9]:
# Model hyperparameters shared by every module defined below.
config = {
  "vocabulary_size": tokenizer.vocabulary_size(),  # rows in the token embedding / logits per position
  "context_size": 256,   # longest sequence: size of the positional table and the causal mask
  "embedding_dim": 768,  # width of the token / positional embeddings and of every block
  "heads_num": 12,       # attention heads per MultiHeadAttention
  "layers_num": 10,      # number of stacked transformer blocks
  "dropout_rate": 0.1,   # dropout probability used by every nn.Dropout
  "use_bias": False,     # bias flag for the Q/K/V projections of each attention head
}

# The head outputs are concatenated back into an `embedding_dim`-wide tensor,
# so the width must split evenly across heads; fail here rather than with a
# shape error deep inside the model.
assert config["embedding_dim"] % config["heads_num"] == 0, \
  "embedding_dim must be divisible by heads_num"

config["head_size"] = config["embedding_dim"] // config["heads_num"]

In [10]:
class AttentionHead(nn.Module):
  """A single head of causal (masked) scaled dot-product self-attention.

  Reads `embedding_dim`, `head_size`, `use_bias`, `dropout_rate` and
  `context_size` from `config`.
  """

  def __init__(self, config):
    super().__init__()
    embedding_dim = config["embedding_dim"]
    head_size = config["head_size"]
    use_bias = config["use_bias"]

    # Query / key / value projections, created in this fixed order.
    self.Q_weights = nn.Linear(embedding_dim, head_size, use_bias)
    self.K_weights = nn.Linear(embedding_dim, head_size, use_bias)
    self.V_weights = nn.Linear(embedding_dim, head_size, use_bias)

    self.dropout = nn.Dropout(config["dropout_rate"])

    # Lower-triangular matrix of ones: entry (i, j) is 1 when position i may
    # attend to position j (j <= i). Stored as a buffer so it follows the
    # module across devices without being a trainable parameter.
    self.register_buffer(
        'casual_attention_mask',
        torch.tril(torch.ones(config["context_size"], config["context_size"])),
    )

  def forward(self, input):
    """Attend over `input` of shape (batch, tokens, embedding_dim).

    Returns a tensor of shape (batch, tokens, head_size).
    """
    _, tokens_num, _ = input.shape
    queries = self.Q_weights(input)
    keys = self.K_weights(input)
    values = self.V_weights(input)

    # Positions a token must not look at (everything to its right).
    blocked = self.casual_attention_mask[:tokens_num, :tokens_num] == 0

    # Similarity of every query with every key; blocked pairs become -inf so
    # the softmax assigns them zero weight.
    raw_scores = (queries @ keys.transpose(1, 2)).masked_fill(blocked, -torch.inf)
    scaled_scores = raw_scores / (keys.shape[-1] ** 0.5)
    attention_weights = self.dropout(torch.softmax(scaled_scores, dim=-1))

    return attention_weights @ values

In [11]:
input = torch.rand(8, config["context_size"], config["embedding_dim"])

In [12]:
ah = AttentionHead(config)

In [13]:
output = ah(input)

In [14]:
output.shape

torch.Size([8, 256, 64])

In [15]:
class MultiHeadAttention(nn.Module):
  """Runs `heads_num` attention heads side by side and mixes their outputs.

  Every head sees the same input; the per-head results are concatenated
  along the feature axis, passed through a linear projection and then
  through dropout.
  """

  def __init__(self, config):
    super().__init__()

    self.heads = nn.ModuleList(
        [AttentionHead(config) for _ in range(config["heads_num"])]
    )

    # Output projection applied to the concatenated head outputs.
    self.linear = nn.Linear(config["embedding_dim"], config["embedding_dim"])
    self.dropout = nn.Dropout(config["dropout_rate"])

  def forward(self, input):
    """Apply all heads to `input` and return the projected combination."""
    per_head_outputs = tuple(head(input) for head in self.heads)

    # Join the heads on the last (feature) axis, then mix them.
    combined = torch.cat(per_head_outputs, dim=-1)
    projected = self.linear(combined)
    return self.dropout(projected)

In [16]:
mha = MultiHeadAttention(config)

In [17]:
input = torch.rand(8, config["context_size"], config["embedding_dim"])

In [18]:
output = mha(input)

In [19]:
output.shape

torch.Size([8, 256, 768])

In [20]:
class FeedForward(nn.Module):
  """Position-wise feed-forward network of a transformer block.

  Expands the features to four times `embedding_dim`, applies GELU,
  projects back to `embedding_dim` and finishes with dropout.
  """

  def __init__(self, config):
    super().__init__()

    width = config["embedding_dim"]
    hidden_width = width * 4

    self.linear_layers = nn.Sequential(
        nn.Linear(width, hidden_width),   # expand
        nn.GELU(),
        nn.Linear(hidden_width, width),   # contract
        nn.Dropout(config["dropout_rate"]),
    )

  def forward(self, input):
    """Run `input` through the expand -> GELU -> contract -> dropout stack."""
    transformed = self.linear_layers(input)
    return transformed

In [21]:
ff = FeedForward(config)

In [22]:
input = torch.rand(8, config["context_size"], config["embedding_dim"])

In [None]:
output = ff(input)

In [24]:
output.shape

torch.Size([8, 256, 768])

In [25]:
class Block(nn.Module):
  """One pre-norm transformer block.

  Two sub-layers, each wrapped as `sublayer(LayerNorm(x)) + x`:
  multi-head self-attention followed by the feed-forward network.
  """

  def __init__(self, config):
    super().__init__()

    self.multi_head = MultiHeadAttention(config)
    self.layer_norm_1 = nn.LayerNorm(config["embedding_dim"])

    self.feed_forward = FeedForward(config)
    self.layer_norm_2 = nn.LayerNorm(config["embedding_dim"])

  def forward(self, input):
    """Apply the attention and feed-forward sub-layers with residual connections."""
    # Attention sub-layer: normalise first, then add the untouched input back.
    after_attention = self.multi_head(self.layer_norm_1(input)) + input

    # Feed-forward sub-layer, same normalise-then-add pattern.
    return self.feed_forward(self.layer_norm_2(after_attention)) + after_attention

In [26]:
b = Block(config)

In [27]:
# Store the result under `output` (the name was misspelled) so the shape
# check in the following cell inspects this Block's result rather than a
# stale tensor from an earlier cell.
output = b(input)

In [28]:
output.shape

torch.Size([8, 256, 768])


---

# Complete GPT Model Architecture

###### Model Overview

The `DemoGPT` class implements a complete transformer-based language model following the GPT (Generative Pre-trained Transformer) architecture. It combines token embeddings, positional embeddings, multiple transformer blocks, and an output projection to perform next-token prediction.

###### Component Breakdown

**Embedding Layers:**
```python
self.token_embedding_layer = nn.Embedding(config["vocabulary_size"], config["embedding_dim"])
self.positional_embedding_layer = nn.Embedding(config["context_size"], config["embedding_dim"])
```

- **Token embeddings**: Map each token ID to a dense vector representation
- **Positional embeddings**: Add position-specific information to distinguish token order

**Transformer Stack:**
```python
blocks = [Block(config) for _ in range(config["layers_num"])]
self.layers = nn.Sequential(*blocks)
```

Creates a stack of transformer blocks (typically 10-96 layers) for deep processing.

**Output Processing:**
```python
self.layer_norm = nn.LayerNorm(config["embedding_dim"])
self.unembedding = nn.Linear(config["embedding_dim"], config["vocabulary_size"], bias=False)
```

- **Final layer norm**: Stabilizes the output representations
- **Unembedding**: Projects back to vocabulary space for token prediction

###### Forward Pass Analysis

**Step 1: Token Embedding**
```python
x = self.token_embedding_layer(token_ids)
```
- Input: `(batch_size, sequence_length)` of token IDs
- Output: `(batch_size, sequence_length, embedding_dim)` of dense vectors

**Step 2: Positional Embedding Addition**
```python
sequence = torch.arange(tokens_num, device=token_ids.device)
x = x + self.positional_embedding_layer(sequence)
```

**Note**: The position indices are created on `token_ids.device` rather than on a notebook-level `device` variable. The forward pass therefore does not depend on any global state and works on whichever device the input tensor lives on.

**Mathematical Operation:**
Each position gets both content and positional information:
$$\text{input\_to\_blocks} = \text{TokenEmb}(\text{token\_ids}) + \text{PosEmb}(\text{positions})$$

**Step 3: Transformer Processing**
```python
x = self.layers(x)
```
Passes through all transformer blocks sequentially, with each block applying attention and feed-forward transformations.

**Step 4: Output Normalization**
```python
x = self.layer_norm(x)
```
Final layer normalization ensures stable representations before output projection.

**Step 5: Vocabulary Projection**
```python
x = self.unembedding(x)
```
- Maps from embedding space back to vocabulary space
- Output: `(batch_size, sequence_length, vocabulary_size)`
- Each position gets a vector of unnormalized scores (logits) over all possible next tokens; applying softmax to it yields the probability distribution

###### Key Architecture Decisions

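**Separate Embedding Matrices:**
The model uses separate embedding matrices for tokens and positions, allowing independent learning of semantic and positional representations. The output `unembedding` layer is likewise its own matrix; its weights are not tied to the token embedding.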

**No Bias in Final Layer:**
```python
bias=False
```
Common practice in modern language models to reduce parameters and improve training dynamics.

**Additive Positional Encoding:**
Uses learned positional embeddings added to token embeddings, rather than the sinusoidal encodings from the original transformer paper.

###### Dimensional Flow Example

```python
# Example with config: vocabulary_size=50000, embedding_dim=768, context_size=1024
# Input token_ids: (32, 512)  # batch_size=32, sequence_length=512

# After token embedding: (32, 512, 768)
# After positional embedding addition: (32, 512, 768)
# After transformer blocks: (32, 512, 768)
# After final layer norm: (32, 512, 768)  
# After unembedding: (32, 512, 50000)  # Logits for each vocabulary token
```

###### Training Usage

During training, the output logits are used with cross-entropy loss:
```python
# model output: (batch_size, sequence_length, vocab_size)
# targets: (batch_size, sequence_length)
loss = criterion(logits.view(-1, vocab_size), targets.view(-1))
```

###### Generation Usage

For text generation, the model predicts one token at a time:
```python
# Get logits for last position
next_token_logits = model(input_ids)[:, -1, :]  # (batch_size, vocab_size)
# Sample or take argmax to get next token
next_token = torch.multinomial(torch.softmax(next_token_logits, dim=-1), 1)
```

This architecture represents a complete autoregressive language model capable of learning complex language patterns and generating coherent text through next-token prediction.

---

In [29]:
import torch
import torch.nn as nn

class DemoGPT(nn.Module):
  """
  Minimal GPT-style transformer for next-token prediction.

  The forward pass is a straight pipeline:
  1. Look up a learned embedding for every token id and for every position,
     and add the two.
  2. Run the result through `layers_num` stacked `Block`s.
  3. Apply a final LayerNorm and a bias-free linear "unembedding" layer that
     turns each hidden state into one logit per vocabulary entry.
  """
  def __init__(self, config):
    """
    Build all sub-modules of the model.

    Args:
        config (dict): Model hyperparameters. Keys read here:
            - "vocabulary_size" (int): Number of distinct token ids.
            - "embedding_dim" (int): Width of the embeddings and hidden states.
            - "context_size" (int): Number of rows in the positional embedding table.
            - "layers_num" (int): How many `Block`s to stack.
            The same dict is passed on to every `Block`, which reads further keys.
    """
    super().__init__()

    vocabulary_size = config["vocabulary_size"]
    embedding_dim = config["embedding_dim"]

    # Learned lookup tables: one row per token id, one row per position.
    self.token_embedding_layer = nn.Embedding(vocabulary_size, embedding_dim)
    self.positional_embedding_layer = nn.Embedding(config["context_size"], embedding_dim)

    # The transformer stack; nn.Sequential feeds each block's output to the next.
    self.layers = nn.Sequential(
        *[Block(config) for _ in range(config["layers_num"])]
    )

    # Output head: normalise, then project hidden states to vocabulary logits.
    self.layer_norm = nn.LayerNorm(embedding_dim)
    self.unembedding = nn.Linear(embedding_dim, vocabulary_size, bias=False)

  def forward(self, token_ids):
    """
    Compute next-token logits for every position of every sequence.

    Args:
        token_ids (torch.Tensor): Token ids of shape (B, T), with B the
            batch size and T the sequence length.

    Returns:
        torch.Tensor: Logits of shape (B, T, V), with V the vocabulary size.
    """
    _, tokens_num = token_ids.shape

    # Position indices [0, 1, ..., T-1], created on the same device as the input.
    positions = torch.arange(tokens_num, device=token_ids.device)

    # (B, T, E) token embeddings plus (T, E) positional embeddings; the
    # positional term broadcasts over the batch dimension.
    hidden = self.token_embedding_layer(token_ids) + self.positional_embedding_layer(positions)

    # Transformer blocks followed by the final normalisation.
    hidden = self.layer_norm(self.layers(hidden))

    # Hidden states -> vocabulary logits, shape (B, T, V).
    return self.unembedding(hidden)

In [30]:
model = DemoGPT(config).to(device)

In [31]:
output = model(tokenizer.encode("Hi").unsqueeze(dim=0).to(device))

In [32]:
output.shape

torch.Size([1, 2, 65])


---

# Autoregressive Text Generation Function

###### Purpose and Functionality

The `generate` function implements autoregressive text generation for language models, producing new tokens one at a time by sampling from the model's predicted probability distributions. This is the standard approach for generating coherent text sequences from transformer-based language models.

###### Function Parameters

```python
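def generate(model, prompt_ids, max_tokens_to_generate, config):
```

- **model**: The trained GPT model instance
- **prompt_ids**: Initial token sequence to start generation (shape: `(batch_size, prompt_length)`, usually with `batch_size = 1`)
- **max_tokens_to_generate**: Maximum number of new tokens to generate
- **config**: The model's configuration dictionary; `config["context_size"]` is used for the context length check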

###### Step-by-Step Generation Process

**Initialization:**
```python
output_ids = prompt_ids
```
Starts with the provided prompt as the foundation for generation.

**Generation Loop:**
```python
for _ in range(max_tokens):
```
Iteratively generates tokens up to the specified maximum.

**Context Length Check:**
```python
if output_ids.shape[1] >= config["context_size"]:
    break
```
Prevents exceeding the model's maximum context window (256 tokens with this notebook's `config`); generation stops early once the sequence reaches that length.

**Forward Pass (No Gradients):**
```python
with torch.no_grad():
    logits = model(output_ids)
```
- Disables gradient computation for efficiency during inference
- Gets model predictions for all positions in the sequence
- Output shape: `(batch_size, sequence_length, vocabulary_size)`

**Next Token Prediction:**
```python
logits = logits[:, -1, :]  # Extract last position logits
probs = F.softmax(logits, dim=-1)  # Convert to probabilities
```
- Extracts logits for the last position only (next token prediction)
- Applies softmax to convert raw logits to probability distribution
- Result shape: `(batch_size, vocabulary_size)`

**Sampling Strategy:**
```python
next_token_id = torch.multinomial(probs, num_samples=1)
```
Uses multinomial sampling to select next token based on probability distribution rather than always choosing the highest probability token (greedy decoding).

**Sequence Extension:**
```python
output_ids = torch.cat([output_ids, next_token_id], dim=-1)
```
Appends the newly generated token to the existing sequence for the next iteration.

###### Sampling vs. Greedy Decoding

**Multinomial Sampling (Used Here):**
- Introduces randomness and diversity in generation
- Tokens with higher probability are more likely to be selected
- Produces more creative and varied outputs
- Can occasionally select lower-probability but contextually interesting tokens

**Greedy Decoding (Alternative):**
```python
next_token_id = torch.argmax(logits, dim=-1, keepdim=True)
```
- Always selects the highest probability token
- Deterministic output (same input always produces same output)
- Often leads to repetitive or predictable text

###### Generation Example Flow

```python
# Initial prompt: "The cat sat on the"
# prompt_ids: [464, 3857, 3332, 319, 262]  # Token IDs

# Iteration 1:
# Model predicts probabilities: [mat: 0.4, chair: 0.3, floor: 0.2, ...]
# Sample → "mat" (token_id: 2603)
# output_ids: [464, 3857, 3332, 319, 262, 2603]

# Iteration 2:
# Model sees "The cat sat on the mat"
# Predicts next token probabilities: [and: 0.5, while: 0.2, .: 0.15, ...]
# Sample → "and" (token_id: 290)
# output_ids: [464, 3857, 3332, 319, 262, 2603, 290]
```

###### Key Design Considerations

**Memory Efficiency:**
Using `torch.no_grad()` prevents unnecessary gradient computation and memory allocation during inference.

**Context Management:**
The function respects the model's context length limit, preventing out-of-bounds errors.

**Stochastic Generation:**
Multinomial sampling introduces controlled randomness, balancing coherence with creativity.

**Incremental Processing:**
Each iteration processes the entire sequence, allowing the model to consider full context when predicting the next token.

This generation approach forms the foundation for interactive chatbots, creative writing assistants, and other applications requiring coherent text continuation from language models.

---

In [33]:
import torch
import torch.nn.functional as F

def generate(model, prompt_ids, max_tokens_to_generate, config):
    """
    Extend a prompt token by token by sampling from the model's predictions.

    Every step feeds the whole sequence produced so far to the model, keeps
    only the logits of the last position, turns them into probabilities with
    softmax, samples one token id from that distribution and appends it.
    Generation ends after `max_tokens_to_generate` steps, or earlier once the
    sequence length reaches `config["context_size"]`.

    Args:
        model (nn.Module): Callable mapping token ids of shape (B, T) to
            logits of shape (B, T, V), e.g. a trained DemoGPT.
        prompt_ids (torch.Tensor): Prompt token ids of shape (B, T); B is
            usually 1 for generation.
        max_tokens_to_generate (int): Upper bound on the number of new tokens.
        config (dict): Model configuration; only `context_size` is read.

    Returns:
        torch.Tensor: Shape (B, T + generated_tokens): the prompt followed by
            the sampled tokens.
    """
    sequence_ids = prompt_ids

    for _step in range(max_tokens_to_generate):
        # No room left in the context window: stop generating.
        if not sequence_ids.shape[1] < config["context_size"]:
            break

        # Inference only, so skip gradient bookkeeping for the forward pass.
        with torch.no_grad():
            all_logits = model(sequence_ids)

        # Only the last position predicts the token that comes next.
        next_token_probs = F.softmax(all_logits[:, -1, :], dim=-1)

        # Draw one token id per sequence, weighted by its probability.
        sampled_id = torch.multinomial(next_token_probs, num_samples=1)

        sequence_ids = torch.cat([sequence_ids, sampled_id], dim=1)

    return sequence_ids

In [43]:
# def generate_with_prompt(model, tokenizer, prompt, max_tokens=100):
#   model.eval()

#   prompt = tokenizer.encode(prompt).unsqueeze(dim=0).to(device)

#   return tokenizer.decode(generate(model, prompt, max_tokens=max_tokens)[0])


def generate_with_prompt(model, tokenizer, config, prompt, max_tokens_to_generate=100):
  """
  Generates text from a prompt using the specified model and tokenizer.

  The model is switched to evaluation mode, the prompt is encoded into a
  batch of one sequence and moved to the device that holds the model's
  parameters, `generate` extends it, and the resulting token ids are decoded
  back into text.

  Args:
      model (nn.Module): The trained transformer model.
      tokenizer (CharTokenizer): The tokenizer for encoding/decoding text.
      config (dict): The model's configuration dictionary.
      prompt (str): The initial text to start generation from.
      max_tokens_to_generate (int): The maximum number of new tokens to create.

  Returns:
      str: The generated text, including the original prompt.
  """
  model.eval()

  # Shape (1, T): a batch containing the single encoded prompt.
  prompt_ids = tokenizer.encode(prompt).unsqueeze(dim=0)

  # Put the prompt on the device the model's weights live on instead of
  # relying on a notebook-level `device` variable, which may be undefined or
  # point at a different device than the model actually passed in.
  first_parameter = next(model.parameters(), None)
  if first_parameter is not None:
    prompt_ids = prompt_ids.to(first_parameter.device)

  generated_ids = generate(
      model,
      prompt_ids,
      max_tokens_to_generate=max_tokens_to_generate,
      config=config
  )

  # Decode the only sequence in the batch.
  return tokenizer.decode(generated_ids[0])

In [45]:
# generate_with_prompt(model, tokenizer, "First Citizen:\n")

generate_with_prompt(model, tokenizer, config, "First Citizen:\n")

"First Citizen:\n.Mnr\nS'cW-FyGGuvZ: LMkGRvPGRarz eu;cdrkUt FcwNa! XVStOhgg!!Yw-ForftaIqIvv,zn;hAitYBYxiRkBEOqn-MC:tc\n"