In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from get_device import get_device


# Use CUDA if available
device = get_device()

In [5]:
from pathlib import Path

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
text = Path('../../data/tiny-shakespeare.txt').read_text(encoding='utf-8')

In [6]:
print(text[0:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [7]:

class CharTokenizer:
  """Character-level tokenizer mapping each distinct character to an integer id.

  Args:
    vocabulary: Iterable of characters; a character's position in the
      iteration order becomes its token id.
  """

  def __init__(self, vocabulary):
    # Materialize once so a one-shot iterable (e.g. a generator) can feed
    # both lookup tables instead of being exhausted by the first one.
    vocabulary = list(vocabulary)
    self.token_id_for_char = {char: token_id for token_id, char in enumerate(vocabulary)}
    self.char_for_token_id = dict(enumerate(vocabulary))

  @staticmethod
  def train_from_text(text):
    """Build a tokenizer whose vocabulary is the sorted set of characters in `text`."""
    return CharTokenizer(sorted(set(text)))

  def encode(self, text):
    """Convert `text` to a 1-D `torch.long` tensor of token ids.

    Raises:
      KeyError: if `text` contains a character outside the vocabulary.
    """
    token_ids = [self.token_id_for_char[char] for char in text]
    return torch.tensor(token_ids, dtype=torch.long)

  def decode(self, token_ids):
    """Convert token ids back to a string.

    Args:
      token_ids: A 1-D tensor (anything exposing `.tolist()`) or a plain
        iterable of integer ids.

    Raises:
      KeyError: if an id is not in the vocabulary.
    """
    ids = token_ids.tolist() if hasattr(token_ids, "tolist") else token_ids
    return ''.join(self.char_for_token_id[token_id] for token_id in ids)

  def vocabulary_size(self):
    """Return the number of distinct characters known to the tokenizer."""
    return len(self.token_id_for_char)

In [8]:
tokenizer = CharTokenizer.train_from_text(text)

In [9]:
print(tokenizer.encode("Hello world"))
print(tokenizer.decode(tokenizer.encode("Hello world")))

tensor([20, 43, 50, 50, 53,  1, 61, 53, 56, 50, 42])
Hello world


In [10]:
print(f"Vocabulary size: {tokenizer.vocabulary_size()}")

Vocabulary size: 65


In [11]:
from torch.utils.data import Dataset

class TokenIdsDataset(Dataset):
  """Sliding-window dataset of (input, target) blocks over a token-id sequence.

  Item `pos` is the pair
  `(data[pos:pos + block_size], data[pos + 1:pos + 1 + block_size])`,
  i.e. the target is the input shifted one position to the right.

  Args:
    data: Sequence of token ids supporting `len()` and slicing.
    block_size: Number of tokens in each input/target block.
  """

  def __init__(self, data, block_size):
    self.data = data
    self.block_size = block_size

  def __len__(self):
    # Clamp at zero: `len()` raises ValueError on a negative result, which
    # would otherwise happen when the data is shorter than `block_size`.
    return max(0, len(self.data) - self.block_size)

  def __getitem__(self, pos):
    """Return the `(x, y)` pair starting at `pos`.

    Negative positions count from the end, as with built-in sequences.

    Raises:
      IndexError: if `pos` is outside `[-len(self), len(self))`.
    """
    size = len(self)
    # Build a new value instead of `pos += size` so a tensor index passed by
    # the caller is never modified in place.
    index = pos + size if pos < 0 else pos
    # IndexError (rather than an assert) survives `python -O` and lets plain
    # iteration over the dataset stop at the end via the sequence protocol.
    if not 0 <= index < size:
      raise IndexError(f"position {pos} is out of range for dataset of length {size}")

    x = self.data[index:index + self.block_size]
    y = self.data[index + 1:index + 1 + self.block_size]
    return x, y

In [12]:
# Model hyperparameters shared by every module defined below.
config = {
  "vocabulary_size": tokenizer.vocabulary_size(),
  "context_size": 256,
  "embedding_dim": 768,
  "heads_num": 12,
  "layers_num": 10,
  "dropout_rate": 0.1,
  "use_bias": False,
}

# Each head produces `head_size` features and the heads are concatenated, so
# the floor division below must be exact for the widths to add back up to
# `embedding_dim`. Fail here rather than with a shape error later.
assert config["embedding_dim"] % config["heads_num"] == 0, \
  "embedding_dim must be divisible by heads_num"
config["head_size"] = config["embedding_dim"] // config["heads_num"]

In [13]:
class AttentionHead(nn.Module):
  """One causal self-attention head.

  Projects the input to queries, keys and values of width `head_size` and
  lets every position attend only to itself and to earlier positions.

  Args:
    config (dict): Reads "embedding_dim", "head_size", "use_bias",
      "dropout_rate" and "context_size".
  """

  def __init__(self, config):
    super().__init__()
    in_features = config["embedding_dim"]
    out_features = config["head_size"]
    use_bias = config["use_bias"]

    self.Q_weights = nn.Linear(in_features, out_features, bias=use_bias)
    self.K_weights = nn.Linear(in_features, out_features, bias=use_bias)
    self.V_weights = nn.Linear(in_features, out_features, bias=use_bias)

    self.dropout = nn.Dropout(config["dropout_rate"])

    # Lower-triangular ones: entry (i, j) is 1 when position i may look at j.
    # NOTE: "casual" is a misspelling of "causal"; the buffer name is kept
    # because it is also the key the mask is stored under in state_dict().
    context_size = config["context_size"]
    self.register_buffer(
        'casual_attention_mask',
        torch.tril(torch.ones(context_size, context_size)),
    )

  def forward(self, input):
    """Apply masked, scaled dot-product attention.

    Args:
      input (torch.Tensor): Shape (batch_size, tokens_num, embedding_dim).

    Returns:
      torch.Tensor: Shape (batch_size, tokens_num, head_size).
    """
    _, tokens_num, _ = input.shape
    queries = self.Q_weights(input)
    keys = self.K_weights(input)
    values = self.V_weights(input)

    # True wherever a position would be looking at a later position.
    hidden_positions = self.casual_attention_mask[:tokens_num, :tokens_num] == 0
    scores = (queries @ keys.transpose(1, 2)).masked_fill(hidden_positions, -torch.inf)
    # Scaling after masking is harmless: -inf stays -inf.
    scores = scores / (keys.shape[-1] ** 0.5)
    weights = self.dropout(torch.softmax(scores, dim=-1))

    return weights @ values

In [15]:
# Smoke test: run one attention head on a random batch of 8 full-length sequences.
input = torch.rand(8, config["context_size"], config["embedding_dim"])

ah = AttentionHead(config)

output = ah(input)
# Last expression of the cell, so the notebook displays the output shape.
output.shape

torch.Size([8, 256, 64])

In [16]:
class MultiHeadAttention(nn.Module):
  """Runs several `AttentionHead`s on the same input and mixes their outputs.

  The per-head outputs are concatenated along the feature dimension, passed
  through a linear projection of width `embedding_dim`, then through dropout.

  Args:
    config (dict): Reads "heads_num", "embedding_dim" and "dropout_rate";
      the same dict is passed on to every `AttentionHead`.
  """

  def __init__(self, config):
    super().__init__()

    self.heads = nn.ModuleList(
        [AttentionHead(config) for _ in range(config["heads_num"])]
    )

    self.linear = nn.Linear(config["embedding_dim"], config["embedding_dim"])
    self.dropout = nn.Dropout(config["dropout_rate"])

  def forward(self, input):
    """Return the projected concatenation of every head's output for `input`."""
    concatenated = torch.cat([head(input) for head in self.heads], dim=-1)
    return self.dropout(self.linear(concatenated))

In [17]:
mha = MultiHeadAttention(config)

In [18]:
input = torch.rand(8, config["context_size"], config["embedding_dim"])

In [19]:
output = mha(input)

In [20]:
output.shape

torch.Size([8, 256, 768])


---

#### Feed-Forward Network in Transformer Architecture

```python
class FeedForward(nn.Module):
    """
    Position-wise feed-forward network used in transformer blocks.
    
    This module implements a two-layer fully connected network with GELU activation
    that processes each position in the sequence independently. It expands the 
    embedding dimension by a factor of 4, applies non-linear activation, then
    projects back to the original dimension.
    
    The architecture follows the standard transformer design:
    embedding_dim → 4×embedding_dim → embedding_dim
    
    Args:
        config (dict): Configuration dictionary containing:
            - embedding_dim (int): The input/output dimension size
            - dropout_rate (float): Dropout probability for regularization
    """
```

##### Architecture Overview

**Two-Layer Structure:**
```python
nn.Linear(config["embedding_dim"], config["embedding_dim"] * 4),  # Expansion
nn.GELU(),                                                        # Activation
nn.Linear(config["embedding_dim"] * 4, config["embedding_dim"]), # Contraction
nn.Dropout(config["dropout_rate"])                               # Regularization
```

**Dimension Transformation:**
- Input: `(batch_size, sequence_length, embedding_dim)`
- Hidden: `(batch_size, sequence_length, embedding_dim × 4)`
- Output: `(batch_size, sequence_length, embedding_dim)`

##### GELU Activation Function

**GELU (Gaussian Error Linear Unit)** is defined mathematically as:
$$\text{GELU}(x) = x \cdot \Phi(x)$$
where $\Phi(x)$ is the cumulative distribution function of the standard normal distribution.

**Tanh approximation (used by some implementations; `nn.GELU()` computes the exact form by default):**
$$\text{GELU}(x) \approx 0.5x\left(1 + \tanh\left(\sqrt{\frac{2}{\pi}}(x + 0.044715x^3)\right)\right)$$

**Properties of GELU:**
- **Smooth**: Unlike ReLU, GELU is differentiable everywhere
- **Non-monotonic**: Dips slightly below zero for negative inputs (minimum ≈ −0.17 near $x \approx -0.75$) before returning toward zero
- **Probabilistic**: Based on probability theory rather than hard thresholding
- **Better gradients**: Smoother gradients compared to ReLU variants

**Comparison with other activations:**
```python
# ReLU: max(0, x) - hard cutoff at zero
# GELU: x * Φ(x) - smooth probabilistic gating
# Swish: x * sigmoid(x) - similar smooth properties
```

##### Why 4x Expansion?

**Computational Capacity:**
The 4x expansion provides sufficient representational capacity for complex transformations while maintaining computational efficiency.

**Parameter Distribution:**
```python
# Example with embedding_dim = 768
# Layer 1: 768 → 3072 (768 × 4) = 2,359,296 weights (+ 3,072 biases)
# Layer 2: 3072 → 768 = 2,359,296 weights (+ 768 biases)
# Total: ~4.7M parameters per feed-forward block
```

**Information Processing:**
- **Expansion phase**: Projects to higher-dimensional space for complex pattern recognition
- **Contraction phase**: Compresses back to original dimension while preserving learned features

##### Position-wise Processing

**Independent Processing:**
Each position in the sequence is processed independently - the same transformation is applied to every token position without interaction between positions.

**Mathematical representation:**
$$\text{FFN}(x) = \max(0, xW_1 + b_1)W_2 + b_2$$
(using ReLU in original paper, but GELU in modern implementations)

##### Role in Transformer Architecture

**Complementary to Attention:**
- **Attention**: Models relationships between positions
- **Feed-forward**: Processes individual positions with non-linear transformations

**Residual Connection Context:**
The feed-forward output is typically added to its input via residual connections:
```python
# In transformer block:
x = x + attention(x)
x = x + feedforward(x)  # This module
```

**Learning Capacity:**
The feed-forward network often contains the majority of parameters in a transformer model, providing substantial learning capacity for pattern recognition and feature transformation.

This component is essential for transformers to learn complex non-linear transformations while maintaining the ability to process variable-length sequences efficiently.

---

In [21]:
class FeedForward(nn.Module):
  """
  Two-layer MLP applied to the last (embedding) dimension of its input.

  Layer stack, in order:
  Linear(embedding_dim → 4×embedding_dim) → GELU →
  Linear(4×embedding_dim → embedding_dim) → Dropout

  Because the linear layers act only on the last dimension, every sequence
  position is transformed with the same weights and without reference to
  any other position.

  Args:
      config (dict): Configuration dictionary containing:
          - embedding_dim (int): The input/output dimension size
          - dropout_rate (float): Dropout probability for regularization
  """

  def __init__(self, config):
    super().__init__()

    width = config["embedding_dim"]
    hidden_width = width * 4

    expand = nn.Linear(width, hidden_width)
    activation = nn.GELU()
    contract = nn.Linear(hidden_width, width)
    regularize = nn.Dropout(config["dropout_rate"])

    self.linear_layers = nn.Sequential(expand, activation, contract, regularize)

  def forward(self, input):
    """Return the network's output; same shape as `input`."""
    return self.linear_layers(input)

In [None]:
# Smoke test: push a random batch of 8 full-length sequences through the
# feed-forward network.
ff = FeedForward(config)

input = torch.rand(8, config["context_size"], config["embedding_dim"])
output = ff(input)

# Last expression of the cell, so the notebook displays the output shape.
output.shape


---

# Transformer Block: Complete Processing Unit

###### Core Architecture

The `Block` class implements a complete transformer layer that combines self-attention and feed-forward processing with residual connections and layer normalization. This represents one layer of a multi-layer transformer architecture.

###### Component Structure

**Two Main Sub-modules:**
```python
self.multi_head = MultiHeadAttention(config)    # Attention mechanism
self.feed_forward = FeedForward(config)         # Position-wise processing
```

**Normalization Layers:**
```python
self.layer_norm_1 = nn.LayerNorm(config["embedding_dim"])  # Pre-attention norm
self.layer_norm_2 = nn.LayerNorm(config["embedding_dim"])  # Pre-feedforward norm
```

###### Pre-Norm Architecture

The implementation uses **Pre-LayerNorm** architecture (also called Pre-LN), where normalization is applied before each sub-module rather than after:

**Pre-Norm Flow:**
```python
# Attention sub-layer
residual = input
x = self.multi_head(self.layer_norm_1(input))  # Norm → Attention
x = x + residual                               # Add residual

# Feed-forward sub-layer  
residual = x
x = self.feed_forward(self.layer_norm_2(x))    # Norm → FFN
return x + residual                            # Add residual
```

**vs Post-Norm (original transformer):**
```python
# Would be: x = self.layer_norm_1(x + self.multi_head(x))
# Would be: x = self.layer_norm_2(x + self.feed_forward(x))
```

###### Forward Pass Breakdown

**Step 1: Attention Processing**
```python
residual = input                                    # Store original input
x = self.multi_head(self.layer_norm_1(input))     # Normalize → Attention
x = x + residual                                   # Add residual connection
```

- Input shape: `(batch_size, sequence_length, embedding_dim)`
- LayerNorm normalizes across embedding dimension for each token
- Multi-head attention processes normalized input
- Residual connection preserves original information

**Step 2: Feed-Forward Processing**
```python
residual = x                                       # Store attention output
x = self.feed_forward(self.layer_norm_2(x))      # Normalize → FFN
return x + residual                               # Add residual connection
```

- Takes attention output as input
- Applies second normalization layer
- Feed-forward network processes each position independently
- Final residual connection completes the block

###### Layer Normalization Details

**LayerNorm Operation:**
For each token position, normalizes across the embedding dimension:
$$\text{LayerNorm}(x) = \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} \cdot \gamma + \beta$$

Where:
- $\mu$ = mean across embedding dimension
- $\sigma$ = standard deviation across embedding dimension
- $\epsilon$ = small constant added for numerical stability
- $\gamma, \beta$ = learnable scale and shift parameters

**Example:**
```python
# For one token with embedding_dim=768
token_embedding = [0.1, 0.5, -0.2, ..., 0.3]  # 768 values
# LayerNorm computes mean and std of these 768 values
# Then normalizes: (value - mean) / std
```

###### Residual Connections

**Purpose:**
- Enables gradient flow through deep networks
- Allows model to learn identity mappings when needed
- Provides multiple paths for information flow

**Mathematical Effect:**
Each sub-layer computes: $\text{output} = x + \text{SubLayer}(x)$

This means the sub-layer only needs to learn the "change" or "refinement" to apply to the input, rather than reconstructing the entire representation.

###### Why Pre-Norm Architecture?

**Training Stability:**
- Better gradient flow during training
- Fewer exploding/vanishing gradient problems
- More stable training in deeper models

**Normalization Benefits:**
- Normalizes inputs to sub-layers rather than outputs
- Ensures sub-layers receive well-conditioned inputs
- Often converges faster than Post-Norm

###### Information Flow Through Block

**Multi-Path Processing:**
1. **Attention path**: Models relationships between tokens
2. **Feed-forward path**: Processes individual token representations
3. **Residual paths**: Preserve original information at each step

**Cumulative Effect:**
Each block refines the representation while preserving previous learning through residual connections, allowing the model to build increasingly sophisticated representations layer by layer.

This block design is the fundamental building unit of transformer architectures, with models typically stacking 6-96 such blocks to create powerful language models.


##### Data Flow Diagram

The diagram below shows the data flow for the Pre-LayerNorm architecture implemented in the code:

```mermaid
graph TD
    A["Input"] --> B["LayerNorm 1"]
    B --> C["MultiHeadAttention"]
    C --> D["Add (Residual 1)"]
    A --> D
    D --> E["LayerNorm 2"]
    E --> F["FeedForward"]
    F --> G["Add (Residual 2)"]
    D --> G
    G --> H["Output"]
    
    style A fill:#F1F8E9
    style B fill:#DCEDC8
    style C fill:#C5E1A5
    style D fill:#AED581
    style E fill:#9CCC65
    style F fill:#8BC34A
    style G fill:#7CB342
    style H fill:#CDDC39
```

**Data Flow Verification:**

1. **Input** → **LayerNorm 1** → **MultiHeadAttention** → **Add (with input)** 
2. **Result** → **LayerNorm 2** → **FeedForward** → **Add (with previous result)** → **Output**

This matches exactly with the code implementation:

```python
# First sub-layer
residual = input
x = self.multi_head(self.layer_norm_1(input))  # LayerNorm → Attention
x = x + residual                               # Add residual

# Second sub-layer  
residual = x
x = self.feed_forward(self.layer_norm_2(x))    # LayerNorm → FFN
return x + residual                            # Add residual
```

The diagram shows the Pre-LayerNorm architecture where normalization occurs before each sub-module, with residual connections bypassing the normalized paths.

---

In [27]:
import torch.nn as nn

class Block(nn.Module):
    """
    One Transformer layer: multi-head self-attention followed by a
    position-wise feed-forward network.

    Layer normalization is applied to the *input* of each sub-layer, and the
    sub-layer's output is then added back to that un-normalized input. This is
    the "Pre-LN" (pre-layer-normalization) arrangement.

    Data flow:
    input -> LayerNorm -> MultiHeadAttention -> Add -> LayerNorm -> FeedForward -> Add -> output
    """

    def __init__(self, config):
        """
        Build the two sub-layers and their layer norms.

        Args:
            config (dict): Passed unchanged to `MultiHeadAttention` and
                `FeedForward`; "embedding_dim" is also read here to size
                both `nn.LayerNorm` modules.
        """
        super().__init__()

        # Attention sub-layer and the norm applied to its input.
        self.multi_head = MultiHeadAttention(config)
        self.layer_norm_1 = nn.LayerNorm(config["embedding_dim"])

        # Feed-forward sub-layer and the norm applied to its input.
        self.feed_forward = FeedForward(config)
        self.layer_norm_2 = nn.LayerNorm(config["embedding_dim"])

    def forward(self, input_tensor):
        """
        Run the block on a batch of sequences.

        Args:
            input_tensor (torch.Tensor): Shape (B, T, E), where B is batch
                size, T is sequence length, and E is embedding dim.

        Returns:
            torch.Tensor: Same shape as `input_tensor`.
        """
        # Norm -> attention, then add the skip connection from the raw input.
        attended = self.multi_head(self.layer_norm_1(input_tensor)) + input_tensor

        # Norm -> feed-forward, then add the skip connection from `attended`.
        return self.feed_forward(self.layer_norm_2(attended)) + attended

In [None]:
# Smoke test: push a random batch of 8 full-length sequences through one block.
b = Block(config)
input = torch.rand(8, config["context_size"], config["embedding_dim"])
# Bind the result to `output` (previously misspelled `ouptut`), so the shape
# shown below belongs to this block and not to a tensor left by an earlier cell.
output = b(input)

output.shape