In [1]:
import torch
import torch.functional as F

In [2]:
# Create a 3D tensor of shape (2, 3, 4) filled with random values from a normal distribution.
a = torch.rand(2,3,4)

In [4]:
# Perform element-wise multiplication between two tensors of shapes (3, 4) and (4) (broadcasting).

a = torch.rand(3,4) # this has shape [3,4]
b = torch.rand(4)   # this has shape [4]

# if we just do a * b , then b will be broadcasted to a
c = a * b 

In [6]:
# Reshape a tensor of shape (2, 3, 4) to (2, 12) and then to (6, 4).
a = torch.rand(2,3,4)

b = a.view(2,-1)
print("shape of b - ", b.shape)

c = b.view(6,-1)
print("shape of c - ", c.shape)

shape of b -  torch.Size([2, 12])
shape of c -  torch.Size([6, 4])


In [7]:
# Implement a function that performs batch matrix multiplication between tensors of shapes 
# (batch_size, seq_len, d_model) and (batch_size, d_model, seq_len).
batch_size = 3
seq_len = 4
d_model = 8

a = torch.rand(batch_size, seq_len, d_model)
b = torch.rand(batch_size, d_model, seq_len)
# NOTE - a batch multiplication bw above two means - that for every word in the sequence , 
# we get similarity score with every other word

c = torch.bmm(a,b)
c.shape

torch.Size([3, 4, 4])

In [10]:
# Create a masking function that generates a look-ahead mask for a sequence of length n 
# (upper triangular matrix with zeros on and below diagonal, ones above).
ones = torch.ones(5,5)
a = torch.triu(ones, diagonal=1)
a

tensor([[0., 1., 1., 1., 1.],
        [0., 0., 1., 1., 1.],
        [0., 0., 0., 1., 1.],
        [0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0.]])

In [15]:
# Implement a function that applies a mask to attention logits, 
# replacing masked values with a large negative number.

b = torch.rand(5,5)
c = b.masked_fill_(a==0, value=-1e9)
# note -- above i did a==0 , it is giving a boolean to the masked_fill_ to just know where to put the value
c

tensor([[-1.0000e+09,  7.2933e-01,  1.2759e-01,  3.1026e-01,  9.0654e-02],
        [-1.0000e+09, -1.0000e+09,  6.4935e-01,  9.2305e-01,  8.3734e-01],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09,  1.4614e-02,  9.3650e-01],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09,  5.9154e-01],
        [-1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09, -1.0000e+09]])

##### ✅ Even if the sequence length is different, the embedding size stays the same (e.g., 512).
 But still —
 ❓ Why do we need padding at all?

 In Transformers, we batch sequences together to compute attention efficiently using matrix operations.
 But matrix operations (like dot product across time steps) require all sequences in 
 the batch to have the same sequence length (same number of time steps).


```
Padded Input (after embeddings, assuming d_model=4):

Sentence 0: [e1, e2, e3, PAD]   → shape: (4, d_model)
Sentence 1: [e1, e2, e3, e4]    → shape: (4, d_model)
```


| Term                       | Description                                                                                                             |
| -------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
| `embedding size (d_model)` | The **dimensionality of word representations**. Fixed size (e.g., 512).                                                 |
| `sequence length`          | The **number of words/tokens** in a sentence. Varies (e.g., 6 words, 10 words...)                                       |
| `padding`                  | Dummy tokens (usually all zeros) added to shorter sequences to make their length equal to the longest one in the batch. |


In [None]:
# Implement a function that efficiently computes attention scores for sequences of 
# variable length within a batch.


# ✅ Step 1: Understand the scenario
# Say we have a batch of 2 sequences:
# Sequence A: "I love transformers" → length 3
# Sequence B: "Deep learning is awesome" → length 4
# To batch them together in PyTorch, we must pad them to the same length (max = 4).

import torch
import torch.nn.functional as F

# Simulate embedded sequences (e.g., from an embedding layer)
# We'll use random vectors to represent tokens

d_k = 4  # embedding size

# Sequence A: 3 tokens + 1 padding
seq1 = torch.rand(3, d_k)
padded_seq1 = F.pad(seq1, pad=(0, 0, 0, 1))  # pad 1 row on bottom

# Sequence B: 4 tokens (no padding needed)
padded_seq2 = torch.rand(4, d_k)

# Stack into a batch
x = torch.stack([padded_seq1, padded_seq2])  # Shape: (2, 4, 4)
print("Batch input shape:", x.shape)  # (batch_size, seq_len, d_k)

# Create padding mask: 1 for real tokens, 0 for padding
mask = torch.tensor([
    [1, 1, 1, 0],  # 1st sequence has padding in last position
    [1, 1, 1, 1],  # 2nd sequence has all real tokens
])



def masked_attention(q, k, v, mask):
    # dot product to get attention scores
    scores = torch.bmm(q, k.transpose(-1,-2))

    # step 2 - scale 
    d_k = q.size(-1)
    scores = scores / torch.sqrt(torch.tensor(d_k))

    # step 3 - expand mask and apply 
    # currently mask shape is 2,4 - we need it in 3d
    mask = mask.unsqueeze(1)
    scores = scores.masked_fill_(mask==0, float(-1e9))

    atten_weights = F.softmax(scores, dim=-1)

    output = torch.bmm(atten_weights, v)

    return output, atten_weights


# how it works


# [ I   love   transformers   <PAD> ]   → Mask = [1, 1, 1, 0]
# [ Deep learning is awesome     ]     → Mask = [1, 1, 1, 1]

# Attention scores:
# ✓ Real tokens attend to each other.
# ✗ PAD tokens are ignored (masked with -inf → softmax → 0).


Batch input shape: torch.Size([2, 4, 4])


In [None]:
# Write a function that applies padding masks and causal masks 
# simultaneously for transformer decoder training.


# 🧠 What are the two masks?
#   Padding mask:
#       Masks out the PAD tokens in the input.
#       Shape: (batch_size, seq_len) → 1 for real token, 0 for pad.
#       Usually comes from input data.
#   Causal mask (look-ahead mask):
#       Ensures that at position t, the model can’t see future tokens (t+1, t+2, ...).
#       Shape: (seq_len, seq_len) → Upper triangular matrix with -inf above the diagonal.

causal_mask = torch.tril(torch.ones(seq_len, seq_len), diagonal=0)

def attention_layer(q,k,v,pad_mask, causal_mask):
    # finding attention scores
    scores = torch.bmm(q, k.transpose(-1,-2))

    # scaling the scores
    scores = scores / torch.sqrt(torch.tensor(k.size(-1), dtype=torch.float32))

    # first we make their dimensions compatible
    pad_mask = pad_mask.unsqueeze(1)
    causal_mask = causal_mask.unsqueeze(0).bool()
    # Final mask: only allow attending to real tokens in the past
    final_mask = pad_mask & causal_mask
    # calculating the final combined mask 
    masked_scores = scores.masked_fill(final_mask==0, -1e9)

    attn_wts = torch.softmax(masked_scores, dim=-1)
    output = torch.bmm(attn_wts, v)

    return output, attn_wts 

# Exercise Set 2: Scaled Dot-Product Attention

In [None]:
# Implement scaled dot-product attention using PyTorch operations.

# note - we will be doing masked version
def scaled_dot_prod_attn(q, k, v, mask):
    scores = torch.bmm(q, k.transpose(-1, -2))

    # scaled scores
    scores = scores / torch.sqrt(torch.tensor(k.size(-1), dtype=torch.float32))

    masked_score = scores.masked_fill(mask==0, -1e9)

    attention_wts = torch.softmax(masked_score, dim=-1)

    # (Optional) Avoid NaNs from softmax over -inf only
    attention_wts = attention_wts.masked_fill(torch.isnan(attention_wts), 0)

    output = torch.bmm(attention_wts, v)

    return output, attention_wts

In [24]:
# Implement a batched version of scaled dot-product attention that handles 
# multiple examples simultaneously.
# Add support for attention masking to handle variable-length sequences.

batch_size = 2
seq_len = 4
d_k = 8

# Dummy queries, keys, values
q = torch.randn(batch_size, seq_len, d_k)
k = torch.randn(batch_size, seq_len, d_k)
v = torch.randn(batch_size, seq_len, d_k)

# Padding mask (for example: first sequence has length 3, second has length 2)
pad_mask = torch.tensor([
    [1,1,1,0],
    [1,1,0,0]
])
# its in the shape -- B, S - 2,4

pm = pad_mask.unsqueeze(1) # now its shape - 2,1,4
# note -- WE HAVE TO MASK THE ATTENTION SCORES SO THE MASK HAS TO BE IN SHAPE -- 2,4,4

# NOW IF WE DO 
final_mask = pad_mask.unsqueeze(1) & torch.ones(batch_size, seq_len, seq_len, dtype=bool) # -- one gives shape 2,1,4 and one gives shape 2,4,1
print("final mask size - ", final_mask.shape)


# Compute scores
scores = torch.bmm(q, k.transpose(-1, -2))  # (B, S, S)
scores = scores / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
print("scores shape - ", scores.shape)
# Masking
scores = scores.masked_fill(final_mask == 0, -1e9)


# Softmax
attn_weights = torch.softmax(scores, dim=-1)

# Attention output
output = torch.bmm(attn_weights, v)  # (B, S, D)

print("output shape - ", output.shape)

final mask size -  torch.Size([2, 4, 4])
scores shape -  torch.Size([2, 4, 4])
output shape -  torch.Size([2, 4, 8])


In [None]:
# Implement a memory-efficient version of attention that processes long sequences in chunks.


# ✅ Part 1: Chunked (Windowed) Attention
# Instead of computing attention over the entire sequence, 
# we divide the sequence into non-overlapping or overlapping chunks, 
# and apply attention within each chunk.

#   🔍 Why?
#       Reduces memory from O(N²) to O(N * W) where W is chunk/window size.
#       Example: instead of attending to 10,000 tokens at once, attend to 512 tokens at a time.



def chunked_attention(q, k, v, chunk_size):
    B, S, D = q.size()

    chunks = S // chunk_size

    output = []


    for i in range(chunks):
        start = i * chunk_size
        end = start + chunk_size

        # now we can easily take out the batches from the given sequences
        q_chunk = q[:, start:end, :]
        k_chunk = k[:, start:end, :]
        v_chunk = v[:, start:end, :]

        scores = torch.bmm(q_chunk, k_chunk.transpose(-1,-2)) / torch.sqrt(torch.tensor(k_chunk.size(-1), dtype=torch.float32))
        attention = torch.softmax(scores)
        output_chunk = torch.bmm(attention, v_chunk)

        output.append(output_chunk)

    return output

In [27]:
# Implement sparse attention patterns (e.g., local attention) that only attend to nearby positions.

# Only allow each token to attend to its W neighbors, e.g., a window of ±2 tokens.

def local_attention_mask(seq_len, window_size):
    """
    Returns a (seq_len, seq_len) boolean mask for local attention.
    1 means allowed, 0 means masked out.
    """
    mask = torch.zeros(seq_len, seq_len, dtype=torch.bool)
    for i in range(seq_len):
        start = max(0, i - window_size)
        end = min(seq_len, i + window_size + 1)
        mask[i, start:end] = 1
    return mask  # shape: (S, S)

# 🔧 Code: Local Attention with Mask


def local_attention(q, k, v, window_size):
    """
    Applies sparse local attention: each token attends only to nearby positions.
    """
    B, S, D = q.size()
    mask = local_attention_mask(S, window_size).to(q.device)  # (S, S)
    mask = mask.unsqueeze(0).expand(B, -1, -1)  # (B, S, S)

    # Scaled dot-product attention
    scores = torch.bmm(q, k.transpose(1, 2)) / torch.sqrt(torch.tensor(D, dtype=torch.float32))
    scores = scores.masked_fill(~mask, -1e9)  # Apply local mask
    attn = torch.softmax(scores, dim=-1)
    output = torch.bmm(attn, v)  # (B, S, D)

    return output
    

# Implement a basic multi-head attention module with 8 heads.

In [None]:
import torch 
import torch.nn as nn

# Implement a basic multi-head attention module with 8 heads.
b = 4
s = 5
d = 512
heads = 8

head_dim = d//heads
print(head_dim)

# what it has to do is create 3 vectors q,k,v from the given b,s,d and project them acc to the num of heads and their dimensions
# so the 512 will be projected into the heads , head dimensions

x = torch.rand(b, s, d) # this can be thought of as the input



class MultiHeadAttention(nn.module):
    def __init__(self, d_model = d, num_heads = heads):
        super().__init__()

        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dimensions = d_model // num_heads

        # linear layers to make q,k,v from the input
        self.q = nn.Linear(d_model, d_model)
        self.k = nn.Linear(d_model, d_model)
        self.v = nn.Linear(d_model, d_model)

        self.out_proj = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        scores = torch.bmm(q, k.transpose(-1,-2))

        scores = scores / torch.sqrt(torch.tensor(q.size(-1), dtype=torch.float32))

        if mask is not None:
            scores = scores.masked_fill(mask==0, -1e9)
        
        attention_weights = F.softmax(scores, dim=-1)

        output = torch.bmm(attention_weights, v)
        return output, attention_weights

    def forward(self, x, mask=None):
        B, S, D = x.size()

        q = self.q(x)
        k = self.k(x)
        v = self.v(v)

        # VERY IMPORTANT STEP -- SPLIT INTO HEADS 
        def reshape_to_heads(x):
            return x.view(B, S, self.num_heads, self.head_dimensions).transpose(1,2)
            # it reshapes B,S,D ----> B,S,H,H_dim -----> B, H, S, H_dim
        
        q = reshape_to_heads(q)
        k = reshape_to_heads(k)
        v = reshape_to_heads(v)

        attn_output, attn_weights = self.scaled_dot_prod_attn(q , k , v, mask)

        # concat all the heads 
        # here we do ----   [B, h, S, head_dim] -> [B, S, h * head_dim] 
        concat = attn_output.transpose(1,2).contiguous().view(B, S, self.d_model)


        output = self.out_proj(concat)

        return output, attn_weights


64


### Whats happening above 


## ✅ How Scaled Dot-Product Attention works on `[B, h, S, head_dim]`

### Step-by-step breakdown:

Assume:

* `B = batch size`
* `h = number of heads`
* `S = sequence length`
* `head_dim = d_model / num_heads`

Now, `Q`, `K`, and `V` all have shape `[B, h, S, head_dim]`.

```python
scores = torch.matmul(q, k.transpose(-2, -1))
```

### 🤔 What’s happening here?

* `q` has shape `[B, h, S, head_dim]`
* `k.transpose(-2, -1)` flips the last two dims → `[B, h, head_dim, S]`

So this becomes:

```
[B, h, S, head_dim] @ [B, h, head_dim, S] → [B, h, S, S]
```

### 🎯 Meaning:

You’re computing dot products between each query and all keys **across the sequence** for every **head** in every **batch**. The result is a matrix of attention **scores** showing how much each word attends to every other word.

---

## 🤯 Visual example:

Say you have 1 head (h=1), 1 query for a sentence of 5 words (S=5), each with `head_dim=3`:

```
q = [word1_q, word2_q, word3_q, word4_q, word5_q] → shape [1, 1, 5, 3]
k = [word1_k, ..., word5_k] → shape [1, 1, 5, 3]
```

* You take dot products between each query vector and all key vectors: → attention map `[5, 5]` per head
* This is done **in parallel for all heads** using `torch.matmul`

---

## ✅ What is `.contiguous()`?

PyTorch stores tensors in memory in a way that can be **non-contiguous** after certain operations like `transpose`, `view`, or `permute`.

### 👇 For example:

```python
x = torch.randn(2, 3, 4)
x = x.transpose(1, 2)  # x now has shape (2, 4, 3), but it's not contiguous in memory
```

If you now try to `.view()` it (reshape), PyTorch may throw an error because it expects contiguous memory layout.

### 💡 Solution:

Use `.contiguous()` before `.view()`:

```python
x = x.contiguous().view(2, -1)
```

It tells PyTorch:

> “Please copy this tensor’s data into a fresh, contiguous memory layout so I can safely reshape it.”

---

### ✅ Summary:

| Concept               | Explanation                                                              |
| --------------------- | ------------------------------------------------------------------------ |
| `[B, h, S, head_dim]` | Processes multi-head attention **in parallel** for all heads and batches |
| `matmul(q, k.T)`      | Computes attention scores for every position against every other         |
| `.contiguous()`       | Fixes memory layout so `.view()` or other reshapes work safely           |

---


## 🧠 Implement a Multi-Head Attention Module with Different Dimensions for Keys, Queries, and Values


In [None]:
# 🧠 Implement a Multi-Head Attention Module with Different Dimensions for Keys, Queries, and Values

# 🎯 Goal:
# Build a Multi-Head Attention mechanism where:
#   query, key, and value have different input dimensions
# For example:
#   query comes from one source (e.g., decoder)
#   key/value come from another (e.g., encoder)
# This is especially needed in cross-attention.



# Normally, all 3 have the same dimension d_model, and we split into n_heads:
# q, k, v = x @ W_q, x @ W_k, x @ W_v  # Shape: (B, S, d_model)

# But what if:
# q = decoder_output  → shape: [B, T_q, d_q]
# k = encoder_output  → shape: [B, T_kv, d_k]
# v = encoder_output  → shape: [B, T_kv, d_v]


# You must:
#     Use separate projection layers for q, k, v (with their own input dims)
#     Still produce same head_dim to allow scaled dot-product attention

In [None]:
import torch
import torch.nn as nn 


class MultiHeadAttention(nn.Module):
    def __init__(self, d_q, d_k, d_v, d_model, n_heads):
        super().__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        self.head_dim = d_model // n_heads
        assert self.head_dim * n_heads == d_model, "d_model must be divisible by n_heads"

        # Separate linear layers for q, k, v with their input dims
        self.q_proj = nn.Linear(d_q, d_model)
        self.k_proj = nn.Linear(d_k, d_model)
        self.v_proj = nn.Linear(d_k, d_model)

        # Output projection
        self.out_proj = nn.Linear(d_model, d_model)


In [None]:
# 🧠 2. Split into heads
def split_heads(self, x):
    # x: [B, T, d_model] → [B, n_heads, T, head_dim]
    B, T, _ = x.size()
    x = x.view(B, T, self.n_heads, self.head_dim)
    return x.transpose(1, 2)


In [None]:
def forward(self, q, k, v, mask=None):
    # Project to common d_model
    q = self.q_proj(q)  # [B, T_q, d_model]
    k = self.k_proj(k)  # [B, T_kv, d_model]
    v = self.v_proj(v)  # [B, T_kv, d_model]

    # Split into heads
    q = self.split_heads(q)  # [B, h, T_q, head_dim]
    k = self.split_heads(k)  # [B, h, T_kv, head_dim]
    v = self.split_heads(v)  # [B, h, T_kv, head_dim]

    # Scaled dot-product attention
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)  # [B, h, T_q, T_kv]
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float('-inf'))
    attn = F.softmax(scores, dim=-1)

    output = torch.matmul(attn, v)  # [B, h, T_q, head_dim]

    # Concatenate heads
    output = output.transpose(1, 2).contiguous()  # [B, T_q, h, head_dim]
    output = output.view(output.size(0), output.size(1), self.d_model)  # [B, T_q, d_model]

    return self.out_proj(output)


## Positional encodings

✅ Implement sinusoidal positional encodings and visualize them.

🧠 Concept Recap (Super Simple)
Positional encoding gives each position in a sequence a unique vector based on sinusoids.

    - Each dimension of the vector has a different wavelength.

    - Even dims → sin(pos / (10000^(2i/d_model)))

    - Odd dims → cos(pos / (10000^(2i/d_model)))
    
This gives a smooth, periodic, and unique encoding per position, and importantly — it generalizes to longer sequences!

In [None]:
def get_pos_encodings(seq_len, d_model):
    pe = torch.arange(seq_len).unsqueeze(1)
    

# Layer norm and Residual connections

Great — here's a focused, step-by-step learning path that will make you master both **Layer Normalization** and **Residual Connections** without doing all those exercises individually. I’ll group the essential ideas, code, and visual mental models together so you get everything in fewer steps.

---

## ✅ Layer Normalization (LN) — Simplified Mastery Path

### 🔹 What is it?

It stabilizes training by normalizing each **input (per data point)** across its **features**, not across the batch (like batch norm). It's especially useful in Transformers where batch size can vary or inputs are padded.

### 🔹 Mental Model

Imagine your input is a row of test scores across subjects. LN makes sure the average is 0 and the spread is 1 — per student. So each row becomes normalized.

### 🔹 Minimal Concepts You Need

* LN normalizes across **last dimension** (i.e., features)
* BatchNorm normalizes across **batch dimension**
* LN uses **learnable scale (γ)** and **shift (β)** so the model can "undo" the normalization if needed

---

### ✅ Core Implementation (Covers Basic + Intermediate in one go)

```python
import torch
import torch.nn as nn

class MyLayerNorm(nn.Module):
    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))  # scale
        self.beta = nn.Parameter(torch.zeros(dim))  # shift
        self.eps = eps

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)         # mean over features
        std = x.std(dim=-1, keepdim=True)           # std over features
        norm_x = (x - mean) / (std + self.eps)       # normalize
        return self.gamma * norm_x + self.beta       # scale and shift

# Example usage
x = torch.randn(3, 4)  # 3 samples, 4 features
layer_norm = MyLayerNorm(4)
output = layer_norm(x)
```

✅ This gives you:

* Manual implementation
* Learnable parameters (γ and β)
* Epsilon control
* Works with any shape like (B, S, D)

### 🔹 🔍 Comparison

```python
nn.LayerNorm(4)(x)  # Should match MyLayerNorm(4)(x)
```

---

## ✅ Residual Connections — Simplified Mastery Path

### 🔹 What is it?

They **add the input back to the output** of a layer:

```
output = Layer(x)
x + output → next layer
```

### 🔹 Why?

* Prevent **vanishing gradients**
* Let the network **focus on learning what’s different**
* Help deep models train stably from the start

### 🔹 Mental Model

You're trying to improve a sentence. Instead of rewriting from scratch, you just **add corrections** to the existing one. Residual = "what I want to change."

---

### ✅ Core Implementation

```python
class ResidualBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.layer = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )
        self.norm = nn.LayerNorm(dim)

    def forward(self, x):
        return x + self.norm(self.layer(x))  # Residual + LayerNorm
```

This gives:

* Residual connection
* LayerNorm (post-normalization)
* Handles dimension match

---

## 🚀 Bonus: Visualization of Gradient Flow

```python
from torchviz import make_dot
x = torch.randn(1, 4, requires_grad=True)
model = ResidualBlock(4)
y = model(x)
make_dot(y, params=dict(list(model.named_parameters()) + [('x', x)]))
```

You'll see how the gradient flows through both the **main path** and the **shortcut**.

---

## 🔄 TL;DR Table

| Concept             | BatchNorm     | LayerNorm        |
| ------------------- | ------------- | ---------------- |
| Normalized over     | Batch dim (0) | Feature dim (-1) |
| Good for            | Vision (CNNs) | NLP/Transformers |
| γ, β                | Learnable     | Learnable        |
| Affected by padding | Yes (bad)     | No (good)        |

| Residual Connection                        |
| ------------------------------------------ |
| Adds `input + output` of a block           |
| Helps in stable training and gradient flow |
| Key in all Transformer layers              |

---

Would you like the same kind of guided path for the **Transformer Encoder block as a whole** (with all components linked)?
