# MULTI HEAD ATTENTION MATRIX OPERATIONS in DETAIL

## 1. Setup and Input Tensors
First, we define our inputs. In NLP, the input shape is typically $(B, S, N)$: batch size $B$, sequence length $S$, and feature (embedding) dimension $N$.

In [22]:
import numpy as np

# Problem sizes: batch (B), sequence length (S), feature width (N), heads (H).
B, S, N, H = 2, 6, 4, 2
# Each head works on an equal slice of the feature width.
d_head = N // H


# Random input of shape (B, S, N), rounded to two decimals.
X = np.round(np.random.randn(B, S, N), 2)

print(f"Input X: \n{X}\n and shape: {X.shape}\n")

Input X: 
[[[-0.78  0.19  1.48 -1.23]
  [ 1.96  0.47 -0.4   0.59]
  [-0.1  -0.76  0.97  1.61]
  [ 1.63 -0.93  3.04 -0.44]
  [-1.11  1.49 -0.39 -0.31]
  [-1.87  0.83 -0.9   0.39]]

 [[ 0.68 -1.06  0.47 -0.77]
  [ 0.64 -1.77 -0.7  -2.07]
  [-0.54 -0.03 -1.02 -0.56]
  [-1.35 -0.49 -0.42 -0.7 ]
  [-0.03  0.45 -0.36  0.96]
  [-0.23 -0.91  1.46  0.07]]]
 and shape: (2, 6, 4)



In [23]:
# Hand-written example of an array with shape (2, 6, 4):
# 2 outer groups, each holding 6 rows of 4 values.
D = np.array([
    [
        [0.1, 0.2, 0.3, 0.4],
        [0.1, 0.2, 0.3, 0.4],
        [0.1, 0.2, 0.3, 0.4],
        [0.1, 0.2, 0.3, 0.4],
        [0.1, 0.2, 0.3, 0.4],
        [0.1, 0.2, 0.3, 0.4]
    ],
    [
        [0.5, 0.6, 0.7, 0.8],
        [0.5, 0.6, 0.7, 0.8],
        [0.5, 0.6, 0.7, 0.8],
        [0.5, 0.6, 0.7, 0.8],
        [0.5, 0.6, 0.7, 0.8],
        [0.5, 0.6, 0.7, 0.8]
    ]
])

D.shape

(2, 6, 4)

### 2. Weight Matrices Initialization
In MHA, we project our input into Queries ($Q$), Keys ($K$), and Values ($V$). Each projection has its own weight matrix.

In [24]:
# One projection matrix each for queries, keys and values.
# Layout is (in_features, out_features) -> (N, N); entries are rounded to two decimals.
W_q, W_k, W_v = (np.round(np.random.randn(N, N), 2) for _ in range(3))

print(f"W_q: \n{W_q}\n W_k: \n{W_k}\n W_v: \n{W_v}\n")
print(f"W_q shape: {W_q.shape}, W_k shape: {W_k.shape}, W_v shape: {W_v.shape}\n")

W_q: 
[[ 0.1   0.27 -0.35 -0.11]
 [-1.86 -0.25 -1.25 -0.36]
 [-0.71  1.71 -1.17  1.05]
 [ 0.08  1.76  0.53 -0.1 ]]
 W_k: 
[[ 0.7  -1.52  2.67  0.06]
 [-1.46  2.02 -0.78 -0.83]
 [ 1.39 -1.07 -0.52  1.11]
 [ 1.73  1.05  1.47  1.17]]
 W_v: 
[[-1.26  0.25  0.03  1.92]
 [ 0.58 -0.25 -0.46  1.69]
 [ 0.08 -1.85 -0.71  0.3 ]
 [ 1.64 -0.9   1.04  0.72]]

W_q shape: (4, 4), W_k shape: (4, 4), W_v shape: (4, 4)



### 3: Linear Projections (Q, K, V Calculation)

We multiply the input $X$ by the weights. Note that the multiplication happens on the last dimension.

```python
import numpy as np

# Illustrative 3D input with shape (B=2, S=3, N=4).
# B=Batch, S=Sequence, N=Features
# NOTE: this toy example uses S=3, whereas the notebook cells use S=6.
Q_total = np.array([
    [   # B=0
        [-1.5116,  1.5644,  2.1816,  3.3755],  # S=0
        [-2.6180,  2.2626,  1.1976,  1.1208],  # S=1
        [-3.0033,  0.1768,  0.5522, -0.1006]   # S=2
    ],
    [   # B=1
        [-1.7886,  3.2338,  0.2871, -0.4054],  # S=0
        [ 1.3256, -2.1606,  0.2825, -0.0161],  # S=1
        [ 0.7166, -0.7093, -0.7909, -1.1111]   # S=2
    ]
])

# W: 2D weight matrix with shape (N=4, N=4)
W = np.array([
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
    [1.3, 1.4, 1.5, 1.6]
])

# Multiply: (B,S,N) @ (N,N) → (B,S,N)
# The 2D matrix W is applied to the last axis of every (B, S) row of the 3D input.
output = np.matmul(Q_total, W)

print("Output shape:", output.shape)
print(output)

```

**What happens internally:**
For **B=0, S=0**:

```python
# shape (4,)
[-1.5116, 1.5644, 2.1816, 3.3755] @ W
# = [
#   -1.5116*0.1 + 1.5644*0.5 + 2.1816*0.9 + 3.3755*1.3,
#   -1.5116*0.2 + 1.5644*0.6 + 2.1816*1.0 + 3.3755*1.4,
#   -1.5116*0.3 + 1.5644*0.7 + 2.1816*1.1 + 3.3755*1.5,
#   -1.5116*0.4 + 1.5644*0.8 + 2.1816*1.2 + 3.3755*1.6
# ]

```

For **B=1, S=0**:

```python
[-1.7886,  3.2338,  0.2871, -0.4054] @ W → (4,)

```

…and similarly for all **B** and **S**.

#### ✅ Output

**Shape:** `(2, 3, 4)` — same as input except last dimension transformed by W.

Visual 3D array structure:

```text
[
  [ [ ...4 elements... ], [ ...4 elements... ], [ ...4 elements... ] ],   # B=0
  [ [ ...4 elements... ], [ ...4 elements... ], [ ...4 elements... ] ]    # B=1
]

```


In [25]:
# Batched projection: (B, S, N) @ (N, N) -> (B, S, N).
# Each weight matrix acts on the last (feature) axis of X.
Q_total = X @ W_q
K_total = X @ W_k
V_total = X @ W_v

print(f"Q_total: \n{Q_total}\n and shape: {Q_total.shape}\n")


Q_total: 
[[[-1.5806  0.1079 -2.348   1.6944]
  [-0.347   0.7661 -0.4928 -0.8638]
  [ 0.8437  4.6553  0.7034  1.1421]
  [-0.3008  5.0966 -3.198   3.3915]
  [-2.6303 -1.8847 -1.182  -0.7928]
  [-1.0606 -1.565   0.8767 -1.0771]]

 [[ 1.6443 -0.1029  0.129   0.8773]
  [ 3.6876 -4.2249  1.7104  0.0388]
  [ 0.6812 -2.8681  1.1231 -0.9448]
  [ 1.0186 -2.1922  1.2054 -0.0461]
  [-0.5076  0.9534  0.378  -0.6327]
  [ 0.6386  2.7852 -0.4531  1.8789]]]
 and shape: (2, 6, 4)



To adapt the layout to **(Batch, Sequence, Features)** or `(B, S, N)`, we reorganize the indices so that each primary group represents a full batch item rather than a sequence step.

### NumPy Matrix Multiplication with 3D Array (Batch + Sequence)

```python
import numpy as np

# Illustrative 3D input with shape (B=2, S=3, N=4): batch, sequence, features.
# NOTE: this toy example uses S=3, whereas the notebook cells use S=6.
Q_total = np.array([
    [   # B=0
        [-1.5116,  1.5644,  2.1816,  3.3755],  # S=0
        [-2.6180,  2.2626,  1.1976,  1.1208],  # S=1
        [-3.0033,  0.1768,  0.5522, -0.1006]   # S=2
    ],
    [   # B=1
        [-1.7886,  3.2338,  0.2871, -0.4054],  # S=0
        [ 1.3256, -2.1606,  0.2825, -0.0161],  # S=1
        [ 0.7166, -0.7093, -0.7909, -1.1111]   # S=2
    ]
])

# W: 2D weight matrix with shape (N=4, N=4)
W = np.array([
    [0.1, 0.2, 0.3, 0.4],
    [0.5, 0.6, 0.7, 0.8],
    [0.9, 1.0, 1.1, 1.2],
    [1.3, 1.4, 1.5, 1.6]
])

# Matrix multiplication: (B,S,N) @ (N,N) → (B,S,N)
# The 2D matrix W is applied to the last axis of every (B, S) row of the 3D input.
output = np.matmul(Q_total, W)

print("Output shape:", output.shape)
print(output)
```

#### What happens internally?

For each **batch item (B)** and each **sequence position (S)**, NumPy performs a vector-matrix multiplication:

```python
# Example: B=0, S=0   shape (4,) @ (4,4) → (4,)
[-1.5116, 1.5644, 2.1816, 3.3755] @ W

# Which computes:
[
  -1.5116*0.1 + 1.5644*0.5 + 2.1816*0.9 + 3.3755*1.3,
  -1.5116*0.2 + 1.5644*0.6 + 2.1816*1.0 + 3.3755*1.4,
  -1.5116*0.3 + 1.5644*0.7 + 2.1816*1.1 + 3.3755*1.5,
  -1.5116*0.4 + 1.5644*0.8 + 2.1816*1.2 + 3.3755*1.6
]
```

The same operation is applied independently to **every** combination of B and S.

#### Result

* **Input shape**: `(2, 3, 4)` → B=2 (batch), S=3 (sequence), N=4 (features)
* **Output shape**: `(2, 3, 4)` — same shape, last dimension transformed by W

#### Visual structure of the 3D array

```text
[
  [ [ ...N=4... ], [ ...N=4... ], [ ...N=4... ] ],   # ← B = 0
  [ [ ...N=4... ], [ ...N=4... ], [ ...N=4... ] ]    # ← B = 1
]
      ↑                ↑                ↑
    Seq 0            Seq 1            Seq 2

```

Perfect for transformer-style operations where you apply the same weight matrix to every token in every batch item!

```

```

### 4.Splitting Heads (The B, S, N way)
This is where it gets interesting. To keep things clean, we want the heads to be in a dimension where they don't interfere with the Sequence length.

In [26]:
def split_heads(tensor, B, S, H, d_head):
    """Split the feature axis into heads and bring the head axis forward.

    Args:
        tensor: Array that can be reshaped to (B, S, H, d_head), e.g. an
            array of shape (B, S, N) with N = H * d_head.
        B: Batch size.
        S: Sequence length.
        H: Number of heads.
        d_head: Feature width of each head.

    Returns:
        Array of shape (B, H, S, d_head).
    """
    # (B, S, N) -> (B, S, H, d_head): carve the last axis into H chunks.
    per_head = tensor.reshape((B, S, H, d_head))
    # (B, S, H, d_head) -> (B, H, S, d_head): exchange the S and H axes.
    return per_head.swapaxes(1, 2)

# Apply the identical head split to the query, key and value projections.
Q, K, V = (
    split_heads(projection, B, S, H, d_head)
    for projection in (Q_total, K_total, V_total)
)

print(f"Shape after head split: {Q.shape}")
# Expected: (2, 2, 6, 2) -> (Batch, Heads, Seq_Len, Head_Dim)

Shape after head split: (2, 2, 6, 2)


In [27]:
# Hand-written dummy Q with shape (2, 2, 6, 2).
# Every entry of a given (batch, head) pair holds the same value (1, 2, 3 or 4),
# which makes it easy to see which pair a row belongs to.
Q_d = np.array([
    [ # first batch
        [ # first head
            [1,1], [1,1], [1,1], [1,1], [1,1], [1,1] # seq_len=6; each position is a vector of 2 values
        ],
        [ # second head
            [2,2], [2,2], [2,2], [2,2], [2,2], [2,2] # seq_len=6
        ]
    ],
    [ # second batch
        [ # first head
            [3,3], [3,3], [3,3], [3,3], [3,3], [3,3] # seq_len=6
        ],
        [ # second head
            [4,4], [4,4], [4,4], [4,4], [4,4], [4,4] # seq_len=6
        ]
    ]
])

Q_d.shape


(2, 2, 6, 2)

### 5: Scaled Dot-Product Attention
We calculate the similarity between $Q$ and $K$.

Formula: $\text{Attention}(Q, K, V) = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}}\right)V$

 ##### 5.1: The Raw Scores (Dot Product)

In this step, we calculate the similarity between every word's Query and every word's Key.

**Math:** $Q \cdot K^T$  

**Shape:** $(B, H, S, d_{head}) \times (B, H, d_{head}, S) \rightarrow (B, H, S, S)$


**More explanation:** why we transpose `(B, S, H, d_head)` → `(B, H, S, d_head)`, and why K is transposed.

1. **After splitting heads (paper convention)**:

- Suppose the input embedding is `(B, S, N)` with `N = H * d_{head}`.  
- After splitting into heads:  
  $$
  (B, S, N) \rightarrow (B, S, H, d_{head})
  $$  
  This is the **paper convention**, where:  
  - `B` = batch size  
  - `S` = sequence length  
  - `H` = number of heads  
  - `d_{head}` = dimension per head  

2. **Why transpose to `(B, H, S, d_{head})`?**

- Attention is **computed independently for each head**, so we move `H` upfront to make operations **vectorized across heads**.  
- This makes it easy to perform batch matrix multiplication for all heads simultaneously:

  $$
  Q: (B, H, S, d_{head}), \quad K: (B, H, S, d_{head})
  $$

3. **Why do we transpose K to get Kᵀ?**

- The attention score is a **dot product between Q and K for every token pair**. For a single head:  

  - Q has shape `(S, d_head)`  
  - K has shape `(S, d_head)`  
  - To get **all pairwise scores** between tokens, we do:  
    $$
    Q \cdot K^T \quad \text{→ shape: } (S, S)
    $$

- In the batched, multi-head form:

  $$
  Q: (B, H, S, d_{head}), \quad K^T: (B, H, d_{head}, S) \\
  Q \cdot K^T \rightarrow (B, H, S, S)
  $$

- Transposing K along the last two dimensions is necessary because **matrix multiplication in attention sums over the embedding dimension** (`d_head`) and produces a score for **each pair of tokens**.

---

**Summary:**

| Step | Shape | Reason |
|------|-------|--------|
| Split heads | `(B, S, H, d_head)` | Paper convention, separate heads |
| Transpose to `(B, H, S, d_head)` | `(B, H, S, d_head)` | Put heads upfront for vectorized attention computation |
| Compute Q · Kᵀ | `(B, H, S, S)` | Each head computes **pairwise attention scores** over sequence positions; transpose K so dot product sums over d_head |

**Key Idea:**  
- `(B, S, H, d_head)` → `(B, H, S, d_head)` makes computation **batch-friendly and head-friendly**.  
- Kᵀ is needed so the **dot product sums over the head dimension** to produce scores for every token pair.

In [28]:
# Swap the last two axes of K so the product contracts over d_head:
# (B, H, S, d_head) -> (B, H, d_head, S)
K_T = np.swapaxes(K, -1, -2)

# Batched product over the leading (B, H) axes:
# (B, H, S, d_head) @ (B, H, d_head, S) -> (B, H, S, S), here (2, 2, 6, 6)
scores = Q @ K_T

print(f"Scores shape: {scores.shape}\n Scores: \n{scores}\n")
print("Raw Score Matrix (Batch 0, Head 0):\n", np.round(scores[0, 0], 2))


Scores shape: (2, 2, 6, 6)
 Scores: 
[[[[ 1.27232943e+00 -1.92447047e+00 -8.25559166e+00 -1.02962948e+01
     6.88779400e+00  5.53096911e+00]
   [-6.90044070e-01 -1.15176353e+00 -2.35481306e+00 -8.25242350e+00
     5.06738728e+00  5.58817185e+00]
   [-6.83277738e+00 -3.60222434e+00  9.63466660e-01 -3.25417745e+01
     1.88925147e+01  2.48136767e+01]
   [-6.38568534e+00 -5.35246058e+00 -5.27967452e+00 -4.29283892e+01
     2.56190627e+01  3.09582266e+01]
   [ 4.81260402e+00 -1.17481934e+00 -1.22301061e+01 -4.73591260e-01
     1.57676188e+00 -2.95740792e+00]
   [ 2.99170296e+00  3.17079200e-01 -4.34330692e+00  6.30654508e+00
    -3.21940552e+00 -5.93541324e+00]]

  [[ 1.12890025e+01 -1.39959745e+01  9.37104080e-01 -3.72639280e-01
     6.72504624e+00  8.52151728e+00]
   [ 2.37031984e+00 -2.90553676e+00 -4.17519144e+00 -4.62580006e+00
     3.97090246e+00  3.42716164e+00]
   [-3.38321258e+00  4.14960944e+00  5.63376646e+00  6.26385319e+00
    -5.47720161e+00 -4.76969058e+00]
   [ 1.53748698e

##### 5.2 Scaling for Gradient Stability

We divide by $\sqrt{d_{head}}$ ($d_k$ in the paper) to prevent attention scores from becoming too large, which would cause the softmax gradients to vanish.

**Proper explanation:**

1. **Dot product grows with vector length**

   Each attention score is computed as the dot product between a Query and a Key vector:

   $$
   Q \cdot K = \sum_{i=1}^{d_{head}} Q_i K_i
   $$

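   - If the elements $Q_i$ and $K_i$ are independent with **mean 0** and **variance $\sigma^2$**, each product $Q_i K_i$ has mean 0 and variance $\sigma^2 \cdot \sigma^2 = \sigma^4$. The **variance** (expected squared magnitude) of the dot product therefore grows linearly with $d_{head}$; for unit-variance entries ($\sigma^2 = 1$) it is exactly $d_{head}$:

   $$
   \text{Var}(Q \cdot K) = \sum_{i=1}^{d_{head}} \text{Var}(Q_i K_i) = d_{head} \cdot \sigma^4
   $$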

   - This shows that **as $d_{head}$ increases, the magnitude of the raw scores grows**.  

2. **Effect on softmax**

   The softmax function is sensitive to the scale of its inputs:

   $$
   \text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}}
   $$

   - Large values in $x_i$ → softmax becomes **very peaked** (one value ≈ 1, others ≈ 0).  
   - A peaked (saturated) softmax has near-zero derivatives → **vanishing gradients** during backpropagation.  

3. **Scaling by $\sqrt{d_{head}}$**

   To normalize the scale of the dot product, we divide by $\sqrt{d_{head}}$:

   $$
   \text{Scaled Score} = \frac{Q \cdot K}{\sqrt{d_{head}}}
   $$

   - Why $\sqrt{d_{head}}$ and not $d_{head}$?  
     - Dividing by $d_{head}$ would over-shrink the scores. Softmax would become **too flat**, making the model insensitive to differences in attention.  
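     - Dividing by $\sqrt{d_{head}}$ divides the variance by $d_{head}$, which cancels the linear growth and makes the variance of the scores independent of $d_{head}$ (exactly 1 when $Q_i$ and $K_i$ have unit variance):

       $$
       \text{Var}\Big(\frac{Q \cdot K}{\sqrt{d_{head}}}\Big) = \frac{\text{Var}(Q \cdot K)}{d_{head}}
       $$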

   - Why not some other factor?  
     - Any other factor would either under- or over-scale the scores.  
     - $\sqrt{d_{head}}$ is the **mathematically correct factor** derived from the fact that a sum of $d_{head}$ independent variables has variance proportional to $d_{head}$.  

**Takeaway:**

Scaling by $\sqrt{d_{head}}$ ensures that:

- Attention scores remain in a **reasonable range**, regardless of head size.  
- Softmax outputs are **neither too sharp nor too flat**.  
- Gradients during backpropagation are **stable**, which is critical for training deep transformers.


In [29]:
# Divide every raw score by sqrt(d_head); d_head is 2 in this example.
scaling_factor = np.sqrt(d_head)
scaled_scores = np.divide(scores, scaling_factor)

print("Scaled Scores (Batch 0, Head 0):\n", np.round(scaled_scores[0, 0], 2))

Scaled Scores (Batch 0, Head 0):
 [[  0.9   -1.36  -5.84  -7.28   4.87   3.91]
 [ -0.49  -0.81  -1.67  -5.84   3.58   3.95]
 [ -4.83  -2.55   0.68 -23.01  13.36  17.55]
 [ -4.52  -3.78  -3.73 -30.35  18.12  21.89]
 [  3.4   -0.83  -8.65  -0.33   1.11  -2.09]
 [  2.12   0.22  -3.07   4.46  -2.28  -4.2 ]]


##### 5.3: Softmax – Converting Raw Scores into Attention Weights
This converts the scaled scores into probabilities. Each row in the $6 \times 6$ matrix will now sum to 1. This tells the model: "For word $i$, what fraction of its attention should go to word $j$?"


#### more explanation:

After computing the **raw attention scores**:

$$
\text{Scaled Scores} = \frac{Q \cdot K^T}{\sqrt{d_{head}}} \quad \text{(shape: } B, H, S, S \text{)}
$$

we apply the **softmax function** to convert them into probabilities, which become the **attention weights**.

---

**Softmax formula (for each row vector):**

$$
\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}}
$$

- Here, $x_i$ represents the raw score of a single query token attending to all key tokens.  
- After softmax, **each row sums to 1**, representing a valid probability distribution across tokens.  
- This answers the question: “For word $i$, how much attention should I give to word $j$?”

---

**Why subtract the max value for numerical stability?**

In code:

```python
e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
```
**Problem:** Exponentials grow very fast. Large numbers in $x$ (e.g., 1000) can cause **overflow** in `np.exp(x)`.

**Solution:** Subtract the maximum value of each row before exponentiating. This ensures the largest value in the row becomes 0, keeping exponentials within a **safe numerical range**.

**Mathematically, this does not change the softmax result:**

$$
\text{softmax}(x_i) = \frac{e^{x_i}}{\sum_j e^{x_j}} 
= \frac{e^{x_i - \max(x)}}{\sum_j e^{x_j - \max(x)}}
$$


In [30]:
def softmax(x, axis=-1):
    """Return the softmax of ``x`` along ``axis``.

    Args:
        x: Array of scores.
        axis: Axis along which the outputs are normalised to sum to 1.
            Defaults to the last axis, so ``softmax(x)`` normalises each
            row of the trailing dimension.

    Returns:
        Array with the same shape as ``x`` whose slices along ``axis``
        are non-negative and sum to 1.
    """
    # Subtracting the per-slice maximum does not change the result, but it
    # makes the largest exponent 0 and so prevents overflow in np.exp.
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)

# Turn the scaled scores into attention weights (softmax over the last axis).
weights = softmax(scaled_scores)

print(f"Weights Shape: {weights.shape}")
print("Attention Weights (Row sum should be 1.0):\n", np.round(weights[0, 0], 2))
print("\nSum of first row:", weights[0, 0, 0].sum())

Weights Shape: (2, 2, 6, 6)
Attention Weights (Row sum should be 1.0):
 [[0.01 0.   0.   0.   0.71 0.27]
 [0.01 0.   0.   0.   0.4  0.58]
 [0.   0.   0.   0.   0.01 0.99]
 [0.   0.   0.   0.   0.02 0.98]
 [0.87 0.01 0.   0.02 0.09 0.  ]
 [0.09 0.01 0.   0.9  0.   0.  ]]

Sum of first row: 1.0000000000000002


#### 5.4 Applying Weights to Values
Now we use those probabilities to take a weighted sum of the Value vectors ($V$).

**Math:** $\text{Weights} \cdot V$

**Shape:** $(B, H, S, S) \times (B, H, S, d_{head}) \rightarrow (B, H, S, d_{head})$

In [34]:
# Weighted sum of the value vectors: one context vector per token, per head.
# (2, 2, 6, 6) @ (2, 2, 6, 2) -> (2, 2, 6, 2)
attention_output = weights @ V
print(f"Attention Output shpae: {attention_output.shape}")
print("Context Vectors:\n", attention_output)
print("Context Vector for Batch 0, Head 0, Word 0:\n", attention_output[0, 0, 0])

Attention Output shpae: (2, 2, 6, 2)
Context Vectors:
 [[[[ 2.14402292  0.39960117]
   [ 2.67248838  0.49704621]
   [ 3.3800271   0.63468213]
   [ 3.36749812  0.6325333 ]
   [-0.61925828 -1.69258463]
   [-2.84382125 -4.2797494 ]]

  [[-2.01801767 -1.62729909]
   [-0.56197836 -0.9495289 ]
   [-0.60145552  1.72734375]
   [-2.36034517 -1.62563163]
   [-0.61218319 -1.06819435]
   [ 0.70400426  4.73762408]]]


 [[[-0.25085677 -2.23087072]
   [-1.00781468 -1.51982567]
   [-4.76704565  3.07631155]
   [-3.52532198  1.49825583]
   [ 0.43357436  1.2195325 ]
   [ 1.7634395  -0.22815464]]

  [[-0.27479377 -1.13175258]
   [-0.01637377 -0.4856992 ]
   [-0.28047406 -1.7318373 ]
   [-0.06779337 -0.73057099]
   [-0.2201747  -1.86373055]
   [-0.49290634 -1.46183568]]]]
Context Vector for Batch 0, Head 0, Word 0:
 [2.14402292 0.39960117]


In [32]:
# Print the per-head input (Q) and output shapes side by side for comparison.
print(f"Initial Q shape (per head): {Q.shape}")
print(f"Final Output shape (per head): {attention_output.shape}")
print("\nNotice how the S (6) and d_head (2) dimensions are perfectly preserved.")

Initial Q shape (per head): (2, 2, 6, 2)
Final Output shape (per head): (2, 2, 6, 2)

Notice how the S (6) and d_head (2) dimensions are perfectly preserved.


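#### 5.5 Concatenation and Final Projection
Finally, we merge the heads back together. We want to return to the original shape $(B, S, N)$. In a full multi-head attention layer the concatenated result is then multiplied by an output weight matrix $W_o$ of shape $(N, N)$; the code below performs only the concatenation step.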

In [37]:
# 1. Transpose: (B, H, S, d_head) -> (B, S, H, d_head)
# Moving the sequence axis back to position 1 puts each word's heads side by side.
out_tranposed = np.swapaxes(attention_output, 1, 2)
print(f"Shape after transpose: {out_tranposed.shape}") # we get (2,6,2,2)

# 2. Reshape: merge the (H, d_head) axes -> (B, S, H*d_head) = (B, S, N)
out_concat = np.reshape(out_tranposed, (B, S, N))

print(f"Final Output after concatenation shape: {out_concat.shape}")
print("\nConcatenated vector for Batch 0, Word 0:\n", out_concat[0, 0])
print("\n full concatenated output:\n", out_concat)

Shape after transpose: (2, 6, 2, 2)
Final Output after concatenation shape: (2, 6, 4)

Concatenated vector for Batch 0, Word 0:
 [ 2.14402292  0.39960117 -2.01801767 -1.62729909]

 full concatenated output:
 [[[ 2.14402292  0.39960117 -2.01801767 -1.62729909]
  [ 2.67248838  0.49704621 -0.56197836 -0.9495289 ]
  [ 3.3800271   0.63468213 -0.60145552  1.72734375]
  [ 3.36749812  0.6325333  -2.36034517 -1.62563163]
  [-0.61925828 -1.69258463 -0.61218319 -1.06819435]
  [-2.84382125 -4.2797494   0.70400426  4.73762408]]

 [[-0.25085677 -2.23087072 -0.27479377 -1.13175258]
  [-1.00781468 -1.51982567 -0.01637377 -0.4856992 ]
  [-4.76704565  3.07631155 -0.28047406 -1.7318373 ]
  [-3.52532198  1.49825583 -0.06779337 -0.73057099]
  [ 0.43357436  1.2195325  -0.2201747  -1.86373055]
  [ 1.7634395  -0.22815464 -0.49290634 -1.46183568]]]


###  Summary of Matrix Transformations

| Step        | Operation              | Resulting Shape / Shape Evolution           | Logic / Explanation                          |
|------------|------------------------|--------------------------------------------|---------------------------------------------|
| Input       | X                     | (2, 6, 4)                                  | Original input embeddings                    |
| Project     | X W_q                 | (2, 6, 4)                                  | Linear projection to Queries (similarly K, V) |
| Split       | Reshape & Transpose   | (2, 2, 6, 2)                               | Split into heads and prepare for attention  |
| Attention   | Q @ Kᵀ                | (2, 2, 6, 6)                               | Word-to-word similarity map                  |
| Scaling     | / √d_head             | (2, 2, 6, 6)                               | Keep values small                            |
| Softmax     | P(x)                  | (2, 2, 6, 6)                               | Normalized attention weights                 |
| Context     | Weights ⋅ V           | (2, 2, 6, 2)                               | The "contextualized" word vectors           |
| Merge       | Transpose & Reshape   | (2, 6, 4)                                  | Combine heads back into original embedding size |
