In [31]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [32]:
## Implemented using the Attention notebooks in the Attention directory

def selfAttention(input_embeddings, W_q, W_k, W_v, W_o):
    """Single-head causal (masked) self-attention followed by an output projection.

    Args:
        input_embeddings: Tensor of shape (n, d_model); leading batch
            dimensions, i.e. (..., n, d_model), are also accepted.
        W_q: Query projection weights, shape (d_model, d_k).
        W_k: Key projection weights, shape (d_model, d_k).
        W_v: Value projection weights; its second dimension must match
            the first dimension of W_o.
        W_o: Output projection weights applied to the attention result.

    Returns:
        Tensor with the same leading dimensions as input_embeddings and a
        last dimension of W_o.shape[-1].

    Note:
        A fresh nn.Dropout(p=0.2) (training mode) is applied to the attention
        weights and again to the projected output, so the result is stochastic
        unless the torch RNG is seeded before the call.
    """
    # Index from the end so an optional leading batch dimension is accepted.
    n = input_embeddings.shape[-2]
    d_k = W_q.shape[-1]

    Q = torch.matmul(input_embeddings, W_q)
    K = torch.matmul(input_embeddings, W_k)
    V = torch.matmul(input_embeddings, W_v)
    dropout = nn.Dropout(p=0.2)

    # Lower-triangular mask: position i may attend to positions 0..i only.
    # Built on the input's device so masked_fill also works for non-CPU tensors.
    causal_mask = torch.ones(n, n, dtype=torch.bool, device=input_embeddings.device).tril()

    # Scaled dot-product scores; transpose(-2, -1) swaps only the last two
    # dimensions, which keeps batched inputs valid (unlike .T).
    attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)

    # Future positions get -inf so softmax assigns them exactly zero weight.
    masked_attention_scores = attention_scores.masked_fill(~causal_mask, float('-inf'))

    attention_weights = F.softmax(masked_attention_scores, dim=-1)
    attention_weights = dropout(attention_weights)
    output = torch.matmul(attention_weights, V)
    final_output = torch.matmul(output, W_o)
    final_output = dropout(final_output)

    return final_output

In [33]:
## Implemented using the LayerNorm notebook in the LayerNorm directory
def residualPlusLayerNorm(attention_output, input_embeddings, gamma, beta, eps=1e-5):
    """Add-and-norm step: residual connection followed by layer normalization.

    The two inputs are summed element-wise, then every row (last dimension)
    is standardised to zero mean and unit variance using the population
    variance, and finally scaled by gamma and shifted by beta.

    Args:
        attention_output: Sub-layer output to add to the residual stream.
        input_embeddings: Sub-layer input (the residual branch); must be
            broadcast-compatible with attention_output.
        gamma: Scale applied after normalization (broadcast over rows).
        beta: Shift applied after scaling (broadcast over rows).
        eps: Small constant added to the variance before the square root
            to avoid division by zero.

    Returns:
        The normalized tensor, same shape as the summed inputs.
    """
    summed = attention_output + input_embeddings

    # Per-row statistics over the feature dimension; keepdim=True keeps a
    # trailing size-1 axis so they broadcast against the (n, d) tensor.
    row_mean = summed.mean(dim=-1, keepdim=True)
    row_var = summed.var(dim=-1, keepdim=True, unbiased=False)

    standardized = (summed - row_mean) / (row_var + eps).sqrt()
    return standardized * gamma + beta

In [34]:
## Whitespace tokenization keeps the example simple; a real model would use a subword tokenizer such as BPE or WordPiece
sentence = "The quick brown fox jumps over the lazy dog".split()
n = len(sentence)  # number of tokens in the sequence

print(f"Tokenized sentence: {sentence}")
print(f"Number of tokens: {n}")

Tokenized sentence: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Number of tokens: 9


In [35]:
## Sample word embeddings, ideally these would be learned in the language modelling process or loaded from a pre-trained model like GloVe or Word2Vec
## These are hand-written illustrative values; row i is the vector for sentence[i].

# Shape of embeddings: (n, d) where n is number of tokens and d is embedding dimension
# Here n = 9 rows and d = 4 columns. Rows 0 and 6 ("The" / "the") are given the same vector.
embeddings = torch.tensor([
        [1.0, 0.5, 0.2, 0.8],  # The
        [0.3, 1.0, 0.7, 0.1],  # quick
        [0.6, 0.2, 1.0, 0.4],  # brown
        [0.9, 0.8, 0.3, 1.0],  # fox
        [0.4, 0.6, 0.8, 0.2],  # jumps
        [0.7, 0.3, 0.5, 0.9],  # over
        [1.0, 0.5, 0.2, 0.8],  # the
        [0.2, 0.9, 0.4, 0.6],  # lazy
        [0.8, 0.4, 0.9, 0.3]   # dog
    ])

print("Word embeddings (4-dimensional):")
# Pair each token with its row; {word:4} pads to a minimum width of 4, so
# tokens longer than 4 characters are printed unpadded and the columns shift.
for i, word in enumerate(sentence):
    print(f"  {word:4}: {embeddings[i]}")
print()

Word embeddings (4-dimensional):
  The : tensor([1.0000, 0.5000, 0.2000, 0.8000])
  quick: tensor([0.3000, 1.0000, 0.7000, 0.1000])
  brown: tensor([0.6000, 0.2000, 1.0000, 0.4000])
  fox : tensor([0.9000, 0.8000, 0.3000, 1.0000])
  jumps: tensor([0.4000, 0.6000, 0.8000, 0.2000])
  over: tensor([0.7000, 0.3000, 0.5000, 0.9000])
  the : tensor([1.0000, 0.5000, 0.2000, 0.8000])
  lazy: tensor([0.2000, 0.9000, 0.4000, 0.6000])
  dog : tensor([0.8000, 0.4000, 0.9000, 0.3000])



In [36]:
## Sample positional encodings. In practice these are typically generated with fixed math functions (e.g. sinusoids), learned during training, or replaced by a scheme such as RoPE

# Hand-written illustrative values with the same (n, d) = (9, 4) shape as the word embeddings,
# so the two tensors can be added element-wise.
# Row i follows the pattern [0.1*i, 1 - 0.1*i, 0.1*i, 1 - 0.1*i], giving every position a distinct vector.
positional_embeddings = torch.tensor([
    [0.0, 1.0, 0.0, 1.0],  # position 0
    [0.1, 0.9, 0.1, 0.9],  # position 1
    [0.2, 0.8, 0.2, 0.8],  # position 2
    [0.3, 0.7, 0.3, 0.7],  # position 3
    [0.4, 0.6, 0.4, 0.6],  # position 4
    [0.5, 0.5, 0.5, 0.5],  # position 5
    [0.6, 0.4, 0.6, 0.4],  # position 6
    [0.7, 0.3, 0.7, 0.3],  # position 7
    [0.8, 0.2, 0.8, 0.2]   # position 8
])

print("Positional embeddings (Same dimesnions as word embeddings):")
# Show each position index next to the token that occupies it.
for i, word in enumerate(sentence):
    print(f"  Pos {i} ({word}): {positional_embeddings[i]}")
print()

Positional embeddings (Same dimesnions as word embeddings):
  Pos 0 (The): tensor([0., 1., 0., 1.])
  Pos 1 (quick): tensor([0.1000, 0.9000, 0.1000, 0.9000])
  Pos 2 (brown): tensor([0.2000, 0.8000, 0.2000, 0.8000])
  Pos 3 (fox): tensor([0.3000, 0.7000, 0.3000, 0.7000])
  Pos 4 (jumps): tensor([0.4000, 0.6000, 0.4000, 0.6000])
  Pos 5 (over): tensor([0.5000, 0.5000, 0.5000, 0.5000])
  Pos 6 (the): tensor([0.6000, 0.4000, 0.6000, 0.4000])
  Pos 7 (lazy): tensor([0.7000, 0.3000, 0.7000, 0.3000])
  Pos 8 (dog): tensor([0.8000, 0.2000, 0.8000, 0.2000])



In [37]:
## The final input to the Attention block is the sum of the word embeddings and positional encodings
# Seed BEFORE the first stochastic operation: this dropout call is the first use of the
# torch RNG in the notebook, so without a seed here input_embeddings (and every result
# computed from it) would change on each Restart & Run All.
torch.manual_seed(42)

# nn.Dropout is in training mode by default: each element is zeroed with probability 0.2
# and the surviving elements are scaled by 1 / (1 - 0.2) = 1.25.
dropout = nn.Dropout(p=0.2)
input_embeddings = embeddings + positional_embeddings
input_embeddings = dropout(input_embeddings)

print("Input embeddings (word + positional):")
for i, word in enumerate(sentence):
    print(f"  {word:4}: {input_embeddings[i]}")
print()

Input embeddings (word + positional):
  The : tensor([1.2500, 1.8750, 0.0000, 2.2500])
  quick: tensor([0.5000, 2.3750, 0.0000, 1.2500])
  brown: tensor([1.0000, 1.2500, 1.5000, 1.5000])
  fox : tensor([1.5000, 0.0000, 0.0000, 2.1250])
  jumps: tensor([1.0000, 0.0000, 1.5000, 1.0000])
  over: tensor([0.0000, 1.0000, 0.0000, 1.7500])
  the : tensor([2.0000, 1.1250, 1.0000, 1.5000])
  lazy: tensor([0.0000, 1.5000, 1.3750, 1.1250])
  dog : tensor([0.0000, 0.0000, 2.1250, 0.0000])



In [38]:
# Model width, read from the last dimension of the (n, d) input tensor.
d_model = input_embeddings.shape[1]  # Embedding dimension
# Width of the query, key and value projections for the single attention head used in this notebook.
d_k = 3 # Dimension of keys and queries; chosen smaller than d_model here (in multi-head attention it is commonly d_model / num_heads)

In [39]:
torch.manual_seed(42)  # Fixed seed so the sampled weights are identical on every run

## Q, K and V projections are (d_model, d_k); the output projection is (d_k, d_model) so the
## attention result is mapped back to d_model dimensions.
# Draw order is W_q, W_k, W_v, then W_o; the 0.3 factor keeps the initial values small.
W_q, W_k, W_v = (torch.randn(d_model, d_k, dtype=torch.float32) * 0.3 for _ in range(3))
W_o = torch.randn(d_k, d_model, dtype=torch.float32) * 0.3

# Each call prints a label line followed by the tensor on the next line.
print(f"W_q (Query weights) shape: {W_q.shape}", W_q, sep="\n")
print(f"\nW_k (Key weights) shape: {W_k.shape}", W_k, sep="\n")
print(f"\nW_v (Value weights) shape: {W_v.shape}", W_v, sep="\n")
print()

print(f"W_o (Output projection weights) shape: {W_o.shape}", W_o, sep="\n")

W_q (Query weights) shape: torch.Size([4, 3])
tensor([[ 0.1010,  0.0386,  0.0703],
        [ 0.0691, -0.3369, -0.0559],
        [ 0.6625, -0.1914,  0.1385],
        [ 0.0802,  0.1605,  0.2428]])

W_k (Key weights) shape: torch.Size([4, 3])
tensor([[ 0.3331, -0.5069, -0.2967],
        [ 0.2874,  0.3966,  0.2452],
        [-0.2298, -0.2252,  0.4058],
        [ 0.2059, -0.0983,  0.2385]])

W_v (Value weights) shape: torch.Size([4, 3])
tensor([[ 0.0845,  0.0168,  0.1568],
        [-0.0715, -0.0150,  0.1579],
        [-0.0025,  0.2187,  0.0399],
        [ 0.2592, -0.3047, -0.2666]])

W_o (Output projection weights) shape: torch.Size([3, 4])
tensor([[ 0.0449, -0.0627, -0.1161,  0.2974],
        [ 0.1404, -0.0615, -0.2223,  0.1086],
        [ 0.5760, -0.0676, -0.1025,  0.0912]])


In [40]:
# Run masked self-attention over the position-aware token embeddings.
attention_output = selfAttention(input_embeddings, W_q, W_k, W_v, W_o)

print(f"\nAttention output shape: {attention_output.shape}")
print(attention_output)


Attention output shape: torch.Size([9, 4])
tensor([[-0.2100,  0.0000,  0.0000,  0.1249],
        [-0.1051,  0.0165,  0.0000,  0.0878],
        [-0.0847,  0.0023,  0.0583,  0.0979],
        [-0.1094,  0.0071,  0.0883,  0.1120],
        [-0.0686, -0.0039,  0.0128,  0.0782],
        [-0.1572,  0.0067,  0.0631,  0.1061],
        [-0.0728, -0.0029,  0.0369,  0.1038],
        [-0.0702, -0.0043,  0.0421,  0.1188],
        [-0.0000,  0.0010,  0.0579,  0.0000]])


In [41]:
## LayerNorm parameters start as the identity transform: scale (gamma) = 1 and shift (beta) = 0.

# Parameters for the layer norm applied after the attention layer
gamma1, beta1 = torch.ones(d_model, dtype=torch.float32), torch.zeros(d_model, dtype=torch.float32)

# Parameters for the layer norm applied after the feed forward layer
gamma2, beta2 = torch.ones(d_model, dtype=torch.float32), torch.zeros(d_model, dtype=torch.float32)

In [42]:
# First add-and-norm: add the attention output to its input, then layer-normalize.
layernorm_output = residualPlusLayerNorm(attention_output, input_embeddings, gamma1, beta1)

print(f"\nLayerNorm output shape: {layernorm_output.shape}")
print(layernorm_output)


LayerNorm output shape: torch.Size([9, 4])
tensor([[-0.3138,  0.6138, -1.4690,  1.1691],
        [-0.6887,  1.4729, -1.1162,  0.3321],
        [-1.5128, -0.2863,  0.8275,  0.9716],
        [ 0.4929, -0.9902, -0.9031,  1.4003],
        [ 0.0936, -1.5974,  1.1447,  0.3591],
        [-1.0595,  0.3924, -0.7847,  1.4518],
        [ 1.3911, -0.8280, -1.0627,  0.4997],
        [-1.7142,  0.7445,  0.6209,  0.3488],
        [-0.5777, -0.5767,  1.7320, -0.5777]])


In [43]:
# Width of the feed-forward hidden layer; typically 4 times d_model
hidden_dim = 16

# Parameters are drawn in the order W_ff1, b_ff1, W_ff2, b_ff2.
# Layer 1 expands (n, d_model) -> (n, hidden_dim); its bias is broadcast across the n tokens.
W_ff1 = 0.3 * torch.randn(d_model, hidden_dim)
b_ff1 = 0.1 * torch.randn(hidden_dim)
# Layer 2 projects (n, hidden_dim) -> (n, d_model); its bias is broadcast across the n tokens.
W_ff2 = 0.3 * torch.randn(hidden_dim, d_model)
b_ff2 = 0.1 * torch.randn(d_model)
dropout_ff = nn.Dropout(p=0.2)

# Each call prints a label line followed by the tensor on the next line.
print(f"\nW_ff1 (Feed forward layer 1 weights) shape: {W_ff1.shape}", W_ff1, sep="\n")
print(f"\nb_ff1 (Feed forward layer 1 bias) shape: {b_ff1.shape}", b_ff1, sep="\n")
print(f"\nW_ff2 (Feed forward layer 2 weights) shape: {W_ff2.shape}", W_ff2, sep="\n")
print(f"\nb_ff2 (Feed forward layer 2 bias) shape: {b_ff2.shape}", b_ff2, sep="\n")


W_ff1 (Feed forward layer 1 weights) shape: torch.Size([4, 16])
tensor([[-0.0667, -0.3741, -0.1459, -0.1008,  0.0110,  0.1480,  0.2656,  0.0547,
          0.5679,  0.1334,  0.0409,  0.0926,  0.4985,  0.0525,  0.1825,  0.4893],
        [ 0.8556, -0.2231,  0.0586, -0.4005,  0.1184,  0.5118, -0.2382,  0.1126,
          0.0229, -0.0638, -0.1699,  0.1197,  0.4108, -0.0756,  0.5701,  0.5085],
        [ 0.0917,  0.0873,  0.1226, -0.3783,  0.2750, -0.0084, -0.0655,  0.0499,
         -0.0261, -0.3541,  0.4638,  0.1634,  0.2980,  0.1520, -0.0419, -0.3542],
        [ 0.5943, -0.0314,  0.1471, -0.1312, -0.3660, -0.1756,  0.1999, -0.0223,
          0.2980,  0.0809, -0.5495,  0.1071,  0.2743,  0.6565, -0.2955, -0.7465]])

b_ff1 (Feed forward layer 1 bias) shape: torch.Size([16])
tensor([-0.1424, -0.0696, -0.0318,  0.1215,  0.1420, -0.0055,  0.0025, -0.1064,
        -0.0364, -0.0099,  0.0311,  0.0371,  0.0270,  0.0790,  0.0945, -0.1582])

W_ff2 (Feed forward layer 2 weights) shape: torch.Size([16, 4

In [44]:
def feedForwardNetwork(layernorm_output, W_ff1, b_ff1, W_ff2, b_ff2, dropout=None, verbose=True):
    """Position-wise feed-forward network: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        layernorm_output: Input tensor of shape (n, d).
        W_ff1: First-layer weights, shape (d, hidden_dim).
        b_ff1: First-layer bias, shape (hidden_dim,), broadcast across rows.
        W_ff2: Second-layer weights, shape (hidden_dim, d).
        b_ff2: Second-layer bias, shape (d,), broadcast across rows.
        dropout: Optional callable applied after the activation and after the
            second layer. Defaults to a fresh nn.Dropout(p=0.2), so the
            function no longer depends on a notebook-level global. Pass
            nn.Identity() for deterministic output.
        verbose: When True (default), print the intermediate tensors.

    Returns:
        Tensor of shape (n, d).
    """
    if dropout is None:
        dropout = nn.Dropout(p=0.2)

    ffn_layer1 = torch.matmul(layernorm_output, W_ff1) + b_ff1 # Shape (n, hidden_dim)
    if verbose:
        print("\nFeed forward layer 1 output shape before GELU:", ffn_layer1.shape)
        print(ffn_layer1)

    # GELU is applied element-wise to the output of the first layer.
    ffn_layer1_activated = F.gelu(ffn_layer1) # Shape (n, hidden_dim)
    if verbose:
        print("\nFeed forward layer 1 output shape after GELU:", ffn_layer1_activated.shape)
    ffn_layer1_activated = dropout(ffn_layer1_activated)
    if verbose:
        # The values shown here are after both GELU and dropout.
        print(ffn_layer1_activated)

    ffn_layer2 = torch.matmul(ffn_layer1_activated, W_ff2) + b_ff2 # Shape (n, d)
    ffn_layer2 = dropout(ffn_layer2)
    if verbose:
        print("\nFeed forward layer 2 output shape:", ffn_layer2.shape)
        print(ffn_layer2)

    return ffn_layer2

In [45]:
# Apply the position-wise feed-forward network to the output of the first add-and-norm step.
# The function also prints its intermediate tensors, which produces the output shown below.
ffn_output = feedForwardNetwork(layernorm_output, W_ff1, b_ff1, W_ff2, b_ff2)


Feed forward layer 1 output shape before ReLU: torch.Size([9, 16])
tensor([[ 0.9638, -0.2541,  0.0418,  0.3096, -0.6207,  0.0692,  0.1029, -0.1538,
          0.1861,  0.5237, -1.4098, -0.0333,  0.0055,  0.5603,  0.1031, -0.3520],
        [ 1.2587, -0.2484,  0.0670, -0.0203, -0.1197,  0.5975, -0.3917, -0.0414,
         -0.2658,  0.2263, -0.9475,  0.0028,  0.0472, -0.0202,  0.7571,  0.4013],
        [ 0.3669,  0.6019,  0.4164, -0.0518, -0.0367, -0.5535, -0.1912, -0.2018,
         -0.6342, -0.4079, -0.1323,  0.1020, -0.3317,  0.7848, -0.6667, -2.0625],
        [-0.2730, -0.1560, -0.0665,  0.6263, -0.7306, -0.6776,  0.7084, -0.2672,
          0.6618,  0.5520, -0.9689, -0.0333, -0.0192,  0.9617, -0.7561, -1.1461],
        [-1.1970,  0.3403,  0.0540,  0.2717,  0.1373, -0.8819,  0.4046, -0.2320,
          0.0574, -0.2718,  0.6399,  0.0802, -0.1430,  0.6143, -0.9532, -1.5983],
        [ 1.0548,  0.1251,  0.2631,  0.1775, -0.5704, -0.2099, -0.0307, -0.1917,
         -0.1760,  0.2189, -1.2407, 

In [46]:
# Second add-and-norm: add the feed-forward output to its input, then layer-normalize.
final_output = residualPlusLayerNorm(ffn_output, layernorm_output, gamma2, beta2)

print(f"\nFinal output shape: {final_output.shape}")
print(final_output)


Final output shape: torch.Size([9, 4])
tensor([[ 0.3518,  0.3761, -1.6811,  0.9532],
        [ 0.0662,  1.0780, -1.6159,  0.4716],
        [-1.2074, -0.7282,  1.2309,  0.7047],
        [ 0.9624, -1.5468, -0.2131,  0.7974],
        [ 0.4372, -1.4138,  1.3047, -0.3281],
        [-0.0773,  0.3589, -1.5238,  1.2422],
        [ 1.4181, -1.2953, -0.4509,  0.3281],
        [-1.4351,  1.3783, -0.1117,  0.1685],
        [ 0.1541, -0.5898,  1.5470, -1.1113]])


In [18]:
## This completes one decoder-style transformer block: masked self-attention -> add & norm -> feed-forward -> add & norm. In practice, multiple such blocks are stacked to form a deep transformer model.