In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
## Implemented using the Attention notebooks in the Attention directory

def selfAttention(input_embeddings, W_q, W_k, W_v, W_o, dropout_p=0.2, training=True):
    """Single-head causal (masked) scaled dot-product self-attention with an output projection.

    Args:
        input_embeddings: 2-D tensor of shape (n, d_model), one row per token.
        W_q: Query projection of shape (d_model, d_k).
        W_k: Key projection of shape (d_model, d_k).
        W_v: Value projection; its row count must match d_model.
        W_o: Output projection; its row count must match the column count of W_v.
        dropout_p: Dropout probability applied to the attention weights and to the
            projected output. The default (0.2) matches the previously hard-coded value.
        training: When True (default) dropout is active; pass False to disable it
            (e.g. for inference or deterministic checks).

    Returns:
        Tensor of shape (n, W_o.shape[1]).
    """
    n = input_embeddings.shape[0]
    d_k = W_q.shape[1]

    Q = torch.matmul(input_embeddings, W_q)
    K = torch.matmul(input_embeddings, W_k)
    V = torch.matmul(input_embeddings, W_v)

    # Lower-triangular mask: token i may only attend to tokens 0..i.
    # Built on the input's device so the function also works for non-CPU tensors.
    mask = torch.tril(torch.ones(n, n, device=input_embeddings.device))

    # Scale by sqrt(d_k) first, then hide future positions with -inf so that
    # softmax assigns them exactly zero weight.
    attention_scores = torch.matmul(Q, K.T) / (d_k ** 0.5)
    masked_attention_scores = attention_scores.masked_fill(mask == 0, float('-inf'))

    attention_weights = F.softmax(masked_attention_scores, dim=-1)
    attention_weights = F.dropout(attention_weights, p=dropout_p, training=training)

    output = torch.matmul(attention_weights, V)
    final_output = torch.matmul(output, W_o)
    final_output = F.dropout(final_output, p=dropout_p, training=training)

    return final_output

In [15]:
## Implemented using the LayerNorm notebook in the LayerNorm directory
def residualPlusLayerNorm(attention_output, input_embeddings, gamma, beta, eps = 1e-5,):
    """Add a residual connection and layer-normalise the sum over its last dimension.

    Args:
        attention_output: Output of the sub-layer being wrapped.
        input_embeddings: Input of that sub-layer (the residual branch).
        gamma: Scale applied to the normalised values.
        beta: Shift added after scaling.
        eps: Constant added to the variance before the square root.

    Returns:
        gamma * normalised(attention_output + input_embeddings) + beta.
    """
    summed = attention_output + input_embeddings

    # Statistics are taken along the last dimension and kept as a trailing
    # size-1 dimension so they broadcast against `summed`.
    mu = summed.mean(dim=-1, keepdim=True)
    sigma_sq = summed.var(dim=-1, keepdim=True, unbiased=False)  # biased (population) variance

    standardised = (summed - mu) / (sigma_sq + eps).sqrt()
    return standardised * gamma + beta

In [16]:
## Whitespace tokenization keeps the example small; a real pipeline would use a subword tokenizer such as BPE or WordPiece
sentence = "The quick brown fox jumps over the lazy dog".split()
n = len(sentence)  # number of tokens

print(
    f"Tokenized sentence: {sentence}",
    f"Number of tokens: {len(sentence)}",
    sep="\n",
)

Tokenized sentence: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Number of tokens: 9


In [17]:
## Hand-written toy word embeddings; in a real model they are learned during language modelling or loaded from pre-trained vectors such as GloVe or Word2Vec

# One row per token, so the tensor has shape (n, d): n tokens, d embedding dimensions
embeddings = torch.tensor(
    [
        [1.0, 0.5, 0.2, 0.8],  # The
        [0.3, 1.0, 0.7, 0.1],  # quick
        [0.6, 0.2, 1.0, 0.4],  # brown
        [0.9, 0.8, 0.3, 1.0],  # fox
        [0.4, 0.6, 0.8, 0.2],  # jumps
        [0.7, 0.3, 0.5, 0.9],  # over
        [1.0, 0.5, 0.2, 0.8],  # the (same vector as row 0)
        [0.2, 0.9, 0.4, 0.6],  # lazy
        [0.8, 0.4, 0.9, 0.3],  # dog
    ]
)

# Show each token next to its embedding row
print("Word embeddings (4-dimensional):")
for i, word in enumerate(sentence):
    print(f"  {word:4}: {embeddings[i]}")
print()

Word embeddings (4-dimensional):
  The : tensor([1.0000, 0.5000, 0.2000, 0.8000])
  quick: tensor([0.3000, 1.0000, 0.7000, 0.1000])
  brown: tensor([0.6000, 0.2000, 1.0000, 0.4000])
  fox : tensor([0.9000, 0.8000, 0.3000, 1.0000])
  jumps: tensor([0.4000, 0.6000, 0.8000, 0.2000])
  over: tensor([0.7000, 0.3000, 0.5000, 0.9000])
  the : tensor([1.0000, 0.5000, 0.2000, 0.8000])
  lazy: tensor([0.2000, 0.9000, 0.4000, 0.6000])
  dog : tensor([0.8000, 0.4000, 0.9000, 0.3000])



In [18]:
## Hand-written toy positional encodings; real models generate them with fixed functions (e.g. sinusoids), learn them during training, or use a scheme such as RoPE

# Row p belongs to position p and follows the pattern [0.1*p, 1 - 0.1*p, 0.1*p, 1 - 0.1*p]
positional_embeddings = torch.tensor(
    [
        [0.0, 1.0, 0.0, 1.0],  # position 0
        [0.1, 0.9, 0.1, 0.9],  # position 1
        [0.2, 0.8, 0.2, 0.8],  # position 2
        [0.3, 0.7, 0.3, 0.7],  # position 3
        [0.4, 0.6, 0.4, 0.6],  # position 4
        [0.5, 0.5, 0.5, 0.5],  # position 5
        [0.6, 0.4, 0.6, 0.4],  # position 6
        [0.7, 0.3, 0.7, 0.3],  # position 7
        [0.8, 0.2, 0.8, 0.2],  # position 8
    ]
)

# Show each position (and the token sitting there) next to its encoding row
print("Positional embeddings (Same dimesnions as word embeddings):")
for i, word in enumerate(sentence):
    print(f"  Pos {i} ({word}): {positional_embeddings[i]}")
print()

Positional embeddings (Same dimesnions as word embeddings):
  Pos 0 (The): tensor([0., 1., 0., 1.])
  Pos 1 (quick): tensor([0.1000, 0.9000, 0.1000, 0.9000])
  Pos 2 (brown): tensor([0.2000, 0.8000, 0.2000, 0.8000])
  Pos 3 (fox): tensor([0.3000, 0.7000, 0.3000, 0.7000])
  Pos 4 (jumps): tensor([0.4000, 0.6000, 0.4000, 0.6000])
  Pos 5 (over): tensor([0.5000, 0.5000, 0.5000, 0.5000])
  Pos 6 (the): tensor([0.6000, 0.4000, 0.6000, 0.4000])
  Pos 7 (lazy): tensor([0.7000, 0.3000, 0.7000, 0.3000])
  Pos 8 (dog): tensor([0.8000, 0.2000, 0.8000, 0.2000])



In [19]:
## The final input to the Attention block is the sum of the word embeddings and positional encodings, followed by dropout

# Dropout is stochastic, so seed the RNG first; otherwise Restart Kernel -> Run All
# produces different input embeddings (and therefore different downstream outputs) every run.
# The weight-initialisation cell later in the notebook re-seeds before drawing the weights,
# so those weights are unaffected by this call.
torch.manual_seed(42)

dropout = nn.Dropout(p=0.2)  # a freshly built module is in training mode, so dropout is active
input_embeddings = embeddings + positional_embeddings
input_embeddings = dropout(input_embeddings)  # surviving entries are rescaled by 1 / (1 - p)

print("Input embeddings (word + positional):")
for i, word in enumerate(sentence):
    print(f"  {word:4}: {input_embeddings[i]}")
print()

Input embeddings (word + positional):
  The : tensor([1.2500, 0.0000, 0.0000, 2.2500])
  quick: tensor([0.5000, 2.3750, 0.0000, 1.2500])
  brown: tensor([0.0000, 1.2500, 1.5000, 1.5000])
  fox : tensor([1.5000, 1.8750, 0.7500, 2.1250])
  jumps: tensor([1.0000, 1.5000, 1.5000, 1.0000])
  over: tensor([1.5000, 1.0000, 1.2500, 1.7500])
  the : tensor([2.0000, 1.1250, 1.0000, 1.5000])
  lazy: tensor([1.1250, 1.5000, 1.3750, 1.1250])
  dog : tensor([2.0000, 0.7500, 2.1250, 0.0000])



In [20]:
# Width of the query/key projections (generally chosen smaller than d_model so the Q, K and V matrices are low rank and cheaper)
d_k = 3
# Embedding dimension = number of columns of the attention input
d_model = input_embeddings.size(1)

In [21]:
# Fix the RNG so the weights below are identical on every run
torch.manual_seed(42)

## Q, K and V projections map d_model -> d_k, i.e. shape (d_model, d_k);
## the output projection maps d_k -> d_model, i.e. shape (d_k, d_model), bringing the attention output back to d_model dimensions.
## The draw order (W_q, W_k, W_v, W_o) determines the values obtained under the seed above.
W_q = 0.3 * torch.randn(d_model, d_k, dtype=torch.float32)
W_k = 0.3 * torch.randn(d_model, d_k, dtype=torch.float32)
W_v = 0.3 * torch.randn(d_model, d_k, dtype=torch.float32)
W_o = 0.3 * torch.randn(d_k, d_model, dtype=torch.float32)

# Header line followed by the matrix itself, one pair per weight
print(f"W_q (Query weights) shape: {W_q.shape}", W_q, sep="\n")
print(f"\nW_k (Key weights) shape: {W_k.shape}", W_k, sep="\n")
print(f"\nW_v (Value weights) shape: {W_v.shape}", W_v, sep="\n")
print()

print(f"W_o (Output projection weights) shape: {W_o.shape}", W_o, sep="\n")

W_q (Query weights) shape: torch.Size([4, 3])
tensor([[ 0.1010,  0.0386,  0.0703],
        [ 0.0691, -0.3369, -0.0559],
        [ 0.6625, -0.1914,  0.1385],
        [ 0.0802,  0.1605,  0.2428]])

W_k (Key weights) shape: torch.Size([4, 3])
tensor([[ 0.3331, -0.5069, -0.2967],
        [ 0.2874,  0.3966,  0.2452],
        [-0.2298, -0.2252,  0.4058],
        [ 0.2059, -0.0983,  0.2385]])

W_v (Value weights) shape: torch.Size([4, 3])
tensor([[ 0.0845,  0.0168,  0.1568],
        [-0.0715, -0.0150,  0.1579],
        [-0.0025,  0.2187,  0.0399],
        [ 0.2592, -0.3047, -0.2666]])

W_o (Output projection weights) shape: torch.Size([3, 4])
tensor([[ 0.0449, -0.0627, -0.1161,  0.2974],
        [ 0.1404, -0.0615, -0.2223,  0.1086],
        [ 0.5760, -0.0676, -0.1025,  0.0912]])


In [22]:
# Run masked self-attention over the (word + positional) input embeddings
attention_output = selfAttention(
    input_embeddings=input_embeddings,
    W_q=W_q,
    W_k=W_k,
    W_v=W_v,
    W_o=W_o,
)

# Shape first, then the full tensor
print("\nAttention output shape:", attention_output.shape)
print(attention_output)


Attention output shape: torch.Size([9, 4])
tensor([[-0.4609,  0.0000,  0.0000,  0.1498],
        [-0.2590,  0.0261,  0.0000,  0.1045],
        [-0.2106,  0.0147,  0.0693,  0.0813],
        [-0.1512,  0.0088,  0.0871,  0.1181],
        [-0.0072, -0.0095,  0.0098,  0.0866],
        [-0.0805, -0.0063,  0.0318,  0.1259],
        [ 0.0090, -0.0152,  0.0132,  0.1248],
        [-0.0414, -0.0062,  0.0311,  0.1051],
        [ 0.0000, -0.0181,  0.0077,  0.0000]])


In [23]:
## Each layer norm starts as the identity affine map: scale (gamma) of ones, shift (beta) of zeros

## Layer norm applied after the attention layer
gamma1, beta1 = (
    torch.ones(d_model, dtype=torch.float32),
    torch.zeros(d_model, dtype=torch.float32),
)

## Layer norm applied after the feed forward layer (separate tensors, not shared with the first)
gamma2, beta2 = (
    torch.ones(d_model, dtype=torch.float32),
    torch.zeros(d_model, dtype=torch.float32),
)

In [24]:
# First "Add & Norm": residual connection around attention, then layer norm with gamma1 / beta1
layernorm_output = residualPlusLayerNorm(
    attention_output=attention_output,
    input_embeddings=input_embeddings,
    gamma=gamma1,
    beta=beta1,
)

# Shape first, then the full tensor
print("\nLayerNorm output shape:", layernorm_output.shape)
print(layernorm_output)


LayerNorm output shape: torch.Size([9, 4])
tensor([[-0.0083, -0.8137, -0.8137,  1.6357],
        [-0.7921,  1.4646, -1.0438,  0.3713],
        [-1.7063,  0.2887,  0.7007,  0.7169],
        [-0.4302,  0.5731, -1.3898,  1.2469],
        [-1.1908,  0.9478,  1.0308, -0.7877],
        [ 0.0840, -1.2528, -0.3483,  1.5171],
        [ 1.4144, -0.8176, -1.0575,  0.4608],
        [-1.3864,  1.2012,  0.6478, -0.4625],
        [ 0.8808, -0.5442,  1.0299, -1.3665]])


In [25]:
# Width of the hidden layer of the feed-forward network (typically 4 times d_model)
hidden_dim = 16

# Parameters are drawn in the order W_ff1, b_ff1, W_ff2, b_ff2.
# First layer expands (n, d_model) -> (n, hidden_dim); its bias of shape (hidden_dim,) is broadcast over the n tokens.
W_ff1 = 0.3 * torch.randn(d_model, hidden_dim)
b_ff1 = 0.1 * torch.randn(hidden_dim)
# Second layer projects (n, hidden_dim) -> (n, d_model); its bias of shape (d_model,) is broadcast over the n tokens.
W_ff2 = 0.3 * torch.randn(hidden_dim, d_model)
b_ff2 = 0.1 * torch.randn(d_model)
# Dropout module used inside the feed-forward network
dropout_ff = nn.Dropout(p=0.2)

# Header line followed by the tensor itself, one pair per parameter
print(f"\nW_ff1 (Feed forward layer 1 weights) shape: {W_ff1.shape}", W_ff1, sep="\n")
print(f"\nb_ff1 (Feed forward layer 1 bias) shape: {b_ff1.shape}", b_ff1, sep="\n")
print(f"\nW_ff2 (Feed forward layer 2 weights) shape: {W_ff2.shape}", W_ff2, sep="\n")
print(f"\nb_ff2 (Feed forward layer 2 bias) shape: {b_ff2.shape}", b_ff2, sep="\n")


W_ff1 (Feed forward layer 1 weights) shape: torch.Size([4, 16])
tensor([[-0.0667, -0.3741, -0.1459, -0.1008,  0.0110,  0.1480,  0.2656,  0.0547,
          0.5679,  0.1334,  0.0409,  0.0926,  0.4985,  0.0525,  0.1825,  0.4893],
        [ 0.8556, -0.2231,  0.0586, -0.4005,  0.1184,  0.5118, -0.2382,  0.1126,
          0.0229, -0.0638, -0.1699,  0.1197,  0.4108, -0.0756,  0.5701,  0.5085],
        [ 0.0917,  0.0873,  0.1226, -0.3783,  0.2750, -0.0084, -0.0655,  0.0499,
         -0.0261, -0.3541,  0.4638,  0.1634,  0.2980,  0.1520, -0.0419, -0.3542],
        [ 0.5943, -0.0314,  0.1471, -0.1312, -0.3660, -0.1756,  0.1999, -0.0223,
          0.2980,  0.0809, -0.5495,  0.1071,  0.2743,  0.6565, -0.2955, -0.7465]])

b_ff1 (Feed forward layer 1 bias) shape: torch.Size([16])
tensor([-0.1424, -0.0696, -0.0318,  0.1215,  0.1420, -0.0055,  0.0025, -0.1064,
        -0.0364, -0.0099,  0.0311,  0.0371,  0.0270,  0.0790,  0.0945, -0.1582])

W_ff2 (Feed forward layer 2 weights) shape: torch.Size([16, 4

In [28]:
def feedForwardNetwork(layernorm_output, W_ff1, b_ff1, W_ff2, b_ff2, dropout_p=0.2, training=True):
    """Position-wise feed-forward network: linear -> ReLU -> dropout -> linear -> dropout.

    The function is self-contained: dropout is configured through its own
    arguments instead of a module-level dropout object defined in another cell.
    Intermediate tensors are printed as a side effect.

    Args:
        layernorm_output: Input tensor; its last dimension must match the rows of W_ff1.
        W_ff1: Weights of the first (expanding) layer.
        b_ff1: Bias of the first layer, broadcast over the rows.
        W_ff2: Weights of the second (projecting) layer.
        b_ff2: Bias of the second layer, broadcast over the rows.
        dropout_p: Dropout probability used after the activation and after the
            second layer. The default (0.2) matches the notebook's dropout setting.
        training: When True (default) dropout is active; pass False to disable it.

    Returns:
        Output of the second layer after dropout.
    """
    ffn_layer1 = torch.matmul(layernorm_output, W_ff1) + b_ff1  # Shape (n, hidden_dim)
    print("\nFeed forward layer 1 output shape before ReLU:", ffn_layer1.shape)
    print(ffn_layer1)

    ffn_layer1_activated = F.relu(ffn_layer1)  # Element-wise ReLU, shape unchanged
    print("\nFeed forward layer 1 output shape after ReLU:", ffn_layer1_activated.shape)
    # Note: the tensor printed under the "after ReLU" header is the one after dropout as well.
    ffn_layer1_activated = F.dropout(ffn_layer1_activated, p=dropout_p, training=training)
    print(ffn_layer1_activated)

    ffn_layer2 = torch.matmul(ffn_layer1_activated, W_ff2) + b_ff2  # Shape (n, d)
    ffn_layer2 = F.dropout(ffn_layer2, p=dropout_p, training=training)
    print("\nFeed forward layer 2 output shape:", ffn_layer2.shape)
    print(ffn_layer2)

    return ffn_layer2

In [29]:
# Feed the normalised attention output through the feed-forward network (intermediate tensors are printed by the function)
ffn_output = feedForwardNetwork(
    layernorm_output=layernorm_output,
    W_ff1=W_ff1,
    b_ff1=b_ff1,
    W_ff2=W_ff2,
    b_ff2=b_ff2,
)


Feed forward layer 1 output shape before ReLU: torch.Size([9, 16])
tensor([[ 5.9478e-02, -7.4387e-03,  6.2545e-02,  5.4143e-01, -7.7687e-01,
         -7.0360e-01,  5.7442e-01, -2.7551e-01,  4.4899e-01,  4.6127e-01,
         -1.1073e+00, -1.8807e-02, -1.0533e-01,  1.0902e+00, -8.2028e-01,
         -1.5090e+00],
        [ 1.2885e+00, -2.0279e-01,  9.6221e-02, -3.9086e-02, -1.1629e-01,
          5.7048e-01, -4.1414e-01, -4.5242e-02, -3.1493e-01,  1.9056e-01,
         -9.3831e-01,  8.2218e-03,  2.4594e-02,  1.1769e-02,  7.1889e-01,
          2.9163e-01],
        [ 7.0874e-01,  5.4292e-01,  4.2531e-01, -1.8124e-01,  8.7604e-02,
         -2.4206e-01, -4.2212e-01, -1.4832e-01, -8.0352e-01, -4.4613e-01,
         -1.5677e-01,  1.0491e-01, -2.9955e-01,  5.4469e-01, -2.9363e-01,
         -1.6297e+00],
        [ 9.9027e-01, -1.9699e-01,  7.7588e-02,  2.9747e-01, -6.3347e-01,
          1.6834e-02,  9.2060e-02, -1.6257e-01,  1.4018e-01,  4.8902e-01,
         -1.4137e+00, -2.7710e-02, -2.4247e-02,  

In [30]:
# Second "Add & Norm": residual connection around the feed-forward network, then layer norm with gamma2 / beta2.
# The parameter names come from the first use of the helper; here the sub-layer output is ffn_output
# and the residual branch is layernorm_output.
final_output = residualPlusLayerNorm(
    attention_output=ffn_output,
    input_embeddings=layernorm_output,
    gamma=gamma2,
    beta=beta2,
)

# Shape first, then the full tensor
print("\nFinal output shape:", final_output.shape)
print(final_output)


Final output shape: torch.Size([9, 4])
tensor([[-0.1643, -1.4539,  0.2848,  1.3334],
        [-0.6035,  1.5409, -1.1098,  0.1724],
        [-1.5909,  0.2955,  0.1268,  1.1686],
        [ 0.3898,  0.2581, -1.6603,  1.0124],
        [-0.3772,  0.8226,  1.0186, -1.4640],
        [ 0.5771, -1.7277,  0.4755,  0.6751],
        [ 1.5252, -1.2720, -0.2353, -0.0179],
        [-1.2980,  1.4666,  0.1897, -0.3582],
        [ 0.8719, -0.5328,  1.0343, -1.3734]])


In [18]:
## This completes one block of the transformer architecture (decoder). In practice, multiple such blocks are stacked to form a deep transformer model.