In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
## Implemented using the Attention notebooks in the Attention directory

def selfAttention(input_embeddings, W_q, W_k, W_v, W_o):
    """Single-head causal (masked) scaled dot-product self-attention.

    Each position may attend only to itself and to earlier positions; the
    attended values are then projected with ``W_o``.

    Args:
        input_embeddings: Tensor whose last two dimensions are (n, d_model).
            Any leading dimensions are treated as batch dimensions.
        W_q: Query projection, multiplied on the right of ``input_embeddings``;
            its last dimension is used as d_k for the score scaling.
        W_k: Key projection, same role as ``W_q`` for keys.
        W_v: Value projection.
        W_o: Output projection applied to the attended values.

    Returns:
        Tensor equal to ``softmax(mask(Q K^T) / sqrt(d_k)) V W_o``.
    """
    # Sequence length is the second-to-last axis so batched input also works.
    n = input_embeddings.shape[-2]
    d_k = W_q.shape[-1]

    Q = torch.matmul(input_embeddings, W_q)
    K = torch.matmul(input_embeddings, W_k)
    V = torch.matmul(input_embeddings, W_v)

    # transpose(-2, -1) swaps only the last two axes, unlike .T which reverses
    # every axis and is only meant for 2-D tensors.
    attention_scores = torch.matmul(Q, K.transpose(-2, -1))

    # Scale with a plain Python scalar so the scores keep their own dtype.
    scaled_scores = attention_scores / (d_k ** 0.5)

    # Lower-triangular boolean mask built on the same device as the scores;
    # True marks the (query, key) pairs that are allowed to interact.
    causal_mask = torch.tril(
        torch.ones(n, n, dtype=torch.bool, device=scaled_scores.device)
    )
    # Future positions get -inf so that softmax assigns them zero weight.
    masked_attention_scores = scaled_scores.masked_fill(~causal_mask, float('-inf'))

    attention_weights = F.softmax(masked_attention_scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    final_output = torch.matmul(output, W_o)

    return final_output

In [12]:
## Implemented using the LayerNorm notebook in the LayerNorm directory
def residualPlusLayerNorm(attention_output, input_embeddings, gamma, beta, eps=1e-5):
    """Add a residual (skip) connection, then layer-normalize over the last axis.

    Args:
        attention_output: Sub-layer output to be added to the skip input.
        input_embeddings: Skip-connection input, summed element-wise with
            ``attention_output``.
        gamma: Scale multiplied into the normalized values.
        beta: Shift added after scaling.
        eps: Constant added to the variance before the square root.

    Returns:
        ``gamma * (x - mean) / sqrt(var + eps) + beta`` where
        ``x = attention_output + input_embeddings`` and the statistics are
        taken over the last dimension.
    """
    summed = attention_output + input_embeddings

    # Statistics over the last axis; keepdim=True so they broadcast back.
    mu = summed.mean(dim=-1, keepdim=True)
    sigma_sq = summed.var(dim=-1, keepdim=True, unbiased=False)  # population variance

    centered = summed - mu
    standardized = centered / torch.sqrt(sigma_sq + eps)

    return standardized * gamma + beta

In [4]:
## Naive whitespace tokenization; a real pipeline would use a subword tokenizer such as BPE or WordPiece
sentence = "The quick brown fox jumps over the lazy dog".split()
n = len(sentence)

print(f"Tokenized sentence: {sentence}")
print(f"Number of tokens: {n}")

Tokenized sentence: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Number of tokens: 9


In [5]:
## Toy word embeddings, hard-coded for illustration; in practice they would be learned during language modelling or loaded from a pre-trained model like GloVe or Word2Vec

# embeddings has shape (n, d): one row per token, d = embedding dimension
embeddings = torch.tensor([
    [1.0, 0.5, 0.2, 0.8],  # The
    [0.3, 1.0, 0.7, 0.1],  # quick
    [0.6, 0.2, 1.0, 0.4],  # brown
    [0.9, 0.8, 0.3, 1.0],  # fox
    [0.4, 0.6, 0.8, 0.2],  # jumps
    [0.7, 0.3, 0.5, 0.9],  # over
    [1.0, 0.5, 0.2, 0.8],  # the (same vector as "The")
    [0.2, 0.9, 0.4, 0.6],  # lazy
    [0.8, 0.4, 0.9, 0.3],  # dog
])

print("Word embeddings (4-dimensional):")
for idx, token in enumerate(sentence):
    row = embeddings[idx]
    print(f"  {token:4}: {row}")
print()

Word embeddings (4-dimensional):
  The : tensor([1.0000, 0.5000, 0.2000, 0.8000])
  quick: tensor([0.3000, 1.0000, 0.7000, 0.1000])
  brown: tensor([0.6000, 0.2000, 1.0000, 0.4000])
  fox : tensor([0.9000, 0.8000, 0.3000, 1.0000])
  jumps: tensor([0.4000, 0.6000, 0.8000, 0.2000])
  over: tensor([0.7000, 0.3000, 0.5000, 0.9000])
  the : tensor([1.0000, 0.5000, 0.2000, 0.8000])
  lazy: tensor([0.2000, 0.9000, 0.4000, 0.6000])
  dog : tensor([0.8000, 0.4000, 0.9000, 0.3000])



In [6]:
## Toy positional encodings, hard-coded for illustration; real models typically compute them with mathematical functions, learn them during training, or use a scheme such as RoPE

# One row per position, same width as the word embeddings
positional_embeddings = torch.tensor([
    [0.0, 1.0, 0.0, 1.0],  # position 0
    [0.1, 0.9, 0.1, 0.9],  # position 1
    [0.2, 0.8, 0.2, 0.8],  # position 2
    [0.3, 0.7, 0.3, 0.7],  # position 3
    [0.4, 0.6, 0.4, 0.6],  # position 4
    [0.5, 0.5, 0.5, 0.5],  # position 5
    [0.6, 0.4, 0.6, 0.4],  # position 6
    [0.7, 0.3, 0.7, 0.3],  # position 7
    [0.8, 0.2, 0.8, 0.2],  # position 8
])

print("Positional embeddings (Same dimesnions as word embeddings):")
for pos, token in enumerate(sentence):
    encoding = positional_embeddings[pos]
    print(f"  Pos {pos} ({token}): {encoding}")
print()

Positional embeddings (Same dimesnions as word embeddings):
  Pos 0 (The): tensor([0., 1., 0., 1.])
  Pos 1 (quick): tensor([0.1000, 0.9000, 0.1000, 0.9000])
  Pos 2 (brown): tensor([0.2000, 0.8000, 0.2000, 0.8000])
  Pos 3 (fox): tensor([0.3000, 0.7000, 0.3000, 0.7000])
  Pos 4 (jumps): tensor([0.4000, 0.6000, 0.4000, 0.6000])
  Pos 5 (over): tensor([0.5000, 0.5000, 0.5000, 0.5000])
  Pos 6 (the): tensor([0.6000, 0.4000, 0.6000, 0.4000])
  Pos 7 (lazy): tensor([0.7000, 0.3000, 0.7000, 0.3000])
  Pos 8 (dog): tensor([0.8000, 0.2000, 0.8000, 0.2000])



In [7]:
## The Attention block consumes the element-wise sum of the word embeddings and the positional encodings

input_embeddings = torch.add(embeddings, positional_embeddings)

print("Input embeddings (word + positional):")
for idx, token in enumerate(sentence):
    combined = input_embeddings[idx]
    print(f"  {token:4}: {combined}")
print()

Input embeddings (word + positional):
  The : tensor([1.0000, 1.5000, 0.2000, 1.8000])
  quick: tensor([0.4000, 1.9000, 0.8000, 1.0000])
  brown: tensor([0.8000, 1.0000, 1.2000, 1.2000])
  fox : tensor([1.2000, 1.5000, 0.6000, 1.7000])
  jumps: tensor([0.8000, 1.2000, 1.2000, 0.8000])
  over: tensor([1.2000, 0.8000, 1.0000, 1.4000])
  the : tensor([1.6000, 0.9000, 0.8000, 1.2000])
  lazy: tensor([0.9000, 1.2000, 1.1000, 0.9000])
  dog : tensor([1.6000, 0.6000, 1.7000, 0.5000])



In [8]:
# Embedding dimension, read from the width of the input embeddings
d_model = input_embeddings.size(1)
# Dimension of keys and queries (generally kept smaller than d_model so the Q, K and V projections are low rank, for efficiency)
d_k = 3

In [9]:
# Fix the global torch RNG so the weights below are reproducible
torch.manual_seed(42)

## Q, K and V projections are (d_model, d_k); the output projection is (d_k, d_model) so the attention output returns to d_model dimensions.
## The draw order (W_q, W_k, W_v, W_o) determines the values, so it must not be reordered.
W_q = 0.3 * torch.randn((d_model, d_k), dtype=torch.float32)
W_k = 0.3 * torch.randn((d_model, d_k), dtype=torch.float32)
W_v = 0.3 * torch.randn((d_model, d_k), dtype=torch.float32)
W_o = 0.3 * torch.randn((d_k, d_model), dtype=torch.float32)

# Show each projection's shape followed by its values
print(f"W_q (Query weights) shape: {W_q.shape}")
print(W_q)
print(f"\nW_k (Key weights) shape: {W_k.shape}")
print(W_k)
print(f"\nW_v (Value weights) shape: {W_v.shape}")
print(W_v)
print()

print(f"W_o (Output projection weights) shape: {W_o.shape}")
print(W_o)

W_q (Query weights) shape: torch.Size([4, 3])
tensor([[ 0.1010,  0.0386,  0.0703],
        [ 0.0691, -0.3369, -0.0559],
        [ 0.6625, -0.1914,  0.1385],
        [ 0.0802,  0.1605,  0.2428]])

W_k (Key weights) shape: torch.Size([4, 3])
tensor([[ 0.3331, -0.5069, -0.2967],
        [ 0.2874,  0.3966,  0.2452],
        [-0.2298, -0.2252,  0.4058],
        [ 0.2059, -0.0983,  0.2385]])

W_v (Value weights) shape: torch.Size([4, 3])
tensor([[ 0.0845,  0.0168,  0.1568],
        [-0.0715, -0.0150,  0.1579],
        [-0.0025,  0.2187,  0.0399],
        [ 0.2592, -0.3047, -0.2666]])

W_o (Output projection weights) shape: torch.Size([3, 4])
tensor([[ 0.0449, -0.0627, -0.1161,  0.2974],
        [ 0.1404, -0.0615, -0.2223,  0.1086],
        [ 0.5760, -0.0676, -0.1025,  0.0912]])


In [10]:
# Run masked self-attention over the position-aware token embeddings
attention_output = selfAttention(
    input_embeddings,
    W_q=W_q,
    W_k=W_k,
    W_v=W_v,
    W_o=W_o,
)

print("\nAttention output shape:", attention_output.shape)
print(attention_output)


Attention output shape: torch.Size([9, 4])
tensor([[-0.0968,  0.0089,  0.0700,  0.0693],
        [-0.0290,  0.0011,  0.0407,  0.0572],
        [-0.0174, -0.0035,  0.0238,  0.0640],
        [-0.0211, -0.0036,  0.0262,  0.0700],
        [-0.0030, -0.0066,  0.0158,  0.0706],
        [-0.0044, -0.0076,  0.0127,  0.0743],
        [ 0.0043, -0.0094,  0.0083,  0.0785],
        [ 0.0124, -0.0108,  0.0039,  0.0788],
        [ 0.0287, -0.0139, -0.0053,  0.0823]])


In [11]:
## Layer norm applied after the attention layer: scale starts at one, shift at zero
gamma1 = torch.ones((d_model,), dtype=torch.float32)
beta1 = torch.zeros((d_model,), dtype=torch.float32)

## Layer norm applied after the feed forward layer: same initial values, separate tensors
gamma2 = torch.ones((d_model,), dtype=torch.float32)
beta2 = torch.zeros((d_model,), dtype=torch.float32)

In [13]:
# First add-and-norm step: attention output plus its input, normalized with gamma1/beta1
layernorm_output = residualPlusLayerNorm(
    attention_output,
    input_embeddings,
    gamma=gamma1,
    beta=beta1,
)

print("\nLayerNorm output shape:", layernorm_output.shape)
print(layernorm_output)


LayerNorm output shape: torch.Size([9, 4])
tensor([[-0.3856,  0.6098, -1.4263,  1.2021],
        [-1.2114,  1.5489, -0.3639,  0.0265],
        [-1.4708, -0.3633,  0.8130,  1.0211],
        [-0.2092,  0.5371, -1.5082,  1.1802],
        [-1.1857,  0.9293,  1.0492, -0.7928],
        [ 0.3073, -1.3051, -0.4240,  1.4219],
        [ 1.4387, -0.7991, -1.0569,  0.4173],
        [-1.2430,  1.3310,  0.5375, -0.6255],
        [ 0.9378, -0.9955,  1.0603, -1.0026]])


In [None]:
# Width of the feed-forward hidden layer (typically 4 times d_model)
hidden_dim = 16

# These draws come from the global torch RNG, so the values depend on the seed set
# earlier in the notebook and on the draw order below (W_ff1, b_ff1, W_ff2, b_ff2).
# Layer 1 expands (n, d_model) -> (n, hidden_dim); its bias has shape (hidden_dim,) and broadcasts across the n tokens.
W_ff1 = 0.3 * torch.randn((d_model, hidden_dim))
b_ff1 = 0.1 * torch.randn((hidden_dim,))
# Layer 2 projects (n, hidden_dim) -> (n, d_model); its bias has shape (d_model,) and broadcasts across the n tokens.
W_ff2 = 0.3 * torch.randn((hidden_dim, d_model))
b_ff2 = 0.1 * torch.randn((d_model,))

# Show each parameter's shape followed by its values
print(f"\nW_ff1 (Feed forward layer 1 weights) shape: {W_ff1.shape}")
print(W_ff1)
print(f"\nb_ff1 (Feed forward layer 1 bias) shape: {b_ff1.shape}")
print(b_ff1)
print(f"\nW_ff2 (Feed forward layer 2 weights) shape: {W_ff2.shape}")
print(W_ff2)
print(f"\nb_ff2 (Feed forward layer 2 bias) shape: {b_ff2.shape}")
print(b_ff2)


W_ff1 (Feed forward layer 1 weights) shape: torch.Size([4, 16])
tensor([[-7.5286e-01,  1.4640e-01,  2.3538e-01,  8.5942e-03,  1.9223e-01,
          1.7497e-01,  3.2008e-01, -1.3505e-01, -5.5580e-02,  2.2583e-01,
          1.2143e-01,  5.3540e-02,  7.9473e-02,  3.8195e-01, -3.9326e-04,
         -9.1081e-02],
        [-4.3711e-01, -3.0701e-02, -1.7975e-01,  1.4312e-01,  2.1785e-01,
          2.7346e-02, -1.1672e-01,  1.5837e-01, -3.8056e-03,  7.2251e-02,
          3.9761e-02,  2.2927e-01,  3.2850e-01,  1.0197e-01,  2.1599e-01,
          1.2342e-01],
        [ 5.7935e-01,  3.0356e-01, -4.3092e-01, -3.3896e-01, -4.0810e-02,
          4.9062e-01,  1.9642e-01,  1.7280e-01,  3.4245e-01,  5.5694e-03,
         -5.4174e-01,  2.7763e-01, -1.1260e-01,  3.0993e-01, -2.0600e-01,
          1.9104e-01],
        [-2.9180e-01,  2.8754e-01,  4.8576e-01,  4.3518e-01,  8.0844e-02,
         -6.3113e-02, -2.1984e-01,  3.1289e-02,  1.0463e-01,  2.9028e-01,
         -1.3971e-01,  4.8144e-01, -7.4404e-01, -1.2

In [None]:
def feedForwardNetwork(layernorm_output, W_ff1, b_ff1, W_ff2, b_ff2, verbose=True):
    """Position-wise two-layer feed-forward network with a ReLU in between.

    Computes ``relu(layernorm_output @ W_ff1 + b_ff1) @ W_ff2 + b_ff2``.

    Args:
        layernorm_output: Input tensor; its last dimension must match the
            first dimension of ``W_ff1``.
        W_ff1: Weights of the first (expanding) linear layer.
        b_ff1: Bias of the first layer, broadcast-added to every row.
        W_ff2: Weights of the second (projecting) linear layer.
        b_ff2: Bias of the second layer, broadcast-added to every row.
        verbose: When True (the default, matching the original behaviour),
            print the shape and values of each intermediate tensor. Pass
            False to compute silently.

    Returns:
        The output of the second linear layer.
    """
    ffn_layer1 = torch.matmul(layernorm_output, W_ff1) + b_ff1 # Shape (n, hidden_dim)
    if verbose:
        print("\nFeed forward layer 1 output shape before ReLU:", ffn_layer1.shape)
        print(ffn_layer1)

    # ReLU is applied element-wise, so the shape is unchanged.
    ffn_layer1_activated = F.relu(ffn_layer1) # Shape (n, hidden_dim)
    if verbose:
        print("\nFeed forward layer 1 output shape after ReLU:", ffn_layer1_activated.shape)
        print(ffn_layer1_activated)

    ffn_layer2 = torch.matmul(ffn_layer1_activated, W_ff2) + b_ff2 # Shape (n, d)
    if verbose:
        print("\nFeed forward layer 2 output shape:", ffn_layer2.shape)
        print(ffn_layer2)

    return ffn_layer2

In [16]:
# Apply the feed-forward network to the normalized attention output
ffn_output = feedForwardNetwork(
    layernorm_output,
    W_ff1=W_ff1,
    b_ff1=b_ff1,
    W_ff2=W_ff2,
    b_ff2=b_ff2,
)


Feed forward layer 1 output shape before ReLU: torch.Size([9, 16])
tensor([[-1.3434, -0.1396,  1.0007,  1.0559,  0.2428, -0.8995, -0.7215, -0.1696,
         -0.5038,  0.4333,  0.7110,  0.3072, -0.7188, -0.6021,  0.0721,  0.3335],
        [-0.1736, -0.3049, -0.3914,  0.3116,  0.1502, -0.4229, -0.6284,  0.2375,
         -0.2206, -0.0207,  0.2368,  0.2072,  0.2791, -0.3452,  0.4781,  0.4411],
        [ 1.2491,  0.3591, -0.1327,  0.0696, -0.2838, -0.0059, -0.4757,  0.2041,
          0.3082,  0.0778, -0.6473,  0.5605, -1.2422, -0.3991, -0.5340,  0.6959],
        [-1.4855, -0.1427,  1.0799,  1.0653,  0.2625, -0.9094, -0.6679, -0.2198,
         -0.5436,  0.4611,  0.7770,  0.2667, -0.7032, -0.5647,  0.0811,  0.2875],
        [ 1.1356, -0.0887, -1.2809, -0.6124, -0.1037,  0.3096, -0.0902,  0.3544,
          0.1785, -0.2896, -0.4358,  0.0644,  0.5281,  0.1420,  0.3469,  0.4326],
        [-0.5115,  0.3880,  1.1828,  0.5438, -0.0643, -0.3528, -0.1277, -0.3864,
         -0.1688,  0.5208,  0.1454, 

In [17]:
# Second add-and-norm step: feed-forward output plus its input, normalized with gamma2/beta2
final_output = residualPlusLayerNorm(
    ffn_output,
    layernorm_output,
    gamma=gamma2,
    beta=beta2,
)

print("\nFinal output shape:", final_output.shape)
print(final_output)


Final output shape: torch.Size([9, 4])
tensor([[-0.0513,  0.1443, -1.4558,  1.3628],
        [-1.1552,  1.5250, -0.5534,  0.1836],
        [-1.3559, -0.5060,  0.6376,  1.2243],
        [ 0.0944,  0.1004, -1.5049,  1.3101],
        [-1.2715,  1.1546,  0.7807, -0.6638],
        [ 0.4127, -1.2556, -0.5527,  1.3956],
        [ 1.2178, -0.5522, -1.3304,  0.6648],
        [-1.1364,  1.4807,  0.3057, -0.6500],
        [ 0.7280, -0.8624,  1.2328, -1.0984]])


In [18]:
## This completes one decoder-style transformer block: masked self-attention -> residual + LayerNorm -> feed-forward network -> residual + LayerNorm. In practice, multiple such blocks are stacked to form a deep transformer model.