In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
## Implemented using the Attention notebooks in the Attention directory

def selfAttention(input_embeddings, W_q, W_k, W_v, W_o, dropout_p=0.2, training=True):
    """Single-head causal self-attention followed by an output projection.

    Every position attends only to itself and to earlier positions (a
    lower-triangular mask is applied before the softmax). Dropout is applied
    to the attention weights and again to the projected output.

    Args:
        input_embeddings: Tensor of shape (..., n, d_model); leading batch
            dimensions are optional.
        W_q, W_k, W_v: Projection matrices of shape (d_model, d_k).
        W_o: Output projection of shape (d_k, d_model).
        dropout_p: Dropout probability. Defaults to 0.2, the value that was
            previously hard-coded.
        training: When False, dropout is skipped and the result is
            deterministic. Defaults to True (the previous behaviour).

    Returns:
        Tensor of shape (..., n, W_o.shape[-1]).
    """
    n = input_embeddings.shape[-2]
    d_k = W_q.shape[-1]

    Q = torch.matmul(input_embeddings, W_q)
    K = torch.matmul(input_embeddings, W_k)
    V = torch.matmul(input_embeddings, W_v)

    # Build the mask on the same device as the input so non-CPU tensors work.
    causal_mask = torch.tril(
        torch.ones(n, n, dtype=torch.bool, device=input_embeddings.device)
    )

    # transpose(-2, -1) instead of .T so batched (3-D and higher) inputs are handled.
    attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
    # -inf scores become exactly zero weight after the softmax.
    masked_attention_scores = attention_scores.masked_fill(~causal_mask, float('-inf'))

    attention_weights = F.softmax(masked_attention_scores, dim=-1)
    attention_weights = F.dropout(attention_weights, p=dropout_p, training=training)

    output = torch.matmul(attention_weights, V)
    final_output = torch.matmul(output, W_o)
    final_output = F.dropout(final_output, p=dropout_p, training=training)

    return final_output

In [3]:
## Implemented using the LayerNorm notebook in the LayerNorm directory
def residualPlusLayerNorm(attention_output, input_embeddings, gamma, beta, eps = 1e-5,):
    """Add a residual connection, then layer-normalise over the last dimension.

    Args:
        attention_output: Output of the sub-layer being wrapped.
        input_embeddings: Input of that sub-layer (the residual branch).
        gamma: Scale applied to the normalised values.
        beta: Shift added after scaling.
        eps: Constant added to the variance before the square root.

    Returns:
        Tensor with the shape of ``attention_output + input_embeddings``.
    """
    summed = attention_output + input_embeddings

    # Statistics are taken per row over the last axis; the variance is the
    # population variance (unbiased=False).
    centered = summed - summed.mean(dim=-1, keepdim=True)
    spread = (summed.var(dim=-1, keepdim=True, unbiased=False) + eps).sqrt()

    return gamma * (centered / spread) + beta

In [None]:
## Implemented using the FeedForwardNetwork notebook in the Transformer directory
def feedForwardNetwork(layernorm_output, W_ff1, b_ff1, W_ff2, b_ff2, dropout_p=0.2, training=True):
    """Two-layer position-wise feed-forward network with GELU and dropout.

    Computes ``dropout(dropout(gelu(x @ W_ff1 + b_ff1)) @ W_ff2 + b_ff2)``.

    Args:
        layernorm_output: Input tensor whose last dimension matches
            ``W_ff1.shape[0]``.
        W_ff1, b_ff1: Weight and bias of the first (expanding) layer.
        W_ff2, b_ff2: Weight and bias of the second (projecting) layer.
        dropout_p: Dropout probability. Defaults to 0.2, the value that was
            previously hard-coded.
        training: When False, dropout is skipped and the result is
            deterministic. Defaults to True (the previous behaviour).

    Returns:
        Tensor whose last dimension is ``W_ff2.shape[1]``.
    """
    ffn_layer1 = torch.matmul(layernorm_output, W_ff1) + b_ff1
    ffn_layer1_activated = F.gelu(ffn_layer1)
    ffn_layer1_activated = F.dropout(ffn_layer1_activated, p=dropout_p, training=training)

    ffn_layer2 = torch.matmul(ffn_layer1_activated, W_ff2) + b_ff2
    ffn_layer2 = F.dropout(ffn_layer2, p=dropout_p, training=training)
    return ffn_layer2

In [5]:
## Whitespace tokenization keeps the example simple; a real pipeline would use a
## subword tokenizer such as BPE or WordPiece.
sentence = "The quick brown fox jumps over the lazy dog".split()
n = len(sentence)

print(f"Tokenized sentence: {sentence}", f"Number of tokens: {n}", sep="\n")

Tokenized sentence: ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
Number of tokens: 9


In [6]:
## Toy vocabulary: special tokens first, then the sentence tokens, then a few extra
## words. A realistic vocabulary would be much larger.
vocab = list(dict.fromkeys([
    "<PAD>", "<START>", "<END>", "<UNK>",
    *sentence,
    "cat", "runs", "fast", "slowly", "and", "the", "a", "is", "are", "was", "were",
]))  # dict.fromkeys drops duplicates while keeping first-seen order
vocab_size = len(vocab)

print(f"Vocabulary size: {vocab_size}", f"Vocabulary: {vocab}", sep="\n")

Vocabulary size: 23
Vocabulary: ['<PAD>', '<START>', '<END>', '<UNK>', 'The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', 'cat', 'runs', 'fast', 'slowly', 'and', 'a', 'is', 'are', 'was', 'were']


In [7]:
## Lookup tables between tokens and integer ids, so text can be represented as integers
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = {position: token for token, position in word_to_idx.items()}

print(
    f"Word to index mapping: {word_to_idx}",
    f"Index to word mapping: {idx_to_word}",
    sep="\n",
)

Word to index mapping: {'<PAD>': 0, '<START>': 1, '<END>': 2, '<UNK>': 3, 'The': 4, 'quick': 5, 'brown': 6, 'fox': 7, 'jumps': 8, 'over': 9, 'the': 10, 'lazy': 11, 'dog': 12, 'cat': 13, 'runs': 14, 'fast': 15, 'slowly': 16, 'and': 17, 'a': 18, 'is': 19, 'are': 20, 'was': 21, 'were': 22}
Index to word mapping: {0: '<PAD>', 1: '<START>', 2: '<END>', 3: '<UNK>', 4: 'The', 5: 'quick', 6: 'brown', 7: 'fox', 8: 'jumps', 9: 'over', 10: 'the', 11: 'lazy', 12: 'dog', 13: 'cat', 14: 'runs', 15: 'fast', 16: 'slowly', 17: 'and', 18: 'a', 19: 'is', 20: 'are', 21: 'was', 22: 'were'}


In [8]:
## Convert the sentence to a list of indices.
## Words that are not in the vocabulary fall back to the <UNK> index instead of raising KeyError.
input_indices = [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in sentence]

print(f"Input indices: {input_indices}")

Input indices: [4, 5, 6, 7, 8, 9, 10, 11, 12]


In [9]:
## Model hyperparameters
d_model, d_k, hidden_dim = (
    8,   # embedding dimension
    6,   # attention dimension
    16,  # feed forward network hidden dimension
)

In [10]:
## Toy embedding table: one d_model-dimensional row per vocabulary entry, shape
## (vocab_size, d_model). In practice it would be learned or loaded from a pre-trained model.
torch.manual_seed(42)  # fixed seed so the random draws below are repeatable
embedding_matrix = 0.3 * torch.randn(vocab_size, d_model)

print(
    f"Embedding matrix shape: {embedding_matrix.shape}",
    f"Embedding matrix: {embedding_matrix}",
    sep="\n",
)


Embedding matrix shape: torch.Size([23, 8])
Embedding matrix: tensor([[ 5.7807e-01,  4.4619e-01,  2.7022e-01, -6.3166e-01,  2.0353e-01,
         -3.7036e-01, -1.2920e-02, -4.8140e-01],
        [-2.2564e-01,  4.9462e-01, -1.1774e-01, -4.2108e-01, -2.1836e-01,
         -1.6783e-01, -2.3065e-01,  2.2873e-01],
        [ 4.9270e-01, -4.7879e-02, -1.4922e-01,  1.3188e-01, -2.2744e-01,
          3.2350e-01,  2.4024e-01,  5.0419e-01],
        [ 3.8374e-01,  3.8893e-01,  1.8314e-01,  4.0042e-01, -6.9487e-02,
          1.2528e-02, -7.5473e-02,  2.5796e-01],
        [-4.1540e-01, -2.6137e-01, -6.7010e-02,  5.1521e-01,  9.5664e-02,
         -1.2736e-01,  9.1716e-02, -2.3238e-01],
        [-4.6727e-01,  2.9869e-01, -2.6394e-01, -1.8034e-01, -3.8225e-01,
          6.3684e-01, -3.7040e-01, -1.4637e-01],
        [-2.7415e-01, -1.9744e-01,  2.3407e-02,  1.5774e-01, -1.4640e-01,
          3.5741e-01, -2.4420e-01, -2.2080e-01],
        [-4.2097e-01,  1.0801e-02, -1.9043e-02,  2.0268e-01, -2.9342e-02,
   

In [11]:
## Look up one embedding row per input token id
input_embeddings = embedding_matrix[torch.tensor(input_indices, dtype=torch.long)]

print(
    f"Input embeddings shape: {input_embeddings.shape}",
    f"Input embeddings: {input_embeddings}",
    sep="\n",
)

Input embeddings shape: torch.Size([9, 8])
Input embeddings: tensor([[-0.4154, -0.2614, -0.0670,  0.5152,  0.0957, -0.1274,  0.0917, -0.2324],
        [-0.4673,  0.2987, -0.2639, -0.1803, -0.3822,  0.6368, -0.3704, -0.1464],
        [-0.2741, -0.1974,  0.0234,  0.1577, -0.1464,  0.3574, -0.2442, -0.2208],
        [-0.4210,  0.0108, -0.0190,  0.2027, -0.0293,  0.5534, -0.3554,  0.4151],
        [ 0.4335,  0.2569,  0.6654,  0.1569,  0.1040, -0.0592, -0.3164,  0.3834],
        [-0.0517,  0.1571,  0.0170,  0.1279,  0.1725, -0.1925, -0.6619, -0.2252],
        [ 0.0033, -0.1016, -0.4022, -0.1756,  0.1609,  0.1574,  0.3424,  0.0155],
        [ 0.2232, -0.1445, -0.3148,  0.1812, -0.5167, -0.2483,  0.4004,  0.1451],
        [-0.7529,  0.1464,  0.2354,  0.0086,  0.1922,  0.1750,  0.3201, -0.1350]])


In [12]:
## Sample positional embeddings, ideally these would be learned during training
positional_embeddings = torch.randn(n, d_model) * 0.2

## Rebuild the token embeddings from the embedding matrix instead of reading back
## `input_embeddings`, so re-running this cell does not add the positions (and dropout) twice.
token_embeddings = embedding_matrix[input_indices]
dropout = nn.Dropout(p=0.2)
input_embeddings = dropout(token_embeddings + positional_embeddings)


print(f"Input embeddings with positional encoding shape: {input_embeddings.shape}")
print(f"Input embeddings with positional encoding: {input_embeddings}")

Input embeddings with positional encoding shape: torch.Size([9, 8])
Input embeddings with positional encoding: tensor([[-0.5591, -0.4329,  0.1523,  0.5978,  0.3848, -0.1071, -0.0298, -0.2091],
        [-0.5186,  0.1834, -0.8415, -0.6078, -0.3766,  0.9540, -0.3849, -0.1913],
        [-0.0169, -0.0000,  0.3128,  0.1083, -0.0000,  0.9466, -0.0000, -0.0000],
        [-0.5209, -0.1938, -0.2940,  0.0000,  0.0901,  0.0000, -0.3332,  0.3378],
        [ 0.4266,  0.3052,  0.4901,  0.2786, -0.1157,  0.0014, -0.3508,  0.4469],
        [-0.4584,  0.7591,  0.2715,  0.5009,  0.3740, -0.1394, -0.7420, -0.3369],
        [ 0.0473,  0.1358, -0.5009, -0.2389,  0.3617,  0.3403,  0.5746,  0.0147],
        [-0.0000, -0.0241, -1.0398,  0.2205, -0.0000, -0.0000,  0.0000,  0.0000],
        [-0.6428,  0.6673,  0.0000,  0.2560,  0.3439,  0.5079,  0.4674, -0.0000]])


In [13]:
## Projection matrices for single-head self-attention.
## Drawn in the order W_q, W_k, W_v, W_o so the random stream is consumed in a fixed order.
W_q, W_k, W_v = (0.3 * torch.randn(d_model, d_k) for _ in range(3))  # each of shape (d_model, d_k)
W_o = 0.3 * torch.randn(d_k, d_model)  # shape (d_k, d_model)

In [14]:
## LayerNorm scale (gamma) and shift (beta), each of shape (d_model,):
## pair 1 is used after attention, pair 2 after the feed forward network.
gamma1, beta1 = torch.ones(d_model), torch.zeros(d_model)
gamma2, beta2 = torch.ones(d_model), torch.zeros(d_model)

In [15]:
## Feed forward network parameters: expand d_model -> hidden_dim, then project back
W_ff1 = torch.randn(d_model, hidden_dim).mul(0.3)  # (d_model, hidden_dim)
b_ff1 = torch.randn(hidden_dim).mul(0.3)           # (hidden_dim,)
W_ff2 = torch.randn(hidden_dim, d_model).mul(0.3)  # (hidden_dim, d_model)
b_ff2 = torch.randn(d_model).mul(0.3)              # (d_model,)

In [16]:
## Causal self-attention over the position-aware input embeddings
attention_output = selfAttention(
    input_embeddings=input_embeddings, W_q=W_q, W_k=W_k, W_v=W_v, W_o=W_o
)

print(
    f"Attention output shape: {attention_output.shape}",
    f"Attention output: {attention_output}",
    sep="\n",
)

Attention output shape: torch.Size([9, 8])
Attention output: tensor([[ 0.1218, -0.2799,  0.8330, -0.0000,  0.2833,  0.6572, -0.4778,  0.0000],
        [-0.0373,  0.3466,  0.3091,  0.2591,  0.5623,  0.0320,  0.4537, -0.1644],
        [ 0.0000,  0.1883,  0.3220,  0.2420,  0.0000,  0.0789,  0.6034, -0.0000],
        [ 0.0086,  0.1428,  0.3652,  0.1505,  0.3699,  0.1306,  0.0000, -0.1026],
        [ 0.0225,  0.0891,  0.0000,  0.1145,  0.2571,  0.1018,  0.2894, -0.1312],
        [ 0.0887, -0.0000,  0.0000,  0.0152,  0.0965,  0.0000,  0.0629, -0.1278],
        [ 0.0000,  0.0571,  0.0000,  0.0839,  0.1720,  0.0666,  0.2225, -0.1060],
        [-0.0000,  0.1464,  0.1313,  0.1089,  0.0000,  0.0000,  0.2304, -0.0000],
        [ 0.0405,  0.0428,  0.1954, -0.0017,  0.1252,  0.1464, -0.0161, -0.0000]])


In [17]:
## First residual connection + LayerNorm, wrapping the attention sub-layer
layernorm_output1 = residualPlusLayerNorm(
    attention_output=attention_output,
    input_embeddings=input_embeddings,
    gamma=gamma1,
    beta=beta1,
)

print(
    f"LayerNorm output shape: {layernorm_output1.shape}",
    f"LayerNorm output: {layernorm_output1}",
    sep="\n",
)

LayerNorm output shape: torch.Size([9, 8])
LayerNorm output: tensor([[-0.9096, -1.3619,  1.4257,  0.7895,  0.9050,  0.7112, -1.0250, -0.5350],
        [-1.0714,  1.0317, -1.0256, -0.6699,  0.3650,  1.9149,  0.1387, -0.6835],
        [-1.0288, -0.4504,  0.8077,  0.0060, -0.9812,  1.9087,  0.7191, -0.9812],
        [-1.8151, -0.2389,  0.1787,  0.4498,  1.5076,  0.3819, -1.2032,  0.7392],
        [ 0.9360,  0.6353,  1.1599,  0.6293, -0.7486, -0.9580, -1.8591,  0.2053],
        [-0.8402,  1.4437,  0.4572,  0.9520,  0.8597, -0.3742, -1.4660, -1.0323],
        [-0.2729,  0.1001, -1.6771, -0.7911,  0.9731,  0.6481,  1.6478, -0.6280],
        [ 0.0802,  0.4268, -2.4951,  1.0139,  0.0802,  0.0802,  0.7334,  0.0802],
        [-2.1949,  1.1207, -0.1796, -0.0309,  0.5118,  0.9795,  0.4667, -0.6733]])


In [18]:
## Feed forward sub-layer applied to the normalised attention output
feedforward_output = feedForwardNetwork(
    layernorm_output=layernorm_output1,
    W_ff1=W_ff1,
    b_ff1=b_ff1,
    W_ff2=W_ff2,
    b_ff2=b_ff2,
)

print(
    f"Feed Forward Network output shape: {feedforward_output.shape}",
    f"Feed Forward Network output: {feedforward_output}",
    sep="\n",
)

Feed Forward Network output shape: torch.Size([9, 8])
Feed Forward Network output: tensor([[ 0.0000, -1.4984,  0.1647,  0.0000, -2.2981, -0.9139, -0.5148, -0.2643],
        [ 0.0495, -0.0000, -1.8511,  1.5845,  1.0706,  0.7957, -0.8447, -1.8805],
        [ 0.0000, -0.3541, -0.6420, -0.3679, -0.3375, -0.0000, -1.6751, -0.5623],
        [ 1.2290, -1.4242, -1.2835,  0.7632, -1.7395,  0.8066, -0.0000, -0.0000],
        [-0.3013, -1.5876, -0.0000,  0.0955, -0.2977,  0.7533, -0.0859, -0.3359],
        [ 2.1650, -1.7628, -0.4042,  0.0000, -2.2275,  1.1990, -0.6661, -0.6499],
        [ 1.2242, -1.2095, -2.8590,  4.0536,  0.0400,  4.1343, -1.6785, -2.2926],
        [ 0.0000, -0.0000, -2.2299,  0.5013,  0.0000,  1.2136, -0.6072, -0.0000],
        [ 0.8520, -0.2528, -2.0101,  2.3380, -0.6107,  2.5460, -1.2218, -1.3421]])


In [19]:
## Second residual connection + LayerNorm, wrapping the feed forward sub-layer.
## The sub-layer output goes in the first argument and its input in the second.
layernorm_output2 = residualPlusLayerNorm(
    attention_output=feedforward_output,
    input_embeddings=layernorm_output1,
    gamma=gamma2,
    beta=beta2,
)

print(
    f"Final LayerNorm output shape: {layernorm_output2.shape}",
    f"Final LayerNorm output: {layernorm_output2}",
    sep="\n",
)

Final LayerNorm output shape: torch.Size([9, 8])
Final LayerNorm output: tensor([[-0.1870, -1.6827,  1.7297,  1.1157, -0.5578,  0.3549, -0.6702, -0.1025],
        [-0.4781,  0.6284, -1.4775,  0.5653,  0.8460,  1.5330, -0.3079, -1.3090],
        [-0.5173, -0.3010,  0.6345,  0.1258, -0.7968,  2.3154, -0.4471, -1.0136],
        [-0.3599, -1.3797, -0.8510,  1.3437, -0.0245,  1.3205, -0.9442,  0.8951],
        [ 0.8694, -0.7450,  1.4038,  0.9610, -0.8406,  0.0155, -1.7549,  0.0909],
        [ 1.3209, -0.0211,  0.2827,  1.0166, -0.8771,  0.9128, -1.5011, -1.1338],
        [ 0.2707, -0.4493, -1.6466,  1.0782,  0.2923,  1.6092, -0.0724, -1.0821],
        [ 0.1215,  0.3124, -2.5259,  0.9121,  0.1215,  0.7901,  0.1468,  0.1215],
        [-0.7159,  0.4308, -1.1551,  1.1773, -0.0706,  1.8093, -0.4111, -1.0648]])


In [20]:
## Language model head: a final linear layer projecting d_model features to vocab_size logits
W_lm_head = 0.3 * torch.randn(d_model, vocab_size)  # (d_model, vocab_size)
b_lm_head = 0.1 * torch.randn(vocab_size)           # (vocab_size,)

print(
    f"LM head weights shape: {W_lm_head.shape}",
    f"LM head bias shape: {b_lm_head.shape}",
    sep="\n",
)

LM head weights shape: torch.Size([8, 23])
LM head bias shape: torch.Size([23])


In [21]:
## Project the final hidden states through the LM head: one logit per vocabulary
## entry for every position, shape (n, vocab_size)
logits = layernorm_output2 @ W_lm_head + b_lm_head

print(f"Logits shape: {logits.shape}")

Logits shape: torch.Size([9, 23])


In [22]:
## Softmax over the vocabulary axis turns each row of logits into probabilities:
## row i holds the next-token distribution given the tokens up to position i.
probabilities = logits.softmax(dim=-1)

print(
    f"Probabilities shape: {probabilities.shape}",
    f"Probabilities: {probabilities}",
    sep="\n",
)

Probabilities shape: torch.Size([9, 23])
Probabilities: tensor([[0.0557, 0.0177, 0.1232, 0.0408, 0.0224, 0.2360, 0.0191, 0.0245, 0.0175,
         0.0056, 0.0596, 0.0256, 0.0242, 0.0069, 0.0076, 0.0331, 0.1046, 0.0062,
         0.0231, 0.0837, 0.0228, 0.0242, 0.0159],
        [0.0167, 0.0356, 0.0039, 0.0197, 0.0758, 0.0092, 0.0298, 0.0162, 0.0780,
         0.0367, 0.0536, 0.0316, 0.0746, 0.0271, 0.0418, 0.1728, 0.0055, 0.0463,
         0.0384, 0.1243, 0.0313, 0.0226, 0.0086],
        [0.0111, 0.0597, 0.0132, 0.0144, 0.0674, 0.0825, 0.0382, 0.0328, 0.0195,
         0.0055, 0.0270, 0.0276, 0.1094, 0.0096, 0.0116, 0.1526, 0.0677, 0.0101,
         0.0648, 0.0806, 0.0609, 0.0216, 0.0121],
        [0.0669, 0.0753, 0.0348, 0.0370, 0.0295, 0.0854, 0.0111, 0.0192, 0.0655,
         0.0220, 0.0802, 0.0357, 0.0408, 0.0232, 0.1192, 0.0492, 0.0503, 0.0253,
         0.0085, 0.0701, 0.0252, 0.0140, 0.0117],
        [0.0385, 0.0157, 0.0631, 0.0350, 0.0303, 0.2713, 0.0361, 0.0351, 0.0211,
         0.0123

In [23]:
## Validate that each row sums to 1, i.e. each row is a probability distribution over the vocabulary
row_sum = probabilities.sum(dim=-1, keepdim=True)
assert torch.allclose(row_sum, torch.ones_like(row_sum), atol=1e-6), "each row of probabilities must sum to 1"
print(row_sum)

tensor([[1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000],
        [1.0000]])
