In [1]:
import numpy as np

# Tokenisation

Tokenization is the process of converting text into smaller units (tokens) that a model can process. The type of tokenization used depends on the model and application:

**1. Word Tokenization:**
- Splits text into words based on spaces and punctuation.
- Example: "The cat sat." → ["The", "cat", "sat", "."]
- Simple but limited (out-of-vocabulary words cause issues).

**2. Character Tokenization:**
- Breaks text into individual characters.
- Example: "Hello" → ["H", "e", "l", "l", "o"]
- Good for languages with complex morphology (e.g., Chinese) but inefficient for long texts.

**3. Byte Pair Encoding (BPE) Tokenization:**
- A subword tokenization method.
- Common words are kept as full tokens, while rare words are split into smaller subword units.
- Example: "unhappiness" → ["un", "happiness"] (if "happiness" is frequent)
- GPT-2 uses BPE tokenization, balancing vocabulary size and efficiency.

In [None]:
import re
from collections import defaultdict, Counter

In [3]:
def word_tokenize(text):
    return text.split()

text = 'The cat is unhappy'
word_tokenize(text)

['The', 'cat', 'is', 'unhappy']

In [5]:
def char_tokenize(text):
    return list(text)

text = 'The cat is unhappy'
char_tokenize(text)

['T',
 'h',
 'e',
 ' ',
 'c',
 'a',
 't',
 ' ',
 'i',
 's',
 ' ',
 'u',
 'n',
 'h',
 'a',
 'p',
 'p',
 'y']

In [9]:
def buid_vocab(text):
    words = text.split()  # Split by spaces
    words+=["UNK"]  # Add UNK token
    vocab = sorted(set(words))  # Unique words sorted
    word_to_id = {word: idx for idx, word in enumerate(vocab)} # create a dict
    id_to_word = {idx: word for word, idx in word_to_id.items()} # reverse the dict
    return word_to_id, id_to_word

def encode(text, word_to_id):
    words = text.split()  # Split by spaces
    unknown_id = word_to_id.get("UNK")
    return [word_to_id.get(w, unknown_id) for w in words]

def decode(ids, id_to_word):
    return [id_to_word.get(i, "UNK") for i in ids]
    

In [10]:
d1, d2 = buid_vocab("Hello, How are you")
print(d1)
print(d2)

{'Hello,': 0, 'How': 1, 'UNK': 2, 'are': 3, 'you': 4}
{0: 'Hello,', 1: 'How', 2: 'UNK', 3: 'are', 4: 'you'}


In [11]:
encode("Hello, are you George ?", d1)

[0, 3, 4, 2, 2]

In [12]:
decode([0, 3, 4, 2, 2], d2)

['Hello,', 'are', 'you', 'UNK', 'UNK']

# Word Embeddings + Positional Embeddings

In [14]:
# Example usage:
vocab_size = 10000
embed_dim = 768  
max_seq_len = 1024

In [17]:
word_embeddings = np.random.randn(vocab_size, embed_dim)
word_embeddings.shape

(10000, 768)

In [18]:
positional_embeddings = np.random.randn(max_seq_len, embed_dim)
positional_embeddings.shape

(1024, 768)

In [22]:
input_text = "Hello, how are you doing"
input_ids = encode(input_text, d1)
seq_len = len(input_ids)

word_emb = word_embeddings[input_ids]
pos_emb = positional_embeddings[:seq_len]

print(word_emb.shape)
print(pos_emb.shape)

final_emb = word_emb + pos_emb
final_emb

(5, 768)
(5, 768)


array([[-0.61887313,  0.09400312, -3.32888607, ..., -0.21328245,
         0.57585842, -0.00361055],
       [-0.55717445, -2.77855797, -0.98569963, ..., -1.24417084,
        -0.41394384,  3.50313267],
       [ 1.56149326,  1.90120072, -0.33256547, ..., -0.98567718,
        -0.37131668,  0.12742174],
       [-0.33594186, -1.47447108,  2.3595145 , ...,  1.75077137,
         0.91849738,  0.17863543],
       [-1.93755778, -0.12762102,  0.20301741, ..., -0.63949476,
        -1.55418361,  1.94200523]])

# Multi-Head Self-Attention (MHA) with Causal Mask

Multi-Head Self-Attention (MHA) allows the model to learn different relationships between words. GPT-2 uses *causal attention*, meaning each token can only attend to past tokens (not future ones).

Here's a breakdown of how it works:

1. **Linear Projections:**
   - The input embeddings are transformed into three matrices: Query (Q), Key (K), and Value (V).  These are created by multiplying the input embeddings by learned weight matrices.

2. **Scaled Dot-Product Attention:**
   - Attention scores are calculated using the following formula:

     $$
     Attention(Q, K, V) = softmax( (QKᵀ / √dₖ) + mask ) V
     $$

     Where:
     - `Q`, `K`, and `V` are the Query, Key, and Value matrices.
     - `dₖ` is the dimension of the Key vectors.  Scaling by the square root of `dₖ` helps stabilize training.
     - `mask` is the causal mask.

3. **Causal Mask:**
   - The mask is crucial for causal attention. It prevents each token from attending to future tokens.  The mask effectively sets the attention scores for future tokens to a very large negative number (often -∞), which, after the softmax operation, makes their attention weights close to zero.

4. **Multi-Head Processing:**
   - The Q, K, and V matrices are split into multiple "heads." Each head performs the scaled dot-product attention calculation independently. This allows the model to capture different types of relationships between words in parallel.

5. **Final Linear Projection:**
   - The outputs from all the heads are concatenated.  This concatenated output is then projected back to the original embedding dimension using another learned weight matrix.  This final projection combines the information learned by the different attention heads.

In [23]:
x = final_emb
x.shape

(5, 768)

In [55]:
def get_attention(Q, K, V, mask):
    attn_scores = (Q @ K.T) / np.sqrt(Q.shape[1]) + mask
    attn_scores = np.exp(attn_scores) / np.sum(np.exp(attn_scores), axis=-1, keepdims=True)
    return attn_scores @ V

In [None]:
# Example Usage:
# assert embed_dim % num_heads == 0, "K must be divisible by number of heads"
embed_dim = 768
num_heads = 12
head_dim = embed_dim // num_heads

seq_len = x.shape[0] # length if input sequqnce. (number of tokens in input)

W_q = np.random.randn(embed_dim, embed_dim) 
W_k = np.random.randn(embed_dim, embed_dim)
W_v = np.random.randn(embed_dim, embed_dim) 
W_o = np.random.randn(embed_dim, embed_dim) 

# x: (seq_len, embed_dim) - Input embeddings
# Compute Q, K, V
Q = x @ W_q  # Shape: (seq_len, embed_dim)
K = x @ W_k
V = x @ W_v

# Compute Q, K, V
print(Q.shape)
print(K.shape)
print(V.shape)

causal_mask = np.triu(np.ones((seq_len, seq_len)) * -1e9, k=1)
# print(causal_mask)

Q = Q.reshape(seq_len, num_heads, head_dim)
K = K.reshape(seq_len, num_heads, head_dim)
V = V.reshape(seq_len, num_heads, head_dim)
print(Q.shape) # Now these are 12 matrices of size 5x64. (num_heads matrices of dimension seq_len x  head_dim)
print(K.shape)
print(V.shape)

# Compute attention for each head
attention_outputs = np.stack([   get_attention(Q[:,i,:], K[:,i,:], V[:,i,:], causal_mask) for i in range(num_heads)  ])
print(attention_outputs.shape)
output = attention_outputs.transpose(1, 0, 2).reshape(seq_len, embed_dim)
print(output.shape)

output = output @ W_o

print(output.shape)  # Should be (seq_len, embed_dim)


(5, 768)
(5, 768)
(5, 768)
(5, 12, 64)
(5, 12, 64)
(5, 12, 64)
(12, 5, 64)
(5, 768)
(5, 768)


  attn_scores = np.exp(attn_scores) / np.sum(np.exp(attn_scores), axis=-1, keepdims=True)
  attn_scores = np.exp(attn_scores) / np.sum(np.exp(attn_scores), axis=-1, keepdims=True)


# Layer Norm + Residual Connections + Fully Connected NN

In [61]:
def layer_norm(x, gamma, beta, eps=1e-5):
    mean = np.mean(x, axis=-1, keepdims=True)
    variance = np.var(x, axis=-1, keepdims=True)
    return gamma * (x - mean) / np.sqrt(variance + eps) + beta

def gelu(x):
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

def linear(x, W, b):
    return x @ W + b

def feed_forward(x, W1, b1, W2, b2):
    return linear(gelu(linear(x, W1, b1)), W2, b2)

In [62]:
# Parameters (learned during training)
gamma = np.ones(embed_dim)  # Scale parameter for LayerNorm
beta = np.zeros(embed_dim)  # Shift parameter for LayerNorm

W1 = np.random.randn(embed_dim, 4 * embed_dim)  # First FC layer weights
b1 = np.zeros(4 * embed_dim)  # First FC layer bias
W2 = np.random.randn(4 * embed_dim, embed_dim)  # Second FC layer weights
b2 = np.zeros(embed_dim)  # Second FC layer bias

In [63]:
# Residual Connection after Multi-Head Attention
print(x.shape) # input_embeddings  # (seq_len, embed_dim)
print(output.shape) # attention_output = multi_head_attention(x)  # (seq_len, embed_dim)

(5, 768)
(5, 768)


In [69]:
output_residual_plus_layer_norm = layer_norm(x + output, gamma, beta)  # Apply residual + LayerNorm 
output_residual_plus_layer_norm.shape # (seq_len, embed_dim)

(5, 768)

In [71]:
ffn_output = feed_forward(output_residual_plus_layer_norm, W1, b1, W2, b2)  # (seq_len, embed_dim)
ffn_output.shape

(5, 768)

In [73]:
# Residual Connection after Feed-Forward Network
output_another_layer_norm = layer_norm(x + ffn_output, gamma, beta)
output_another_layer_norm.shape

(5, 768)

In [77]:
H = output_another_layer_norm

In [79]:
logits = H @ word_embeddings.T 
logits.shape # Shape: (seq_len, vocab_size)

(5, 10000)

In [80]:
probs = np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
probs

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [81]:
next_token = np.argmax(probs[-1])
next_token

0