# Tiny Transformer Components in NumPy
This notebook implements the following components using Python and NumPy:
1. Token Embedding (Lookup table)
2. Scaled Dot-Product Attention (with Causal Mask)
3. Layer Normalization
4. Dropout
5. Feed-Forward Neural Network
6. Skip Connections
7. Output Layer (Vocab Projection)
8. Cross-Entropy Loss
9. Text Generation Strategies: Temperature, Top-k, Top-p Sampling


In [ ]:
import numpy as np
# Seed NumPy's global RNG so the random weights, dropout masks and samples
# produced by the cells below are reproducible from run to run.
np.random.seed(42)


In [ ]:
# 1. Token Embedding
# Toy vocabulary mapping token id -> token string; id 0 is the '<PAD>' entry.
vocab = {0: '<PAD>', 1: 'hello', 2: 'world', 3: 'good', 4: 'morning'}
vocab_size = len(vocab)  # number of rows in the embedding table
embedding_dim = 4  # length of each token vector
# Lookup table with one randomly initialised row per token id,
# shape (vocab_size, embedding_dim).
embedding_matrix = np.random.randn(vocab_size, embedding_dim)

def tokens_to_embeddings(tokens):
    """Look up the embedding vector for each token id.

    Indexes the module-level ``embedding_matrix`` with ``tokens``. For an
    integer array of ids the result holds one row of length
    ``embedding_dim`` per id, in the same order as ``tokens``.
    """
    return embedding_matrix[tokens]

def embeddings_to_tokens(embeddings):
    """Map vectors back to token ids by dot-product similarity.

    Every vector in ``embeddings`` is scored against each row of the
    module-level ``embedding_matrix``; the index of the highest-scoring row
    is returned along the last axis.
    """
    similarity = np.matmul(embeddings, embedding_matrix.T)
    return np.argmax(similarity, axis=-1)

# Test: embed the ids 1, 3, 4 ('hello', 'good', 'morning' in vocab) and
# print the resulting vectors.
input_tokens = np.array([1, 3, 4])
print('Embeddings:', tokens_to_embeddings(input_tokens))

In [ ]:
# 2. Scaled Dot-Product Attention with Causal Mask
def softmax(x):
    """Numerically stable softmax over the last axis of ``x``.

    The maximum along the last axis is subtracted before exponentiating.
    This does not change the mathematical result but keeps ``np.exp`` from
    overflowing on large inputs.
    """
    row_max = np.max(x, axis=-1, keepdims=True)
    unnormalised = np.exp(x - row_max)
    normaliser = np.sum(unnormalised, axis=-1, keepdims=True)
    return unnormalised / normaliser

def causal_mask(size):
    """Return a ``(size, size)`` lower-triangular matrix of ones.

    Entry ``[i, j]`` is 1.0 when ``j <= i`` (position ``i`` may look at
    position ``j``) and 0.0 otherwise.
    """
    # np.tri builds the float lower triangle (diagonal included) directly.
    return np.tri(size)

def scaled_dot_product_attention(Q, K, V):
    """Causal scaled dot-product attention.

    Computes ``softmax(Q K^T / sqrt(d_k)) V`` where position ``i`` may only
    attend to positions ``j <= i``.

    Args:
        Q: queries, shape ``(..., seq_len, d_k)``.
        K: keys, shape ``(..., seq_len, d_k)``.
        V: values, shape ``(..., seq_len, d_v)``.

    Returns:
        Array of shape ``(..., seq_len, d_v)``: for each position, the
        attention-weighted average of the values at that position and
        earlier ones.
    """
    d_k = Q.shape[-1]
    # Transpose only the last two axes. ``K.T`` reverses *every* axis, which
    # is wrong as soon as there is a leading batch or head dimension.
    scores = Q @ np.swapaxes(K, -1, -2) / np.sqrt(d_k)
    # The sequence axis is second-to-last; the (seq_len, seq_len) mask
    # broadcasts over any leading axes.
    mask = causal_mask(Q.shape[-2])
    # -inf scores become exactly zero weight after the softmax.
    scores = np.where(mask == 1, scores, -np.inf)
    weights = softmax(scores)
    return weights @ V

# Test: self-attention over the example embeddings. Queries, keys and values
# all alias the same array (no learned projections are applied here).
x = tokens_to_embeddings(input_tokens)
Q = K = V = x
print('Attention Output:', scaled_dot_product_attention(Q, K, V))

In [ ]:
# 3. Layer Normalization
def layer_norm(x, eps=1e-6):
    """Normalise the last axis of ``x`` to zero mean and roughly unit spread.

    ``eps`` is added to the standard deviation (not the variance), so a
    constant row is divided by ``eps`` rather than by zero. No learnable
    gain or bias is applied.
    """
    centred = x - np.mean(x, axis=-1, keepdims=True)
    spread = np.std(x, axis=-1, keepdims=True) + eps
    return centred / spread

# Test: normalise each row of the example embeddings x.
print('LayerNorm:', layer_norm(x))

In [ ]:
# 4. Dropout
def dropout(x, p=0.1):
    """Apply inverted dropout to ``x``.

    Each element is zeroed independently with probability ``p``; the
    survivors are scaled by ``1 / (1 - p)`` so the expected value of the
    output equals the input. Randomness comes from NumPy's global RNG.

    Args:
        x: NumPy array to mask.
        p: drop probability in ``[0, 1]``.

    Returns:
        Array with the same shape as ``x``.

    Raises:
        ValueError: if ``p`` is outside ``[0, 1]`` (including NaN).
    """
    if not 0.0 <= p <= 1.0:
        raise ValueError('dropout probability p must be in [0, 1], got %r' % (p,))
    if p == 1.0:
        # Everything is dropped. The general formula below would compute
        # 0 / 0 and return NaN instead of zeros.
        return np.zeros(x.shape, dtype=np.result_type(x.dtype, np.float32))
    # rand() is uniform on [0, 1), so ``>= p`` keeps an element with
    # probability exactly 1 - p; in particular p == 0 never drops anything.
    mask = (np.random.rand(*x.shape) >= p).astype(np.float32)
    return x * mask / (1 - p)

# Test: apply dropout with the default p=0.1 to x (draws from the global RNG).
print('Dropout:', dropout(x))

In [ ]:
# 5. Feed-Forward Neural Network
def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``."""
    floor = 0
    return np.maximum(floor, x)

def feed_forward(x, W1, b1, W2, b2):
    """Two-layer feed-forward network: affine -> ReLU -> affine.

    ``W1``/``b1`` map ``x`` to the hidden layer and ``W2``/``b2`` map the
    ReLU-activated hidden layer to the output.
    """
    hidden = relu(x @ W1 + b1)
    return hidden @ W2 + b2

# Initialize params
# The hidden layer has width 8 (twice embedding_dim); every weight and bias
# is drawn from a standard normal via the global RNG.
W1 = np.random.randn(embedding_dim, 8)
b1 = np.random.randn(8)
W2 = np.random.randn(8, embedding_dim)
b2 = np.random.randn(embedding_dim)

# Test: run the example embeddings x through the feed-forward network.
print('FFN:', feed_forward(x, W1, b1, W2, b2))

In [ ]:
# 6. Transformer Block with Skip Connections
def transformer_block(x, W_q, W_k, W_v, W1, b1, W2, b2):
    """Single-head transformer block with two residual (skip) connections.

    ``x`` is projected to queries, keys and values with ``W_q``, ``W_k`` and
    ``W_v`` and passed through causal attention. The attention output is
    added back to ``x`` and layer-normalised. The feed-forward network
    (``W1``, ``b1``, ``W2``, ``b2``) is then applied, added back, and
    layer-normalised again.
    """
    queries = x @ W_q
    keys = x @ W_k
    values = x @ W_v
    # Residual connection around attention, normalised after the addition.
    attended = layer_norm(x + scaled_dot_product_attention(queries, keys, values))
    # Residual connection around the feed-forward network, same pattern.
    return layer_norm(attended + feed_forward(attended, W1, b1, W2, b2))

# Initialize weights for block
# Square (embedding_dim x embedding_dim) query/key/value projections drawn
# from a standard normal, so Q, K and V keep the embedding width.
W_q = np.random.randn(embedding_dim, embedding_dim)
W_k = np.random.randn(embedding_dim, embedding_dim)
W_v = np.random.randn(embedding_dim, embedding_dim)

# Test: run x through one block, reusing the feed-forward params W1, b1, W2, b2.
print('Transformer Block:', transformer_block(x, W_q, W_k, W_v, W1, b1, W2, b2))

In [ ]:
# 7. Output Layer
def output_layer(x, W_out, b_out):
    """Affine projection of ``x``: ``x @ W_out + b_out``."""
    projected = x @ W_out
    logits = projected + b_out
    return logits

# Initialize
# Projection from embedding_dim to one logit per vocabulary entry,
# with a per-token bias; both drawn from a standard normal.
W_out = np.random.randn(embedding_dim, vocab_size)
b_out = np.random.randn(vocab_size)

# Test: project the example embeddings x to vocabulary logits.
print('Logits:', output_layer(x, W_out, b_out))

In [ ]:
# 8. Cross-Entropy Loss
def cross_entropy_loss(logits, targets):
    """Mean cross-entropy between ``logits`` and integer class ``targets``.

    Args:
        logits: array of shape ``(n, num_classes)`` holding unnormalised
            scores.
        targets: length-``n`` sequence of integer class ids, one per row of
            ``logits``.

    Returns:
        Scalar mean negative log-probability assigned to the target classes.
    """
    logits = np.asarray(logits)
    # Compute log-softmax directly with the log-sum-exp trick. Taking
    # ``log(softmax(...))`` underflows to ``log(0) = -inf`` when a target's
    # probability is tiny, which makes the whole loss infinite.
    shifted = logits - np.max(logits, axis=-1, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=-1, keepdims=True))
    log_probs = shifted - log_norm
    # Pick, for every row i, the log-probability of class targets[i].
    picked = log_probs[np.arange(len(targets)), targets]
    return np.mean(-picked)

# Test: one target id per position. The targets are input_tokens shifted left
# by one with id 2 ('world') appended.
targets = np.array([3, 4, 2])
print('Loss:', cross_entropy_loss(output_layer(x, W_out, b_out), targets))

In [ ]:
# 9. Text Generation Strategies
def temperature_sampling(logits, T=1.0):
    """Sample a token id from temperature-scaled logits.

    Args:
        logits: 1-D array of unnormalised scores, one per token id.
        T: temperature. Values below 1 sharpen the distribution, values
            above 1 flatten it, and ``T == 0`` selects the highest-scoring
            token deterministically.

    Returns:
        The sampled index into ``logits``.

    Raises:
        ValueError: if ``T`` is negative.
    """
    if T < 0:
        raise ValueError('temperature T must be non-negative, got %r' % (T,))
    if T == 0:
        # As T -> 0+ all probability mass moves to the largest logit.
        # Dividing by zero would instead yield inf/NaN probabilities and
        # make np.random.choice fail.
        return int(np.argmax(logits))
    scaled = logits / T
    return np.random.choice(len(logits), p=softmax(scaled))

def top_k_sampling(logits, k=2):
    """Sample a token id from the ``k`` highest-scoring logits.

    The ``k`` largest logits are renormalised with a softmax and one of
    their indices is drawn from that distribution. If ``k`` exceeds the
    number of logits, all of them are candidates.

    Args:
        logits: 1-D array of unnormalised scores, one per token id.
        k: number of top candidates to keep; must be at least 1.

    Returns:
        The sampled index into ``logits``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        # ``[-0:]`` slices the *whole* array and a negative k slices from the
        # front, so without this check the filter is silently bypassed or
        # selects an unintended subset.
        raise ValueError('k must be at least 1, got %r' % (k,))
    # argsort is ascending, so the last k entries are the k largest logits.
    idx = np.argsort(logits)[-k:]
    probs = softmax(logits[idx])
    return np.random.choice(idx, p=probs)

def top_p_sampling(logits, p=0.9):
    """Nucleus (top-p) sampling from ``logits``.

    Tokens are ranked by descending logit. The shortest prefix whose
    cumulative softmax probability reaches ``p`` is kept, and one of its
    indices is drawn from the softmax over just those logits.
    """
    order = np.argsort(logits)[::-1]
    cumulative = np.cumsum(softmax(logits[order]))
    # First rank at which the running total is >= p; that token is included.
    last = np.searchsorted(cumulative, p)
    nucleus = order[:last + 1]
    return np.random.choice(nucleus, p=softmax(logits[nucleus]))

# Example: compute logits for x and sample one token id from the logits of
# the first position with the default temperature (T=1.0).
logits = output_layer(x, W_out, b_out)
print('Temp sample:', temperature_sampling(logits[0]))  # sample token