In [None]:
from typing import TypeAlias
import numpy as np 
import numpy.typing

len_seq = 1024
n_head = 2
d_model = 64
d_in = 128
d_h = d_model // n_head

rng = np.random.default_rng(0x2025_07_13)

Tensor32: TypeAlias = numpy.typing.NDArray[np.float32]

class MultiHeadTransformer:
    """Stack of multi-head self-attention blocks with residual connections.

    Every layer projects its input to queries, keys and values, runs scaled
    dot-product attention independently per head, merges the heads through an
    output projection and adds the result back onto the layer input.  No
    attention mask, feed-forward sub-layer or normalisation is applied.

    Weights are drawn from a standard normal distribution and stored as
    float32 arrays whose leading axis indexes the layer.
    """

    def __init__(self, d_model: int = 768, n_head: int = 12, n_layer: int = 12, seed: int | None = None):
        """Create the model and randomly initialise all projection weights.

        Args:
            d_model: Width of the token representations; must be divisible
                by ``n_head``.
            n_head: Number of attention heads per layer.
            n_layer: Number of stacked attention blocks.
            seed: Seed for the weight generator; ``None`` lets NumPy pick
                fresh entropy.

        Raises:
            AssertionError: If ``d_model`` is not divisible by ``n_head``.
        """
        self.seed = seed
        self.rng = np.random.default_rng(self.seed)

        self.d_model = d_model
        self.n_head = n_head
        assert d_model % n_head == 0, "model dim must be divisible by number of heads"
        self.d_h = d_model // n_head

        self.n_layer = n_layer

        # The draw order (K, V, Q, O) determines which random numbers each
        # matrix receives for a given seed, so it must not be reordered.
        in_shape = (self.n_layer, self.d_model, self.n_head * self.d_h)
        out_shape = (self.n_layer, self.n_head * self.d_h, self.d_model)
        # Key Weights
        self.W_K: Tensor32 = self._init_weight(in_shape)
        # Value Weights
        self.W_V: Tensor32 = self._init_weight(in_shape)
        # Query Weights
        self.W_Q: Tensor32 = self._init_weight(in_shape)
        # Output Weights
        self.W_O: Tensor32 = self._init_weight(out_shape)

    def _init_weight(self, shape: tuple[int, ...]) -> Tensor32:
        """Draw a standard-normal float32 array of ``shape`` from ``self.rng``."""
        return self.rng.normal(size=shape).astype(np.float32)

    def _split_heads(self, projected: Tensor32) -> Tensor32:
        """Rearrange ``(len_seq, n_head * d_h)`` into ``(n_head, len_seq, d_h)``.

        Each token row is first cut into ``n_head`` chunks of ``d_h`` features
        and the head axis is then moved to the front.  Reshaping straight to
        ``(n_head, len_seq, d_h)`` would instead group consecutive *tokens*
        into a head, because the array is laid out row by row.
        """
        len_seq = projected.shape[0]
        return projected.reshape(len_seq, self.n_head, self.d_h).transpose(1, 0, 2)

    def _forward_block(self, input: Tensor32, layer: int) -> Tensor32:
        """Apply the attention block of one layer, including the residual.

        Args:
            input: Array of shape ``(len_seq, d_model)``.
            layer: Index into the leading axis of the weight arrays.

        Returns:
            Array with the same shape as ``input``.

        Raises:
            AssertionError: If the last axis of ``input`` is not ``d_model``.
        """
        len_seq, d_model_input = input.shape
        assert d_model_input == self.d_model, f"{d_model_input=}!={self.d_model=}"

        # Split off the weights of the corresponding layer
        W_Q = self.W_Q[layer]
        W_K = self.W_K[layer]
        W_V = self.W_V[layer]
        W_O = self.W_O[layer]

        # Q, K, V: project, then give every head its own slice of features.
        queries = self._split_heads(np.matmul(input, W_Q))
        keys = self._split_heads(np.matmul(input, W_K))
        values = self._split_heads(np.matmul(input, W_V))

        # QK^T
        similarities = np.matmul(queries, keys.transpose(0, 2, 1))
        # QK^T / sqrt(d_H)
        similarities /= np.sqrt(self.d_h).astype(np.float32)

        # softmax(QK^T / sqrt(d_H)); subtracting the row maximum keeps exp()
        # from overflowing without changing the result.
        attention_weights = np.exp(similarities - similarities.max(axis=-1, keepdims=True))
        attention_weights /= attention_weights.sum(axis=-1, keepdims=True)

        # softmax(QK^T / sqrt(d_H)) * V
        attention_output = np.matmul(attention_weights, values)
        # Inverse of _split_heads: back to one row per token, heads side by side.
        attention_output = attention_output.transpose(1, 0, 2).reshape(len_seq, self.n_head * self.d_h)

        output = np.matmul(attention_output, W_O)
        assert output.shape == input.shape
        # Residual Connection
        return input + output

    def _forward(self, input: Tensor32) -> Tensor32:
        """Run ``input`` through every layer in order and return the result."""
        output = input
        for layer in range(self.n_layer):
            output = self._forward_block(output, layer)
        return output

In [42]:
# Scratch check: the default d_model (768) leaves no remainder when divided
# by the default n_head (12), so the constructor's divisibility assert passes.
768 % 12

0