import numpy as np
def tokenize_and_embed(tokens: np.ndarray, weight_matrix: np.ndarray, bias: np.ndarray) -> np.ndarray:
    """
    Embeds input tokens using matrix multiplication with weights and bias.

    Parameters:
        tokens: Input token vector
        weight_matrix: Embedding weight matrix
        bias: Embedding bias vector

    Returns:
        np.ndarray: Embedded token vector
    """
    return np.dot(weight_matrix, tokens) + bias
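
# Illustrative usage sketch (added for clarity, not part of the original module):
# assumes a one-hot token vector over a toy vocabulary of size 5 and an
# embedding dimension of 3, so weight_matrix has shape (embed_dim, vocab_size).
def _demo_tokenize_and_embed() -> None:
    rng = np.random.default_rng(0)
    token_one_hot = np.zeros(5)
    token_one_hot[2] = 1.0                    # pretend the input is token id 2
    embed_weights = rng.normal(size=(3, 5))   # (embed_dim, vocab_size)
    embed_bias = np.zeros(3)
    embedded = tokenize_and_embed(token_one_hot, embed_weights, embed_bias)
    print("embedded shape:", embedded.shape)  # expected: (3,)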
def scaled_dot_product_attention(queries: np.ndarray, keys: np.ndarray, values: np.ndarray) -> np.ndarray:
    """
    Computes the scaled dot-product attention for basic transformer-like systems.

    Parameters:
        queries: Query matrix
        keys: Key matrix
        values: Value matrix

    Returns:
        np.ndarray: Attention-applied value matrix
    """
    d_k = keys.shape[-1]
    # Scale the raw dot products by sqrt(d_k) to keep score magnitudes stable.
    scores = np.dot(queries, keys.T) / np.sqrt(d_k)
    # Row-wise softmax; subtracting each row's max avoids overflow in np.exp.
    scores -= np.max(scores, axis=-1, keepdims=True)
    attention_weights = np.exp(scores) / np.sum(np.exp(scores), axis=-1, keepdims=True)
    return np.dot(attention_weights, values)
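
# Illustrative usage sketch (added, with assumed shapes): 4 query positions and
# 6 key/value positions, each of dimensionality d_k = 8. The attention weights
# form a (4, 6) matrix whose rows sum to 1.
def _demo_scaled_dot_product_attention() -> None:
    rng = np.random.default_rng(1)
    queries = rng.normal(size=(4, 8))
    keys = rng.normal(size=(6, 8))
    values = rng.normal(size=(6, 8))
    attended = scaled_dot_product_attention(queries, keys, values)
    print("attention output shape:", attended.shape)  # expected: (4, 8)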
def forward_propagation(input_data: np.ndarray, weight_matrix: np.ndarray, bias: np.ndarray) -> np.ndarray:
    """
    Applies a dense layer with ReLU activation.

    Parameters:
        input_data: Input vector
        weight_matrix: Weight matrix
        bias: Bias vector

    Returns:
        np.ndarray: Output after ReLU activation
    """
    z = np.dot(weight_matrix, input_data) + bias
    return np.maximum(0, z)
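
# Illustrative usage sketch (added, with assumed sizes): a dense layer mapping
# a 3-dimensional input to a 2-dimensional output, followed by ReLU.
def _demo_forward_propagation() -> None:
    rng = np.random.default_rng(2)
    x = rng.normal(size=3)
    weights = rng.normal(size=(2, 3))   # (out_features, in_features)
    bias = np.zeros(2)
    activated = forward_propagation(x, weights, bias)
    print("layer output:", activated)   # every entry is >= 0 after ReLU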
def gradient_update(weight_matrix: np.ndarray, input_data: np.ndarray, loss_gradient: np.ndarray, learning_rate: float = 0.01) -> np.ndarray:
    """
    Performs a basic gradient descent step to update the weights.

    Parameters:
        weight_matrix: Current weights
        input_data: Inputs for gradient calculation
        loss_gradient: Gradient from loss
        learning_rate: Learning rate (default: 0.01)

    Returns:
        np.ndarray: Updated weight matrix
    """
    # Outer product of the loss gradient with the inputs gives dL/dW for a dense layer.
    gradients = np.dot(loss_gradient, input_data.T)
    return weight_matrix - learning_rate * gradients
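
# Illustrative sketch (added, with assumed shapes): one gradient descent step on
# the same (2, 3) dense layer. Treating vectors as (n, 1) column arrays keeps
# np.dot(loss_gradient, input_data.T) shaped like the weight matrix. The loss
# gradient here is arbitrary, purely for demonstration.
def _demo_gradient_update() -> None:
    rng = np.random.default_rng(3)
    x = rng.normal(size=(3, 1))          # input as a column vector
    weights = rng.normal(size=(2, 3))
    loss_grad = rng.normal(size=(2, 1))  # dL/dz for the layer output
    updated = gradient_update(weights, x, loss_grad, learning_rate=0.1)
    print("weights changed:", not np.allclose(updated, weights))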