In [1]:
import numpy as np

In [2]:
def positional_encoding(seq_len, d_model):
    '''
        Generate sinusoidal positional encoding for input sequences.

        Even-indexed columns hold sin(position * freq_i) and odd-indexed
        columns hold cos(position * freq_i), where
        freq_i = 10000 ** (-2i / d_model) for the i-th sin/cos pair.

        Parameters:
            seq_len: number of positions (rows) to encode.
            d_model: width of the encoding (columns); may be even or odd.

        Returns:
            np.ndarray of shape (seq_len, d_model).
    '''
    pe = np.zeros((seq_len, d_model)) # Positional encoding matrix
    position = np.arange(0, seq_len)[:, np.newaxis] # Column vector (seq_len, 1)
    div_term = np.exp(np.arange(0, d_model, 2) * -(np.log(10000.0) / d_model))
    angles = position * div_term # Shape (seq_len, ceil(d_model / 2))
    pe[:, 0::2] = np.sin(angles)
    # An odd d_model has one fewer odd-indexed column than even-indexed ones,
    # so drop the surplus frequency to keep the assignment shapes aligned.
    pe[:, 1::2] = np.cos(angles[:, :d_model // 2])
    return pe

$$

    \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left(\frac{QK^{T}}{\sqrt{d_k}}\right)V

$$

In [3]:
def scaled_dot_product_attention(query, key, value, mask=None):
    '''
        Compute the scaled dot-product attention:
        softmax(Q K^T / sqrt(d_k)) V

        Parameters:
            query: array of shape (..., seq_len_q, d_k).
            key:   array of shape (..., seq_len_k, d_k).
            value: array of shape (..., seq_len_k, d_v).
            mask:  optional array broadcastable to the score shape
                   (..., seq_len_q, seq_len_k). `mask * -1e9` is added to the
                   scores, so entries equal to 1 are suppressed and entries
                   equal to 0 are left untouched.

        Inputs need at least 2 dimensions; any leading dimensions
        (batch, heads, ...) are handled by matmul broadcasting.

        Returns:
            (attention_output, attention_weights) with shapes
            (..., seq_len_q, d_v) and (..., seq_len_q, seq_len_k).
    '''
    d_k = query.shape[-1]
    # Swap only the last two axes so the input rank is not fixed at 4.
    scores = np.matmul(query, np.swapaxes(key, -1, -2)) / np.sqrt(d_k)
    if mask is not None:
        scores = scores + (mask * -1e9)
    # Softmax is shift-invariant: subtracting each row's max leaves the result
    # unchanged but keeps np.exp from overflowing to inf (which yields NaN).
    # Skipped when there are no keys, since max over an empty axis raises.
    if scores.shape[-1] > 0:
        scores = scores - np.max(scores, axis=-1, keepdims=True)
    exp_scores = np.exp(scores)
    attention_weights = exp_scores / np.sum(exp_scores, axis=-1, keepdims=True)
    attention_output = np.matmul(attention_weights, value)
    return attention_output, attention_weights