In [1]:
#SCALED DOT PRODUCT
from IPython import get_ipython
from IPython.display import display
from tensorflow import matmul, cast, float32, math
from tensorflow.math import sqrt
from tensorflow.keras.layers import Layer
from tensorflow.keras.activations import softmax
import numpy as np

class DotProductAttention(Layer):
  """Scaled dot-product attention packaged as a Keras layer.

  Every query is scored against every key, the scores are divided by
  ``sqrt(d_k)``, an optional additive mask is applied, and the softmax of the
  result is used to take a weighted sum of the values.
  """

  def __init__(self, **kwargs):
    # No state of its own; keyword arguments go straight to ``Layer``.
    super().__init__(**kwargs)

  def call(self, queries, keys, values, *, d_k, mask=None):
    """Compute the attention output.

    Args:
      queries: Query tensor, multiplied against the transposed ``keys``.
      keys: Key tensor.
      values: Value tensor that is combined using the attention weights.
      d_k: Keyword-only scaling dimension; scores are divided by its square
        root.
      mask: Optional tensor. When given, ``-1e9 * mask`` is added to the
        scores so that positions where the mask is 1 receive (almost) zero
        weight after the softmax.

    Returns:
      The product of the softmax-normalised scores with ``values``.
    """
    scale = sqrt(cast(d_k, float32))
    logits = matmul(queries, keys, transpose_b=True) / scale
    if mask is not None:
      logits += -1e9 * mask
    return matmul(softmax(logits), values)

# Demo dimensions for the scaled dot-product attention layer.
batch_size, input_seq_length = 32, 10
d_k = d_v = 64

# Seeded generator so the random inputs are the same on every run.
# Draw order (queries, keys, values) is kept so the sampled numbers match.
random = np.random.default_rng(seed=42)
qk_shape = (batch_size, input_seq_length, d_k)
queries = random.random(qk_shape)
keys = random.random(qk_shape)
values = random.random((batch_size, input_seq_length, d_v))

attention = DotProductAttention()
print(attention(queries, keys, values, d_k=d_k))

tf.Tensor(
[[[0.46824062 0.5564131  0.46830392 ... 0.49306852 0.4061299  0.46411484]
  [0.47598845 0.55510455 0.47800186 ... 0.49304226 0.40060428 0.4702602 ]
  [0.47095585 0.5524668  0.47349644 ... 0.49247035 0.41551554 0.4656694 ]
  ...
  [0.473594   0.5530096  0.48761848 ... 0.490785   0.40739113 0.47809464]
  [0.47385246 0.5515101  0.47469315 ... 0.48799852 0.40717867 0.47817177]
  [0.45735514 0.5545202  0.47318476 ... 0.48771787 0.41125482 0.45760006]]

 [[0.5257553  0.46964663 0.64925057 ... 0.5456276  0.62523377 0.49289626]
  [0.51869744 0.48040384 0.64571327 ... 0.53287935 0.6220018  0.50445914]
  [0.53110534 0.48170853 0.6410709  ... 0.5455302  0.6282068  0.4921141 ]
  ...
  [0.53149515 0.48453844 0.63667035 ... 0.52498525 0.6216751  0.5099271 ]
  [0.52042365 0.48426446 0.64478606 ... 0.5371342  0.6203686  0.5011124 ]
  [0.51877236 0.48158753 0.6400035  ... 0.5273335  0.62763    0.5056677 ]]

 [[0.5787959  0.49524885 0.5873417  ... 0.59600276 0.670287   0.5746271 ]
  [0.571639

In [2]:
from IPython import get_ipython
from IPython.display import display
# %%
from tensorflow import math, matmul, reshape, shape, transpose, cast, float32, concat
from tensorflow.keras.layers import Dense, Layer
from tensorflow.keras.backend import softmax
# Implementing the Scaled-Dot Product Attention

class DotProductAttention(Layer):
  """Scaled dot-product attention.

  The scaling dimension is taken from the last axis of ``queries`` rather
  than being passed in by the caller.
  """

  def __init__(self, **kwargs):
    super().__init__(**kwargs)

  def call(self, queries, keys, values, mask=None):
    """Compute ``softmax(Q K^T / sqrt(d_k) [+ mask]) V``.

    Args:
      queries: Query tensor; its last axis size is used as ``d_k``.
      keys: Key tensor, transposed on its last two axes before the product.
      values: Value tensor combined using the attention weights.
      mask: Optional tensor. When given, ``-1e9 * mask`` is added to the
        scores before the softmax.

    Returns:
      The attention-weighted sum of ``values``.
    """
    # Use the static size when it is known. It can be None when the tensor was
    # produced by a reshape with dynamic dimensions inside a traced graph, and
    # cast(None, ...) would fail, so fall back to the runtime shape.
    d_k = queries.shape[-1]
    if d_k is None:
      d_k = shape(queries)[-1]
    # Scoring the queries against the keys after transposing the latter, and scaling
    scores = matmul(queries, keys, transpose_b=True) / math.sqrt(cast(d_k, float32))
    # Apply mask to the attention scores
    if mask is not None:
      scores += -1e9 * mask
    # Computing the weights by a softmax operation
    weights = softmax(scores)
    # Computing the attention by a weighted sum of the value vectors
    return matmul(weights, values)

# Implementing the Multi-Head Attention

class MultiHeadAttention(Layer):
  """Multi-head attention built on ``DotProductAttention``.

  Queries, keys and values are each projected by a Dense layer, split into
  ``h`` heads along the feature axis, attended per head, merged back and
  projected to ``d_model``.

  Note that the projections have ``d_k`` / ``d_v`` units in total, so each
  head works on ``d_k // h`` (respectively ``d_v // h``) features.
  """

  def __init__(self, h, d_k, d_v, d_model, **kwargs):
    """Create the projection layers.

    Args:
      h: Number of attention heads.
      d_k: Total width of the projected queries and keys (split over heads).
      d_v: Total width of the projected values (split over heads).
      d_model: Width of the final output projection.
      **kwargs: Forwarded to ``Layer``.

    Raises:
      ValueError: If ``h`` is not positive or does not divide ``d_k`` and
        ``d_v``; the head split would otherwise fail later inside ``reshape``.
    """
    super().__init__(**kwargs)
    if h <= 0 or d_k % h or d_v % h:
      raise ValueError(
          f"d_k ({d_k}) and d_v ({d_v}) must both be divisible by a positive "
          f"number of heads (got h={h})")
    self.attention = DotProductAttention() # Scaled dot product attention
    self.heads = h # Number of attention heads to use
    self.d_k = d_k # Total width of the projected queries and keys
    self.d_v = d_v # Total width of the projected values
    self.d_model = d_model # Dimensionality of the model
    self.W_q = Dense(d_k) # Learned projection matrix for the queries
    self.W_k = Dense(d_k) # Learned projection matrix for the keys
    self.W_v = Dense(d_v) # Learned projection matrix for the values
    self.W_o = Dense(d_model) # Learned projection matrix for the multi-head output

  def reshape_tensor(self, x, heads, flag):
    """Split ``x`` into heads (``flag`` true) or merge heads back (``flag`` false).

    Split:  (batch, seq, features)      -> (batch, heads, seq, features // heads)
    Merge:  (batch, heads, seq, depth)  -> (batch, seq, heads * depth)

    Static feature sizes are used whenever they are known so that the result
    keeps a defined last dimension; a following Dense layer needs that to
    build when the code is traced into a graph. When a size is unknown the
    reshape falls back to ``-1`` and lets TensorFlow infer it at run time.
    """
    if flag:
      features = x.shape[-1]
      depth = -1 if features is None else features // heads
      x = reshape(x, shape=(shape(x)[0], shape(x)[1], heads, depth))
      x = transpose(x, perm=(0, 2, 1, 3))
    else:
      x = transpose(x, perm=(0, 2, 1, 3))
      n_heads, depth = x.shape[2], x.shape[3]
      merged = -1 if n_heads is None or depth is None else n_heads * depth
      x = reshape(x, shape=(shape(x)[0], shape(x)[1], merged))

    return x

  def call(self, queries, keys, values, mask=None):
    """Apply multi-head attention.

    Args:
      queries: Tensor fed to the query projection.
      keys: Tensor fed to the key projection.
      values: Tensor fed to the value projection.
      mask: Optional mask forwarded unchanged to the attention layer.

    Returns:
      The merged head outputs projected by ``W_o`` (last axis ``d_model``).
    """
    # Project, then rearrange so that all heads are computed in parallel.
    # Each resulting tensor has shape (batch_size, heads, seq_length, -1).
    q_reshaped = self.reshape_tensor(self.W_q(queries), self.heads, True)
    k_reshaped = self.reshape_tensor(self.W_k(keys), self.heads, True)
    v_reshaped = self.reshape_tensor(self.W_v(values), self.heads, True)
    # Per-head attention; shape stays (batch_size, heads, seq_length, -1)
    o_reshaped = self.attention(q_reshaped, k_reshaped, v_reshaped, mask=mask)
    # Merge the heads back into one feature axis: (batch_size, seq_length, d_v)
    output = self.reshape_tensor(o_reshaped, self.heads, False)
    # Final linear projection to (batch_size, seq_length, d_model)
    return self.W_o(output)
# %%
from numpy import random
# Demo configuration for the multi-head attention layer.
h = 8                 # number of self-attention heads
d_model = 512         # width of the layer's output
d_k = d_v = 64        # widths of the query/key and value projections
input_seq_length = 5  # maximum length of the input sequence
batch_size = 64       # batch size from the training process

# Random stand-ins for real inputs, drawn in the order queries, keys, values.
qk_shape = (batch_size, input_seq_length, d_k)
queries = random.random(qk_shape)
keys = random.random(qk_shape)
values = random.random((batch_size, input_seq_length, d_v))

multihead_attention = MultiHeadAttention(h, d_k, d_v, d_model)
print(multihead_attention(queries, keys, values))

tf.Tensor(
[[[-0.03918857  0.23965558  0.1576894  ... -0.09452695 -0.00175945
   -0.01606564]
  [-0.04583649  0.23292817  0.15202665 ... -0.09762093 -0.01820333
   -0.00944021]
  [-0.04153083  0.24064599  0.15618199 ... -0.09684861 -0.01875615
   -0.0104829 ]
  [-0.03141589  0.23563994  0.15689145 ... -0.08968071 -0.00576321
   -0.01493201]
  [-0.03767595  0.23066333  0.15056081 ... -0.09380836 -0.0031466
   -0.01511911]]

 [[-0.154156    0.297374    0.18527241 ... -0.16654359 -0.07911475
    0.09892107]
  [-0.15713435  0.2952929   0.18469529 ... -0.1649993  -0.07346685
    0.0963065 ]
  [-0.14870173  0.29774722  0.18671434 ... -0.17230955 -0.06972022
    0.10104506]
  [-0.14972018  0.28962058  0.17733829 ... -0.16820109 -0.07686087
    0.09654674]
  [-0.14786234  0.29783407  0.18618785 ... -0.15618616 -0.07385553
    0.09621389]]

 [[-0.11995585  0.25221106  0.25074473 ... -0.05550449 -0.07291637
    0.13295746]
  [-0.1236603   0.2379325   0.25063935 ... -0.06035953 -0.07826111
    0.

In [3]:
import tensorflow as tf

class BahdanauAttention(tf.keras.layers.Layer):
    """Additive (Bahdanau-style) attention over a sequence of encoder outputs."""

    def __init__(self, units, **kwargs):
        """
        Args:
            units: Width of the hidden alignment space used by W1 and W2.
            **kwargs: Standard Keras layer arguments (e.g. name, dtype),
                forwarded to the base Layer.
        """
        super().__init__(**kwargs)
        self.W1 = tf.keras.layers.Dense(units)  # Dense layer for the encoder hidden states
        self.W2 = tf.keras.layers.Dense(units)  # Dense layer for the decoder hidden state
        self.V = tf.keras.layers.Dense(1)       # Dense layer to compute alignment scores

    def call(self, query, values):
        """
        Args:
            query: Decoder hidden state (shape: [batch_size, hidden_size]).
            values: Encoder outputs (shape: [batch_size, seq_len, hidden_size]).
        Returns:
            context_vector: Weighted sum of encoder outputs (shape: [batch_size, hidden_size]).
            attention_weights: Attention weights (shape: [batch_size, seq_len, 1]).
                The trailing axis of size 1 is kept; apply tf.squeeze(..., axis=-1)
                if a [batch_size, seq_len] tensor is needed.
        """
        # Add time axis to query for broadcasting (shape: [batch_size, 1, hidden_size])
        query_with_time_axis = tf.expand_dims(query, 1)

        # Compute the alignment scores (shape: [batch_size, seq_len, 1])
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))

        # Normalise over the sequence axis; the shape stays [batch_size, seq_len, 1]
        # so the weights broadcast against `values` below.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Compute the context vector as the weighted sum of values (shape: [batch_size, hidden_size])
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

# Example usage
if __name__ == "__main__":
    # Problem sizes: batch, encoder sequence length, hidden width, and the
    # width of the attention layer's internal alignment space.
    batch_size, seq_len, hidden_size = 64, 10, 256
    attention_units = 128

    attention = BahdanauAttention(units=attention_units)

    # Random stand-ins for the encoder outputs (values) and the current
    # decoder hidden state (query).
    encoder_outputs = tf.random.normal([batch_size, seq_len, hidden_size])
    decoder_hidden_state = tf.random.normal([batch_size, hidden_size])

    # Run the layer once and report the shapes it produces.
    context_vector, attention_weights = attention(decoder_hidden_state, encoder_outputs)

    print("Context vector shape:", context_vector.shape)  # Expected: [batch_size, hidden_size]
    print("Attention weights shape:", attention_weights.shape)  # Expected: [batch_size, seq_len, 1]


Context vector shape: (64, 256)
Attention weights shape: (64, 10, 1)


In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense

class LuongAttention(Layer):
    """Luong-style attention with a selectable score function.

    Supported ``attention_type`` values:
        "dot":     score = encoder_output . hidden
        "general": score = Dense(encoder_output) . hidden
        "concat":  score = v . tanh(Dense([hidden; encoder_output]))
    """

    def __init__(self, attention_type, hidden_size, **kwargs):
        """
        Args:
            attention_type: One of "dot", "general" or "concat".
            hidden_size: Width of the decoder hidden state / encoder outputs.
            **kwargs: Standard Keras layer arguments (e.g. name, dtype),
                forwarded to the base Layer.
        Raises:
            ValueError: If attention_type is not a supported score function.
        """
        super().__init__(**kwargs)
        # Reject a bad configuration at construction time instead of at the
        # first forward pass.
        if attention_type not in ("dot", "general", "concat"):
            raise ValueError("Unknown attention type: {}".format(attention_type))

        self.attention_type = attention_type
        self.hidden_size = hidden_size

        if attention_type == "general":
            self.attention_weight = Dense(hidden_size)
        elif attention_type == "concat":
            self.attention_weight = Dense(hidden_size)
            # Registered through add_weight so the layer tracks it as a
            # trainable weight; a plain tf.Variable attribute is not tracked
            # by Keras 3. stddev=1.0 keeps the N(0, 1) initial distribution.
            self.v = self.add_weight(
                name="v",
                shape=(hidden_size,),
                initializer=tf.keras.initializers.RandomNormal(stddev=1.0),
                trainable=True,
            )

    def score(self, hidden, encoder_outputs):
        """Return alignment scores of shape [batch_size, seq_len].

        Args:
            hidden: Decoder hidden state (shape: [batch_size, hidden_size]).
            encoder_outputs: Encoder outputs (shape: [batch_size, seq_len, hidden_size]).
        Raises:
            ValueError: If self.attention_type is not a supported score function.
        """
        if self.attention_type == "dot":
            # Dot product between hidden state and encoder outputs
            return tf.matmul(encoder_outputs, tf.expand_dims(hidden, axis=-1))[:, :, 0]

        elif self.attention_type == "general":
            # Linear transformation followed by dot product
            energy = self.attention_weight(encoder_outputs)
            return tf.matmul(energy, tf.expand_dims(hidden, axis=-1))[:, :, 0]

        elif self.attention_type == "concat":
            # Repeat the hidden state along the time axis and concatenate it
            # with every encoder output.
            hidden_expanded = tf.expand_dims(hidden, axis=1)
            hidden_expanded = tf.tile(hidden_expanded, [1, tf.shape(encoder_outputs)[1], 1])
            concat_input = tf.concat([hidden_expanded, encoder_outputs], axis=-1)
            energy = tf.tanh(self.attention_weight(concat_input))
            # Project each time step onto v to get one scalar score per step.
            return tf.reduce_sum(energy * self.v, axis=2)

        else:
            # Only reachable if attention_type was changed after construction.
            raise ValueError("Unknown attention type: {}".format(self.attention_type))

    def call(self, hidden, encoder_outputs):
        """
        Args:
            hidden: Decoder hidden state (shape: [batch_size, hidden_size]).
            encoder_outputs: Encoder outputs (shape: [batch_size, seq_len, hidden_size]).
        Returns:
            context_vector: Weighted sum of encoder outputs (shape: [batch_size, hidden_size]).
            attention_weights: Attention weights (shape: [batch_size, seq_len]).
        """
        # Compute alignment scores
        alignment_scores = self.score(hidden, encoder_outputs)

        # Softmax normalization to obtain attention weights
        attention_weights = tf.nn.softmax(alignment_scores, axis=1)

        # Compute the context vector as the weighted sum of encoder outputs
        context_vector = tf.matmul(tf.expand_dims(attention_weights, axis=1), encoder_outputs)
        context_vector = tf.squeeze(context_vector, axis=1)

        return context_vector, attention_weights


# Example usage
if __name__ == "__main__":
    # Small sizes so the printed tensors stay readable.
    batch_size, seq_len, hidden_size = 2, 5, 10

    # Random stand-ins: one decoder hidden state per batch element, and a
    # sequence of encoder outputs for it to attend over.
    hidden = tf.random.normal([batch_size, hidden_size])
    encoder_outputs = tf.random.normal([batch_size, seq_len, hidden_size])

    # Score function to use; the layer accepts "dot", "general" or "concat".
    attention_type = "dot"
    attention_layer = LuongAttention(attention_type, hidden_size)

    # Single forward pass through the attention layer.
    context_vector, attention_weights = attention_layer(hidden, encoder_outputs)

    print("Context vector:", context_vector.numpy())
    print("Attention weights:", attention_weights.numpy())


Context vector: [[-0.0061667  -0.4700908   0.26643655  0.3190401  -0.32473612 -0.93865937
   0.49196798  0.90012634  0.26490387  0.10492379]
 [ 1.652067   -0.5170198   0.42716143  1.3397385   2.1753972   0.7432705
   1.1176186   0.86316746  0.6020574   1.107633  ]]
Attention weights: [[6.7430109e-02 1.8881023e-01 3.3759749e-01 3.9063013e-01 1.5531966e-02]
 [1.4719539e-04 1.3359219e-02 6.3651330e-03 8.2620652e-03 9.7186637e-01]]
