In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

## attention

In [None]:
def attention(self, query, key, value):
    """Scaled dot-product attention.

    Multiplies `query` by `key` transposed on its last two axes, divides the
    result by the square root of the size of `key`'s last axis, applies a
    softmax over the last axis, and uses the resulting weights to mix `value`.

    `self` is accepted but not read by the body.

    Returns:
        A pair `(output, weights)`: the weighted combination of `value` and
        the softmax weights that produced it.
    """
    logits = tf.matmul(query, key, transpose_b=True)
    key_depth = tf.cast(tf.shape(key)[-1], tf.float32)
    attn_weights = tf.nn.softmax(logits / tf.math.sqrt(key_depth), axis=-1)
    return tf.matmul(attn_weights, value), attn_weights

In [24]:
# Three independent random float32 tensors of shape (8, 64, 128) that stand in
# for the query, key and value arguments of `attention`.
query, key, value = (
    tf.convert_to_tensor(np.random.rand(8, 64, 128), dtype=tf.float32)
    for _ in range(3)
)

In [25]:
# Sanity check: the three inputs share one shape.
query.shape, key.shape, value.shape

(TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]))

In [26]:
# Batched product of query with key transposed on its last two axes: one row
# per query position, one column per key position.
score = tf.matmul(query, key, transpose_b=True)
score.shape

TensorShape([8, 64, 64])

In [27]:
# Size of key's last axis, cast to a float32 scalar so a square root can be
# taken of it.
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
dim_key.numpy()

128.0

In [28]:
# Divide every score by sqrt(dim_key); the shape is unchanged.
scaled_score = score / tf.math.sqrt(dim_key)
scaled_score.shape

TensorShape([8, 64, 64])

In [29]:
# Softmax along the last axis, so each row of scaled scores becomes a set of
# weights that sums to 1.
weights = tf.nn.softmax(scaled_score, axis=-1)
weights.shape

TensorShape([8, 64, 64])

In [30]:
# Mix the rows of value using the weights computed for each query position.
output = tf.matmul(weights, value)
output.shape

TensorShape([8, 64, 128])

In [32]:
# Shapes of the two values that `attention` returns.
output.shape, weights.shape

(TensorShape([8, 64, 128]), TensorShape([8, 64, 64]))

## separate_heads

In [None]:
def separate_heads(self, x, batch_size):
    """Reshape `x` into per-head slices and bring the head axis forward.

    `x` is reshaped to (batch_size, -1, self.num_heads, self.projection_dim),
    with the length of axis 1 inferred, and axes 1 and 2 of the result are
    then swapped.

    Returns:
        The reshaped tensor with its axes ordered as
        (batch_size, num_heads, inferred axis, projection_dim).
    """
    head_shape = (batch_size, -1, self.num_heads, self.projection_dim)
    split = tf.reshape(x, head_shape)
    return tf.transpose(split, perm=[0, 2, 1, 3])

In [46]:
# Random (8, 64, 128) float32 tensor to push through the head-splitting steps.
x = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
# Plain-variable stand-ins for the values `separate_heads` receives or reads
# from `self`; num_heads * projection_dim = 128, the size of x's last axis.
batch_size = 8
num_heads = 8
projection_dim = 16

In [47]:
# Shape before splitting into heads.
x.shape

TensorShape([8, 64, 128])

In [48]:
# Split the last axis (128) into num_heads groups of projection_dim values;
# -1 lets TensorFlow infer the length of axis 1.
# Note: `x` is rebound here, so it is 4-D from this cell onward.
x = tf.reshape(x, (batch_size, -1, num_heads, projection_dim))
x.shape

TensorShape([8, 64, 8, 16])

In [50]:
# Swap axes 1 and 2 so the head axis sits directly after the batch axis.
tf.transpose(x, perm=[0, 2, 1, 3]).shape

TensorShape([8, 8, 64, 16])

## MultiHeadSelfAttention

In [51]:
def separate_heads(x, batch_size):
    """Split `x` into attention heads and move the head axis forward.

    Uses the notebook-level globals `num_heads` and `projection_dim`: `x` is
    reshaped to (batch_size, -1, num_heads, projection_dim), with axis 1
    inferred, and axes 1 and 2 are then swapped.

    Returns:
        The reshaped tensor with its axes ordered as
        (batch_size, num_heads, inferred axis, projection_dim).
    """
    target_shape = (batch_size, -1, num_heads, projection_dim)
    per_head = tf.reshape(x, target_shape)
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

In [54]:
def attention(query, key, value):
    """Scaled dot-product attention.

    Computes `softmax(query @ key^T / sqrt(d_k)) @ value`, where `d_k` is the
    size of `key`'s last axis and the softmax runs over the last axis of the
    score matrix.

    Args:
        query: Floating-point tensor; multiplied by `key` transposed on its
            last two axes.
        key: Floating-point tensor whose last axis has the same size as
            `query`'s last axis.
        value: Floating-point tensor mixed by the attention weights.

    Returns:
        A pair `(output, weights)`: the weighted combination of `value` and
        the softmax weights that produced it.
    """
    score = tf.matmul(query, key, transpose_b=True)
    # Build the scale in the scores' own dtype. A hard-coded float32 scale
    # makes the division below fail for float16/float64 inputs, because
    # TensorFlow does not promote mixed floating-point dtypes.
    dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
    scaled_score = score / tf.math.sqrt(dim_key)
    weights = tf.nn.softmax(scaled_score, axis=-1)
    output = tf.matmul(weights, value)
    return output, weights

In [34]:
# Width of the embedding axis; used as the unit count of every Dense layer
# created for the attention walk-through.
embed_dim = 128
# Number of attention heads the embedding axis is split into.
num_heads = 8

In [37]:
# Each head receives an equal slice of the embedding axis. Floor division
# would silently drop the remainder, and the later reshape back to embed_dim
# would then fail, so require an exact split up front.
assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
projection_dim = embed_dim // num_heads
projection_dim

16

In [38]:
# Four independent Dense(embed_dim) layers: one projection each for query,
# key and value, plus one applied to the concatenated heads.
query_dense, key_dense, value_dense, combine_heads = (
    layers.Dense(embed_dim) for _ in range(4)
)

In [41]:
# Random (8, 64, 128) float32 batch used as the input of the attention layers.
inputs = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)

In [42]:
# Read the batch size from the tensor itself; this rebinds `batch_size` from
# the Python int set earlier to a scalar tensor.
batch_size = tf.shape(inputs)[0]
# Project the same inputs three times; `query`, `key` and `value` are rebound
# here, replacing the random tensors of the attention walk-through.
query = query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = key_dense(inputs)  # (batch_size, seq_len, embed_dim)
value = value_dense(inputs) # (batch_size, seq_len, embed_dim)

In [44]:
# The projections keep the input shape because each Dense layer has embed_dim units.
query.shape, key.shape, value.shape

(TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]))

In [52]:
query = separate_heads(query, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
key = separate_heads(key, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
value = separate_heads(value, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)

In [53]:
query.shape, key.shape, value.shape

(TensorShape([8, 8, 64, 16]),
 TensorShape([8, 8, 64, 16]),
 TensorShape([8, 8, 64, 16]))

In [55]:
attention, weights = attention(query, key, value)

In [56]:
attention.shape, weights.shape

(TensorShape([8, 8, 64, 16]), TensorShape([8, 8, 64, 64]))

In [57]:
attention = tf.transpose(attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, projection_dim)
attention.shape

TensorShape([8, 64, 8, 16])

In [58]:
concat_attention = tf.reshape(attention, (batch_size, -1, embed_dim))  # (batch_size, seq_len, embed_dim)
concat_attention.shape

TensorShape([8, 64, 128])

In [59]:
# Pass the concatenated heads through the final Dense(embed_dim) layer.
# `output` is rebound here, replacing the tensor from the attention walk-through.
output = combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)
output.shape

TensorShape([8, 64, 128])

## TransformerBlock

In [61]:
# Unit count of the first (ReLU) layer of the feed-forward network built next.
ff_dim = 32

In [62]:
# Feed-forward network: a ReLU layer with ff_dim units followed by a linear
# layer that returns to embed_dim units.
ffn = keras.Sequential(
    [
        layers.Dense(ff_dim, activation="relu"),
        layers.Dense(embed_dim),
    ]
)

In [64]:
# Dropout rate shared by both dropout layers.
rate = 0.1
# One normalization layer and one dropout layer per sub-block (attention and
# feed-forward).
layernorm1, layernorm2 = (
    layers.LayerNormalization(epsilon=1e-6) for _ in range(2)
)
dropout1, dropout2 = (layers.Dropout(rate) for _ in range(2))

In [67]:
# Random stand-ins for the block's two inputs. `attn_output` is random data
# rather than the `output` computed in the MultiHeadSelfAttention section, and
# `inputs` is rebound, replacing the tensor used there.
attn_output = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
inputs = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
attn_output.shape, inputs.shape

(TensorShape([8, 64, 128]), TensorShape([8, 64, 128]))

In [68]:
# Apply the first dropout layer to the attention output (rebinds `attn_output`).
# NOTE(review): no `training=True` is passed, so this presumably runs the
# inference path, where Dropout returns its input unchanged - confirm.
attn_output = dropout1(attn_output)

In [71]:
# Residual connection around the attention step, followed by layer normalization.
out1 = layernorm1(inputs + attn_output)
out1.shape

TensorShape([8, 64, 128])

In [72]:
# Run the normalized tensor through the two-layer feed-forward network; its
# last layer has embed_dim units, so the shape is preserved.
ffn_output = ffn(out1)
ffn_output.shape

TensorShape([8, 64, 128])

In [73]:
# Apply the second dropout layer to the feed-forward output (rebinds `ffn_output`).
# NOTE(review): no `training=True` is passed, so this presumably runs the
# inference path, where Dropout returns its input unchanged - confirm.
ffn_output = dropout2(ffn_output)

In [74]:
# Second residual connection (around the feed-forward step) and normalization;
# this is the block's final output.
output_final = layernorm2(out1 + ffn_output)
output_final.shape

TensorShape([8, 64, 128])