In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

## attention

In [None]:
def attention(self, query, key, value):
    """Scaled dot-product attention.

    Args:
        self: Not used by the body; kept so the signature is unchanged.
        query: Tensor that is matmul-compatible with `key` once the last two
            axes of `key` are swapped.
        key: Tensor whose last-axis size is used as the scaling term.
        value: Tensor that is matmul-compatible with the attention weights.

    Returns:
        Tuple `(output, weights)`, where `weights` is the softmax of the
        scaled scores over the last axis and `output` is `weights @ value`.
    """
    score = tf.matmul(query, key, transpose_b=True)
    # Cast the key depth to the dtype of the scores instead of a hard-coded
    # float32, so float16/float64 inputs do not fail on the division below.
    dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
    scaled_score = score / tf.math.sqrt(dim_key)
    weights = tf.nn.softmax(scaled_score, axis=-1)
    output = tf.matmul(weights, value)
    return output, weights

In [24]:
# Three independent random float32 tensors of shape (8, 64, 128),
# drawn in the order query, key, value.
query, key, value = (
    tf.convert_to_tensor(np.random.rand(8, 64, 128), dtype=tf.float32)
    for _ in range(3)
)

In [25]:
# Check that the three sample tensors have the same shape.
query.shape, key.shape, value.shape

(TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]))

In [26]:
# Multiply query by key with key's last two axes swapped (transpose_b=True),
# giving one score per (query position, key position) pair.
score = tf.matmul(query, key, transpose_b=True)
score.shape

TensorShape([8, 64, 64])

In [27]:
# Size of key's last axis as a float32 scalar; used as the scaling term in the next cell.
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
dim_key.numpy()

128.0

In [28]:
# Divide every score by sqrt(dim_key); the shape is unchanged.
scaled_score = score / tf.math.sqrt(dim_key)
scaled_score.shape

TensorShape([8, 64, 64])

In [29]:
# Softmax over the last axis turns each row of scaled scores into weights.
weights = tf.nn.softmax(scaled_score, axis=-1)
weights.shape

TensorShape([8, 64, 64])

In [30]:
# Combine the value vectors using the attention weights.
output = tf.matmul(weights, value)
output.shape

TensorShape([8, 64, 128])

In [32]:
# Shapes of the two values that attention() returns: the output and the weights.
output.shape, weights.shape

(TensorShape([8, 64, 128]), TensorShape([8, 64, 64]))

## separate_heads

In [None]:
def separate_heads(self, x, batch_size):
    """Split `x` into attention heads and move the head axis forward.

    `x` is reshaped to (batch_size, -1, self.num_heads, self.projection_dim)
    and axes 1 and 2 are then swapped.
    """
    split_shape = (batch_size, -1, self.num_heads, self.projection_dim)
    per_head = tf.reshape(x, split_shape)
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

In [46]:
# Stand-in values for the arguments and attributes that separate_heads reads,
# so its body can be run one statement at a time.
x = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
batch_size = 8
num_heads = 8
projection_dim = 16  # num_heads * projection_dim == 128, the last axis of x

In [47]:
# Shape before the split into heads.
x.shape

TensorShape([8, 64, 128])

In [48]:
# Split the last axis into (num_heads, projection_dim); -1 lets reshape infer the middle axis.
x = tf.reshape(x, (batch_size, -1, num_heads, projection_dim))
x.shape

TensorShape([8, 64, 8, 16])

In [50]:
# Swap axes 1 and 2 so the head axis comes directly after the batch axis.
tf.transpose(x, perm=[0, 2, 1, 3]).shape

TensorShape([8, 8, 64, 16])

## MultiHeadSelfAttention

In [51]:
def separate_heads(x, batch_size):
    """Split `x` into attention heads and move the head axis forward.

    Reads the module-level `num_heads` and `projection_dim` when called.
    `x` is reshaped to (batch_size, -1, num_heads, projection_dim) and axes
    1 and 2 are then swapped.
    """
    split_shape = (batch_size, -1, num_heads, projection_dim)
    per_head = tf.reshape(x, split_shape)
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

In [54]:
def attention(query, key, value):
    """Scaled dot-product attention.

    Args:
        query: Tensor that is matmul-compatible with `key` once the last two
            axes of `key` are swapped.
        key: Tensor whose last-axis size is used as the scaling term.
        value: Tensor that is matmul-compatible with the attention weights.

    Returns:
        Tuple `(output, weights)`, where `weights` is the softmax of the
        scaled scores over the last axis and `output` is `weights @ value`.
    """
    score = tf.matmul(query, key, transpose_b=True)
    # Cast the key depth to the dtype of the scores instead of a hard-coded
    # float32, so float16/float64 inputs do not fail on the division below.
    dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
    scaled_score = score / tf.math.sqrt(dim_key)
    weights = tf.nn.softmax(scaled_score, axis=-1)
    output = tf.matmul(weights, value)
    return output, weights

In [34]:
# Model width and number of attention heads used in the cells below.
embed_dim, num_heads = 128, 8

In [37]:
# Width of each head. Integer division: any remainder of embed_dim / num_heads is dropped.
projection_dim = embed_dim // num_heads
projection_dim

16

In [38]:
# Four separate Dense layers of width embed_dim: one each for the query, key and
# value projections, and one applied after the heads are concatenated again.
query_dense = layers.Dense(embed_dim)
key_dense = layers.Dense(embed_dim)
value_dense = layers.Dense(embed_dim)
combine_heads = layers.Dense(embed_dim)

In [41]:
# Random float32 sample of shape (8, 64, 128) standing in for the layer input.
inputs = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)

In [42]:
# Read the batch size from the tensor itself rather than hard-coding it.
batch_size = tf.shape(inputs)[0]
# The same inputs go through three different Dense layers (self-attention).
query = query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = key_dense(inputs)  # (batch_size, seq_len, embed_dim)
value = value_dense(inputs) # (batch_size, seq_len, embed_dim)

In [44]:
# Shapes after the Dense projections.
query.shape, key.shape, value.shape

(TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]))

In [52]:
# NOTE: each name is overwritten with its per-head version, so this cell is not
# idempotent. Running it again reshapes and transposes the already-split tensors
# a second time without raising an error; re-run the projection cell first.
query = separate_heads(query, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
key = separate_heads(key, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
value = separate_heads(value, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)

In [53]:
# Shapes after splitting into heads.
query.shape, key.shape, value.shape

(TensorShape([8, 8, 64, 16]),
 TensorShape([8, 8, 64, 16]),
 TensorShape([8, 8, 64, 16]))

In [55]:
attention, weights = attention(query, key, value)

In [56]:
attention.shape, weights.shape

(TensorShape([8, 8, 64, 16]), TensorShape([8, 8, 64, 64]))

In [57]:
attention = tf.transpose(attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, projection_dim)
attention.shape

TensorShape([8, 64, 8, 16])

In [58]:
concat_attention = tf.reshape(attention, (batch_size, -1, embed_dim))  # (batch_size, seq_len, embed_dim)
concat_attention.shape

TensorShape([8, 64, 128])

In [59]:
# Final Dense layer applied to the concatenated heads.
output = combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)
output.shape

TensorShape([8, 64, 128])

## TransformerBlock

In [61]:
# Width of the first Dense layer in the feed-forward network built in the next cell.
ff_dim = 32

In [62]:
# Feed-forward network: a ReLU layer of width ff_dim, then a layer of width embed_dim.
ffn = keras.Sequential(
    [
        layers.Dense(ff_dim, activation="relu"),
        layers.Dense(embed_dim),
    ]
)

In [64]:
# Dropout rate shared by both dropout layers.
rate=0.1
# One LayerNormalization and one Dropout per residual step; the cells below use
# the first pair around the attention output and the second pair around the ffn output.
layernorm1 = layers.LayerNormalization(epsilon=1e-6)
layernorm2 = layers.LayerNormalization(epsilon=1e-6)
dropout1 = layers.Dropout(rate)
dropout2 = layers.Dropout(rate)

In [67]:
# Random placeholders: attn_output stands in for the attention result and
# inputs for the block input. Note that `inputs` is rebound here.
attn_output = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
inputs = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
attn_output.shape, inputs.shape

(TensorShape([8, 64, 128]), TensorShape([8, 64, 128]))

In [68]:
# NOTE(review): no `training` argument is passed. Keras Dropout is normally
# inactive outside training, so this call likely returns its input unchanged; confirm.
attn_output = dropout1(attn_output)

In [71]:
# First residual step: add the attention output to the inputs, then layer-normalise.
out1 = layernorm1(inputs + attn_output)
out1.shape

TensorShape([8, 64, 128])

In [72]:
# Feed-forward network; its last layer has width embed_dim, so the shape matches out1.
ffn_output = ffn(out1)
ffn_output.shape

TensorShape([8, 64, 128])

In [73]:
# NOTE(review): called without a `training` argument, like dropout1 above; confirm intended mode.
ffn_output = dropout2(ffn_output)

In [74]:
# Second residual step: add the feed-forward output to out1, then layer-normalise.
output_final = layernorm2(out1 + ffn_output)
output_final.shape

TensorShape([8, 64, 128])

## Main Function Development

In [2]:
import tensorflow as tf
import sys

In [3]:
# Put ./model_X (relative to the working directory) first on the module search path.
# Presumably this is where transformers_utils, imported in the next cell, lives; confirm.
sys.path.insert(0,'./model_X')

In [4]:
from transformers_utils import *

In [8]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
maxlen = 128  # Length of each input row; fixes the Input shape below
vocab_size = 1000  # Passed to TokenAndPositionEmbedding in the next cell
# Symbolic model input: one row of `maxlen` values per example.
inputs = tf.keras.layers.Input(shape=(maxlen,))

In [9]:
# Build the embedding layer and apply it to `inputs` in one expression.
# NOTE(review): despite its name, `embedding_layer` holds the result of the call,
# not the layer object. TokenAndPositionEmbedding comes from the star import above.
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)

In [12]:
# Build the block and apply it straight away; `transformer_block` is the output tensor.
# NOTE(review): the second positional argument is a hard-coded True. If it maps to
# a `training` flag in transformers_utils.TransformerBlock, dropout stays active
# at inference time (e.g. in model.predict); confirm against that module.
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)(embedding_layer, True)

In [14]:
# Inspect the symbolic output tensor of the block.
transformer_block

<tf.Tensor 'layer_normalization_3/Identity:0' shape=(None, 128, 32) dtype=float32>

In [16]:
# Pool the block output down to one vector per example: (None, 128, 32) -> (None, 32).
x = tf.keras.layers.GlobalAveragePooling1D()(transformer_block)
x

<tf.Tensor 'global_average_pooling1d/Identity:0' shape=(None, 32) dtype=float32>

In [17]:
# Classification head: dropout, a 20-unit ReLU layer, dropout, then a 2-way softmax.
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(20, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(2, activation="softmax")(x)

# Wrap the graph from `inputs` to `outputs` as a Keras model.
model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [18]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_op_layer_Shape (TensorFlowOp [(2,)]               0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [()]                 0           tf_op_layer_Shape[0][0]          
__________________________________________________________________________________________________
tf_op_layer_range (TensorFlowOp [(None,)]            0           tf_op_layer_strided_slice[0][0]  
______________________________________________________________________________________________

In [21]:
import numpy as np

In [26]:
# Draw integer ids in [0, vocab_size) instead of np.random.rand floats in [0, 1),
# which would all truncate to id 0. Presumably TokenAndPositionEmbedding looks
# tokens up by id; confirm against transformers_utils.
# The values stay float32 to match the dtype this cell has always passed to the model.
inputs = tf.convert_to_tensor(
    np.random.randint(0, vocab_size, size=(8, maxlen)).astype(np.float32),
    dtype=tf.float32,
)

In [27]:
# Run the model on the 8 sample rows in a single batch; each output row is the 2-way softmax.
model.predict(inputs, batch_size=8)

array([[0.5419823 , 0.45801768],
       [0.5396268 , 0.46037325],
       [0.556035  , 0.44396505],
       [0.55087   , 0.44912994],
       [0.550646  , 0.44935405],
       [0.55667895, 0.44332105],
       [0.5523735 , 0.4476265 ],
       [0.5457177 , 0.4542823 ]], dtype=float32)

In [40]:
from transformers import *

In [44]:
import os

# Let the ALBERT_CONFIG_PATH environment variable override the config location,
# so the cell can run on other machines. The original absolute path stays the default.
albert_config_path = os.environ.get(
    "ALBERT_CONFIG_PATH",
    '/Users/ankur.kumar/Downloads/base_2/assets/albert_config.json',
)
config = AlbertConfig.from_json_file(albert_config_path)

In [45]:
# Construct a TF ALBERT model from the config object alone (no from_pretrained call here).
# This rebinds `model`, replacing the Keras model built earlier in the notebook.
model = modeling_tf_albert.TFAlbertModel(config)

In [50]:
# np.random.rand returns floats in [0, 1); casting those to int32 truncated every
# entry to token id 0. Draw random ids from the configured vocabulary instead.
inputs = tf.convert_to_tensor(
    np.random.randint(0, config.vocab_size, size=(64, 128), dtype=np.int32),
    dtype=tf.int32,
)

In [51]:
# Forward pass; the result is indexed as output[0] and output[1] in the next cell.
output = model(inputs)

In [55]:
# output[0] has one 768-vector per input position (64, 128, 768);
# output[1] has one 768-vector per input row (64, 768).
output[0].shape, output[1].shape

(TensorShape([64, 128, 768]), TensorShape([64, 768]))

## Positional Embedding

In [2]:
import numpy as np

In [3]:
def GetPosEncodingMatrix(max_len, d_emb):
    """Build a sinusoidal position-encoding table of shape (max_len, d_emb).

    For every position ``pos >= 1`` and column ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_emb)``. Even columns hold its sine and
    odd columns its cosine. Row 0 is all zeros and is never passed through
    sin/cos.

    Args:
        max_len: Number of positions (rows). ``0`` yields an empty
            ``(0, d_emb)`` array instead of raising.
        d_emb: Width of the encoding (columns).

    Returns:
        A float64 ``numpy.ndarray`` of shape ``(max_len, d_emb)``.
    """
    positions = np.arange(max_len, dtype=np.float64)[:, np.newaxis]
    columns = np.arange(d_emb)
    # Columns 2i and 2i+1 share one wavelength, hence the floor division.
    divisors = np.power(10000, 2 * (columns // 2) / d_emb)
    # Broadcasting builds the whole table at once; row 0 is 0 / divisor == 0,
    # and the result is always 2-D, even when max_len is 0.
    pos_enc = positions / divisors
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc

In [7]:
# Table size: 512 positions, each encoded as a 128-wide vector.
max_len, d_emb = 512, 128
weights = GetPosEncodingMatrix(max_len=max_len, d_emb=d_emb)

In [9]:
# Display the table; row 0 is all zeros, as built by GetPosEncodingMatrix.
weights

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.41470985e-01,  5.40302306e-01,  7.61720408e-01, ...,
         9.99999991e-01,  1.15478198e-04,  9.99999993e-01],
       [ 9.09297427e-01, -4.16146837e-01,  9.87046251e-01, ...,
         9.99999964e-01,  2.30956395e-04,  9.99999973e-01],
       ...,
       [ 6.19504237e-02,  9.98079228e-01,  8.15081054e-01, ...,
         9.97697292e-01,  5.87445633e-02,  9.98273047e-01],
       [ 8.73326668e-01,  4.87135024e-01,  9.69396188e-01, ...,
         9.97688239e-01,  5.88598417e-02,  9.98266257e-01],
       [ 8.81770401e-01, -4.71678874e-01,  4.41073911e-01, ...,
         9.97679168e-01,  5.89751193e-02,  9.98259453e-01]])

In [10]:
import argparse

In [20]:
# Build a Namespace by hand (no argument parsing), with attributes `a` and `b`.
flag_values = {'a': 'Ankur', 'b': 'Ankur2'}
flag = argparse.Namespace(**flag_values)
del flag_values  # keep the notebook namespace the same as before