In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

## attention

In [None]:
def attention(self, query, key, value):
    score = tf.matmul(query, key, transpose_b=True)
    dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
    scaled_score = score / tf.math.sqrt(dim_key)
    weights = tf.nn.softmax(scaled_score, axis=-1)
    output = tf.matmul(weights, value)
    return output, weights

In [24]:
query = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
key = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
value = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)

In [25]:
query.shape, key.shape, value.shape

(TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]))

In [26]:
score = tf.matmul(query, key, transpose_b=True)
score.shape

TensorShape([8, 64, 64])

In [27]:
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
dim_key.numpy()

128.0

In [28]:
scaled_score = score / tf.math.sqrt(dim_key)
scaled_score.shape

TensorShape([8, 64, 64])

In [29]:
weights = tf.nn.softmax(scaled_score, axis=-1)
weights.shape

TensorShape([8, 64, 64])

In [30]:
output = tf.matmul(weights, value)
output.shape

TensorShape([8, 64, 128])

In [32]:
output.shape, weights.shape

(TensorShape([8, 64, 128]), TensorShape([8, 64, 64]))

## separate_heads

In [None]:
def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

In [46]:
x = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
batch_size = 8
num_heads = 8
projection_dim = 16

In [47]:
x.shape

TensorShape([8, 64, 128])

In [48]:
x = tf.reshape(x, (batch_size, -1, num_heads, projection_dim))
x.shape

TensorShape([8, 64, 8, 16])

In [50]:
tf.transpose(x, perm=[0, 2, 1, 3]).shape

TensorShape([8, 8, 64, 16])

## MultiHeadSelfAttention

In [51]:
def separate_heads(x, batch_size):
        x = tf.reshape(x, (batch_size, -1, num_heads, projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

In [54]:
def attention(query, key, value):
    score = tf.matmul(query, key, transpose_b=True)
    dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
    scaled_score = score / tf.math.sqrt(dim_key)
    weights = tf.nn.softmax(scaled_score, axis=-1)
    output = tf.matmul(weights, value)
    return output, weights

In [34]:
embed_dim = 128
num_heads = 8

In [37]:
projection_dim = embed_dim // num_heads
projection_dim

16

In [38]:
query_dense = layers.Dense(embed_dim)
key_dense = layers.Dense(embed_dim)
value_dense = layers.Dense(embed_dim)
combine_heads = layers.Dense(embed_dim)

In [41]:
inputs = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)

In [42]:
batch_size = tf.shape(inputs)[0]
query = query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = key_dense(inputs)  # (batch_size, seq_len, embed_dim)
value = value_dense(inputs) # (batch_size, seq_len, embed_dim)

In [44]:
query.shape, key.shape, value.shape

(TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]),
 TensorShape([8, 64, 128]))

In [52]:
query = separate_heads(query, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
key = separate_heads(key, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
value = separate_heads(value, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)

In [53]:
query.shape, key.shape, value.shape

(TensorShape([8, 8, 64, 16]),
 TensorShape([8, 8, 64, 16]),
 TensorShape([8, 8, 64, 16]))

In [55]:
attention, weights = attention(query, key, value)

In [56]:
attention.shape, weights.shape

(TensorShape([8, 8, 64, 16]), TensorShape([8, 8, 64, 64]))

In [57]:
attention = tf.transpose(attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, projection_dim)
attention.shape

TensorShape([8, 64, 8, 16])

In [58]:
concat_attention = tf.reshape(attention, (batch_size, -1, embed_dim))  # (batch_size, seq_len, embed_dim)
concat_attention.shape

TensorShape([8, 64, 128])

In [59]:
output = combine_heads(concat_attention)  # (batch_size, seq_len, embed_dim)
output.shape

TensorShape([8, 64, 128])

## TransformerBlock

In [61]:
ff_dim = 32

In [62]:
ffn = keras.Sequential([layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),])

In [64]:
rate=0.1
layernorm1 = layers.LayerNormalization(epsilon=1e-6)
layernorm2 = layers.LayerNormalization(epsilon=1e-6)
dropout1 = layers.Dropout(rate)
dropout2 = layers.Dropout(rate)

In [67]:
attn_output = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
inputs = tf.convert_to_tensor(np.random.rand(8,64,128), dtype=tf.float32)
attn_output.shape, inputs.shape

(TensorShape([8, 64, 128]), TensorShape([8, 64, 128]))

In [68]:
attn_output = dropout1(attn_output)

In [71]:
out1 = layernorm1(inputs + attn_output)
out1.shape

TensorShape([8, 64, 128])

In [72]:
ffn_output = ffn(out1)
ffn_output.shape

TensorShape([8, 64, 128])

In [73]:
ffn_output = dropout2(ffn_output)

In [74]:
output_final = layernorm2(out1 + ffn_output)
output_final.shape

TensorShape([8, 64, 128])

## Main Function Development

In [2]:
import tensorflow as tf
import sys

In [3]:
sys.path.insert(0,'./model_X')

In [4]:
from transformers_utils import *

In [8]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
maxlen = 128
vocab_size = 1000
inputs = tf.keras.layers.Input(shape=(maxlen,))

In [9]:
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)(inputs)

In [12]:
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)(embedding_layer, True)

In [14]:
transformer_block

<tf.Tensor 'layer_normalization_3/Identity:0' shape=(None, 128, 32) dtype=float32>

In [16]:
x = tf.keras.layers.GlobalAveragePooling1D()(transformer_block)
x

<tf.Tensor 'global_average_pooling1d/Identity:0' shape=(None, 32) dtype=float32>

In [17]:
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.Dense(20, activation="relu")(x)
x = tf.keras.layers.Dropout(0.1)(x)
outputs = tf.keras.layers.Dense(2, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

In [18]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_op_layer_Shape (TensorFlowOp [(2,)]               0           input_2[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [()]                 0           tf_op_layer_Shape[0][0]          
__________________________________________________________________________________________________
tf_op_layer_range (TensorFlowOp [(None,)]            0           tf_op_layer_strided_slice[0][0]  
______________________________________________________________________________________________

In [21]:
import numpy as np

In [26]:
inputs = tf.convert_to_tensor(np.random.rand(8,128), dtype=tf.float32)

In [27]:
model.predict(inputs, batch_size=8)

array([[0.5419823 , 0.45801768],
       [0.5396268 , 0.46037325],
       [0.556035  , 0.44396505],
       [0.55087   , 0.44912994],
       [0.550646  , 0.44935405],
       [0.55667895, 0.44332105],
       [0.5523735 , 0.4476265 ],
       [0.5457177 , 0.4542823 ]], dtype=float32)

In [40]:
from transformers import *

In [44]:
config = AlbertConfig.from_json_file('/Users/ankur.kumar/Downloads/base_2/assets/albert_config.json')

In [45]:
model = modeling_tf_albert.TFAlbertModel(config)

In [50]:
inputs = tf.convert_to_tensor(np.random.rand(64,128), dtype=tf.int32)

In [51]:
output = model(inputs)

In [55]:
output[0].shape, output[1].shape

(TensorShape([64, 128, 768]), TensorShape([64, 768]))

## Positional Embedding

In [2]:
import numpy as np

In [3]:
def GetPosEncodingMatrix(max_len, d_emb):
	pos_enc = np.array([
		[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)] 
		if pos != 0 else np.zeros(d_emb) 
			for pos in range(max_len)
			])
	pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) # dim 2i
	pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) # dim 2i+1
	return pos_enc

In [7]:
max_len = 512
d_emb = 128
weights=GetPosEncodingMatrix(max_len, d_emb)

In [9]:
weights

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.41470985e-01,  5.40302306e-01,  7.61720408e-01, ...,
         9.99999991e-01,  1.15478198e-04,  9.99999993e-01],
       [ 9.09297427e-01, -4.16146837e-01,  9.87046251e-01, ...,
         9.99999964e-01,  2.30956395e-04,  9.99999973e-01],
       ...,
       [ 6.19504237e-02,  9.98079228e-01,  8.15081054e-01, ...,
         9.97697292e-01,  5.87445633e-02,  9.98273047e-01],
       [ 8.73326668e-01,  4.87135024e-01,  9.69396188e-01, ...,
         9.97688239e-01,  5.88598417e-02,  9.98266257e-01],
       [ 8.81770401e-01, -4.71678874e-01,  4.41073911e-01, ...,
         9.97679168e-01,  5.89751193e-02,  9.98259453e-01]])

In [10]:
import argparse

In [20]:
flag = argparse.Namespace(a='Ankur',
                         b='Ankur2')

## Per-training

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

In [2]:
tf.__version__

'2.2.0'

In [3]:
import sys
sys.path.insert(0,'model_X/')

In [4]:
ls

Local Development.ipynb  [1m[36mdist[m[m/                    setup.sh
Local Testing.ipynb      [1m[36mmodel_X[m[m/                 test_modelX.py
README.md                [1m[36mmodel_X.egg-info[m[m/        [1m[36mtmp[m[m/
[1m[36mbuild[m[m/                   setup.py


In [5]:
from pretraining_layers import *

In [6]:
from transformers_architectures import *
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model
import argparse

In [7]:
config = argparse.Namespace(vocab_size=1000,
                            embed_dim=512,
                            ff_dim=32,
                            num_heads=8,
                            rate=0.1,
                            maxlen=128,
                           initializer_range=0.2,
                           embedding_size=512,
                           max_predictions_per_seq=20,
                           hidden_act='relu')

In [8]:
inputs = tf.keras.layers.Input(shape=(config.maxlen,), dtype=tf.int32)
pooled_output,sequence_output = VanillaTransformer(config)(inputs)

In [9]:
masked_lm_positions = tf.keras.layers.Input(
      shape=(config.max_predictions_per_seq,),
      name='masked_lm_positions',
      dtype=tf.int32)
masked_lm_weights = tf.keras.layers.Input(
      shape=(config.max_predictions_per_seq,),
      name='masked_lm_weights',
      dtype=tf.int32)
masked_lm_ids = tf.keras.layers.Input(
      shape=(config.max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)

In [10]:
pooled_output, sequence_output

(<tf.Tensor 'pooler_transform/Identity:0' shape=(None, 512) dtype=float32>,
 <tf.Tensor 'layer_normalization_1/Identity:0' shape=(None, 128, 512) dtype=float32>)

In [11]:
transformer_submodel = tf.keras.Model(
      inputs=inputs,
      outputs=[pooled_output, sequence_output])

In [12]:
mask_lm_input_tensor = gather_indexes(sequence_output, masked_lm_positions)
mask_lm_input_tensor

<tf.Tensor 'GatherV2:0' shape=(None, 512) dtype=float32>

In [13]:
lm_output = tf.keras.layers.Dense(config.embedding_size)(mask_lm_input_tensor)
lm_output

<tf.Tensor 'dense_6/Identity:0' shape=(None, 512) dtype=float32>

In [14]:
embedding_table = transformer_submodel.get_layer('embedding').embeddings
lm_output = tf.matmul(lm_output, embedding_table, transpose_b=True)
lm_output

<tf.Tensor 'MatMul:0' shape=(None, 1000) dtype=float32>

In [15]:
lm_output = tf.nn.log_softmax(lm_output, axis=-1)
lm_output

<tf.Tensor 'LogSoftmax:0' shape=(None, 1000) dtype=float32>

In [16]:
pretraining_model = tf.keras.Model([inputs, masked_lm_positions], lm_output)

In [17]:
pretraining_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_op_layer_Shape (TensorFlowOp [(2,)]               0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [()]                 0           tf_op_layer_Shape[0][0]          
__________________________________________________________________________________________________
tf_op_layer_Range (TensorFlowOp [(None,)]            0           tf_op_layer_strided_slice[0][0]  
____________________________________________________________________________________________

### Method 1:

In [18]:
def loss_fn(lm_label_ids, lm_output):
    lm_label_ids = tf.keras.backend.reshape(lm_label_ids, [-1])
    lm_label_ids = tf.keras.backend.cast(lm_label_ids, dtype=tf.int32)
    lm_label_ids_one_hot = tf.keras.backend.one_hot(lm_label_ids,1000)
    lm_per_example_loss = -tf.keras.backend.sum(lm_output * lm_label_ids_one_hot, axis=[-1])
    lm_example_loss = tf.reshape(lm_per_example_loss, [-1])
    return tf.reduce_mean(lm_example_loss)

In [19]:
pretraining_model.compile(optimizer='adam',loss=loss_fn)

In [35]:
X = {
  'input_1': tf.convert_to_tensor(np.random.randint(1,100,(64,128)), tf.int32),
  'masked_lm_positions': tf.convert_to_tensor(np.random.randint(1,100,(64,20)),tf.int32)}
y = {
  'tf_op_layer_LogSoftmax': tf.convert_to_tensor(np.random.randint(1,1000,(64,20)),tf.int32),
  'masked_lm_weights': tf.convert_to_tensor(np.random.randint(1,1000,(64,20)),tf.int32)}

In [40]:
pretraining_model.fit(X,y, batch_size=8, epochs=3, use_multiprocessing=True)

Train on 64 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x14839f1d0>

### Method 2:

In [36]:
def loss_fn(lm_label_ids, lm_label_weights, lm_output):
    lm_label_ids = tf.keras.backend.reshape(lm_label_ids, [-1])
    lm_label_ids = tf.keras.backend.cast(lm_label_ids, dtype=tf.int32)
    lm_label_ids_one_hot = tf.keras.backend.one_hot(lm_label_ids,1000)
    lm_per_example_loss = -tf.keras.backend.sum(lm_output * lm_label_ids_one_hot, axis=[-1])
    lm_example_loss = tf.reshape(lm_per_example_loss, [-1])
    
    lm_label_weights = tf.keras.backend.cast(lm_label_weights, tf.float32)
    lm_label_weights = tf.keras.backend.reshape(lm_label_weights, [-1])
    
    return tf.reduce_mean(lm_example_loss * lm_label_weights)

In [37]:
train_dataset = tf.data.Dataset.from_tensor_slices(((X['input_1'],X['masked_lm_positions']), (y['tf_op_layer_LogSoftmax'],y['masked_lm_weights'])))
train_dataset = train_dataset.shuffle(buffer_size=10).batch(8)

In [38]:
optimizer = tf.keras.optimizers.Adam()

In [40]:
epochs = 2
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    # Iterate over the batches of the dataset.
    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):

        # Open a GradientTape to record the operations run
        # during the forward pass, which enables autodifferentiation.
        with tf.GradientTape() as tape:

            # Run the forward pass of the layer.
            # The operations that the layer applies
            # to its inputs are going to be recorded
            # on the GradientTape.
            logits = pretraining_model(x_batch_train, training=True)  # Logits for this minibatch

            # Compute the loss value for this minibatch.
            loss_value = loss_fn(y_batch_train[0],y_batch_train[1], logits)

        # Use the gradient tape to automatically retrieve
        # the gradients of the trainable variables with respect to the loss.
        print(loss_value)
        grads = tape.gradient(loss_value, pretraining_model.trainable_weights)

        # Run one step of gradient descent by updating
        # the value of the variables to minimize the loss.
        optimizer.apply_gradients(zip(grads, pretraining_model.trainable_weights))

        # Log every 200 batches.
        if step % 2 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %s samples" % ((step + 1) * 8))


Start of epoch 0
tf.Tensor(3524.2505, shape=(), dtype=float32)
Training loss (for one batch) at step 0: 3524.2505
Seen so far: 8 samples
tf.Tensor(3410.2292, shape=(), dtype=float32)
tf.Tensor(3547.8665, shape=(), dtype=float32)
Training loss (for one batch) at step 2: 3547.8665
Seen so far: 24 samples
tf.Tensor(3537.9368, shape=(), dtype=float32)
tf.Tensor(3668.4062, shape=(), dtype=float32)
Training loss (for one batch) at step 4: 3668.4062
Seen so far: 40 samples
tf.Tensor(3395.939, shape=(), dtype=float32)
tf.Tensor(3257.4817, shape=(), dtype=float32)
Training loss (for one batch) at step 6: 3257.4817
Seen so far: 56 samples
tf.Tensor(3323.4507, shape=(), dtype=float32)

Start of epoch 1
tf.Tensor(3603.69, shape=(), dtype=float32)
Training loss (for one batch) at step 0: 3603.6899
Seen so far: 8 samples
tf.Tensor(3347.3254, shape=(), dtype=float32)
tf.Tensor(3446.4949, shape=(), dtype=float32)
Training loss (for one batch) at step 2: 3446.4949
Seen so far: 24 samples
tf.Tensor(369