In [2]:
import tensorflow as tf
import numpy as np

In [6]:
np.random.rand(1000,100).shape

(1000, 100)

In [16]:
float_type = tf.float32

embedding_table = tf.convert_to_tensor(np.random.rand(1000,100), dtype=float_type)
num_next_sentence_label = 2
initializer = tf.keras.initializers.TruncatedNormal(
      stddev=0.02)

In [8]:
lm_dense = tf.keras.layers.Dense(
        100,
        activation='sigmoid',
        kernel_initializer=initializer)
lm_layer_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=1e-12)

In [9]:
mask_lm_input_tensor = np.random.rand(128,100)

In [10]:
lm_output = lm_dense(mask_lm_input_tensor)
lm_output = lm_layer_norm(lm_output)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [12]:
lm_output.shape

TensorShape([128, 100])

In [17]:
lm_output = tf.matmul(lm_output, embedding_table, transpose_b=True)
lm_output.shape

TensorShape([128, 1000])

In [18]:
lm_output = tf.nn.log_softmax(lm_output, axis=-1)
lm_output.shape

TensorShape([128, 1000])

## Transformer Network

In [1]:
import tensorflow as tf
import numpy as np

In [3]:
tf.keras.layers.Dense(10, name='dense')

<tensorflow.python.keras.layers.core.Dense at 0x144294250>

In [4]:
class MultiHeadSelfAttention(tf.keras.layers.Layer):
    def __init__(self, config):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = config.embed_dim
        self.num_heads = config.num_heads
        if self.embed_dim % self.num_heads != 0:
            raise ValueError(
                f"embedding dimension = {self.embed_dim} should be divisible by number of heads = {self.num_heads}"
            )
        self.projection_dim = self.embed_dim // self.num_heads
        self.query_dense = tf.keras.layers.Dense(self.embed_dim, name='query')
        self.key_dense = tf.keras.layers.Dense(self.embed_dim, name='key')
        self.value_dense = tf.keras.layers.Dense(self.embed_dim, name='value')
        self.combine_heads = tf.keras.layers.Dense(self.embed_dim, name='combine')

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def __call__(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, config):#embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(config)
        self.ffn = tf.keras.Sequential(
            [tf.keras.layers.Dense(config.ff_dim, activation="relu"), tf.keras.layers.Dense(config.embed_dim),]
        )
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = tf.keras.layers.Dropout(config.rate)
        self.dropout2 = tf.keras.layers.Dropout(config.rate)

    def __call__(self, inputs):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(tf.keras.layers.Layer):
    def __init__(self, config, embedding_matrix, is_embedding_trainable, is_position_embedding_trainable):#maxlen, vocab_size, emded_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = tf.keras.layers.Embedding(input_dim=config.vocab_size, output_dim=config.embed_dim, trainable=is_embedding_trainable, weights=[embedding_matrix], name='word_embedding')
        self.pos_emb = tf.keras.layers.Embedding(input_dim=config.maxlen, output_dim=config.embed_dim, trainable=is_position_embedding_trainable, weights=[get_pos_encoding_matrix(config.maxlen, config.embed_dim)], name='position_embedding')

    def __call__(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

def get_pos_encoding_matrix(max_len, d_emb):
	pos_enc = np.array([
		[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)] 
		if pos != 0 else np.zeros(d_emb) 
			for pos in range(max_len)
			])
	pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) # dim 2i
	pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) # dim 2i+1
	return pos_enc

In [5]:
class VanillaTransformer():
    def __init__(self, config, embedding_matrix=None, is_embedding_trainable=False, is_position_embedding_trainable=False):
        if embedding_matrix is None:
            embedding_matrix = np.zeros((config.vocab_size, config.embed_dim))

        self.config = config
        self.embedding_matrix = embedding_matrix
        self.is_embedding_trainable = is_embedding_trainable
        self.is_position_embedding_trainable = is_position_embedding_trainable

        self.pooler_transform = tf.keras.layers.Dense(
                                units=self.config.embed_dim,
                                activation="tanh",
                                name="pooler_transform")

    def __call__(self, pre_layer):
        embedding_layer = TokenAndPositionEmbedding(self.config, self.embedding_matrix, self.is_embedding_trainable, self.is_position_embedding_trainable)(pre_layer)
        sequence_output = TransformerBlock(self.config)(embedding_layer)
        first_token_tensor = tf.squeeze(sequence_output[:, 0:1, :], axis=1)
        pooled_output = self.pooler_transform(first_token_tensor)
        return (pooled_output,sequence_output)

In [6]:
import argparse
config = argparse.Namespace(vocab_size=1000,
                            embed_dim=512,
                            ff_dim=32,
                            num_heads=8,
                            rate=0.1,
                            maxlen=128)

inputs = tf.keras.layers.Input(shape=(config.maxlen,))
pooled_output,sequence_output = VanillaTransformer(config)(inputs)
output = tf.keras.layers.Dense(3, activation='softmax')(pooled_output)
full_model = tf.keras.models.Model(inputs=inputs, outputs=output)
print(full_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_op_layer_Shape (TensorFlowOp [(2,)]               0           input_1[0][0]                    
__________________________________________________________________________________________________
tf_op_layer_strided_slice (Tens [()]                 0           tf_op_layer_Shape[0][0]          
__________________________________________________________________________________________________
tf_op_layer_range (TensorFlowOp [(None,)]            0           tf_op_layer_strided_slice[0][0]  
______________________________________________________________________________________________

In [7]:
full_model.save('transformer.h5')

ValueError: Unable to create group (name already exists)