In [2]:
import tensorflow as tf
import numpy as np
import tensorflow_datasets as tfds
import tensorflow_text

In [3]:
def positional_encoding(length, depth):
    """Build the fixed sinusoidal positional-encoding table.

    The first half of the feature axis holds the sines and the second half
    holds the cosines (the two halves are concatenated, not interleaved).

    Args:
        length: Number of positions (rows) to generate.
        depth: Width of the encoding. Expected to be even; for an odd value
            the result is one column wider than ``depth``.

    Returns:
        A ``tf.float32`` tensor of shape ``(length, depth)``.
    """
    depth = depth / 2
    positions = np.arange(length)[:, np.newaxis]        # (length, 1)
    # Add an explicit leading axis so the broadcast against `positions` is
    # intentional rather than an accident of NumPy's broadcasting rules.
    depths = np.arange(depth)[np.newaxis, :] / depth     # (1, depth / 2)
    angle_rads = positions / (10000 ** depths)           # (length, depth / 2)
    pos_enc = np.concatenate([np.sin(angle_rads), np.cos(angle_rads)], axis = -1)
    return tf.cast(pos_enc, dtype = tf.float32)

In [4]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding scaled by sqrt(d_model) plus a fixed positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Width of the embedding vectors.
        max_length: Number of positions for which encodings are precomputed.
            Inputs must not be longer than this.
    """
    def __init__(self, vocab_size, d_model, max_length = 2048):
        super().__init__()
        self.d_model = d_model
        # mask_zero=True treats token id 0 as padding, so compute_mask() below
        # returns a real mask instead of None. This matches masked_loss /
        # masked_accuracy, which also ignore label 0.
        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero = True)
        self.pos_enc = positional_encoding(max_length, d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, X):
        """Embed token ids ``X`` and add the encoding for each position.

        Axis 1 of ``X`` is used as the sequence length.
        """
        length = tf.shape(X)[1]
        X = self.embedding(X)
        # Scale the embeddings by sqrt(d_model) before adding the encoding.
        X *= tf.math.sqrt(tf.cast(self.d_model, dtype = tf.float32))
        return X + self.pos_enc[tf.newaxis, :length, :]

In [5]:
class BaseAttention(tf.keras.layers.Layer):
    """Common sub-layers shared by the attention blocks.

    Holds a multi-head attention layer together with the ``Add`` and
    ``LayerNormalization`` layers. This class defines no ``call``; each
    subclass provides its own.

    Args:
        **kwargs: Forwarded unchanged to ``tf.keras.layers.MultiHeadAttention``.
    """
    def __init__(self, **kwargs):
        super(BaseAttention, self).__init__()
        # Attention itself; every keyword argument goes straight to Keras.
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        # Pieces used by the subclasses for the residual add and normalisation.
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [6]:
class GlobalSelfAttention(BaseAttention):
    """Self-attention without a causal mask, then residual add and layer norm."""
    def call(self, X):
        """Attend ``X`` to itself and return the normalised residual sum."""
        attended = self.mha(query = X, key = X, value = X)
        return self.layer_norm(self.add([X, attended]))

In [7]:
class CrossAttention(BaseAttention):
    """Attention from ``X`` (queries) over ``Context`` (keys and values)."""
    def call(self, X, Context):
        """Attend ``X`` to ``Context`` and return the normalised residual sum."""
        attended = self.mha(query = X, value = Context, key = Context)
        return self.layer_norm(self.add([X, attended]))

In [8]:
class CasualSelfAttention(BaseAttention):
    """Self-attention with ``use_causal_mask = True``, then residual add and layer norm.

    The class name is spelt "Casual" and is referenced under that name by
    ``DecoderLayer``, so it is kept as is.
    """
    def call(self, X):
        """Attend ``X`` to itself under a causal mask and return the normalised residual sum."""
        attended = self.mha(query = X, value = X, key = X, use_causal_mask = True)
        return self.layer_norm(self.add([X, attended]))

In [9]:
class FeedForward(tf.keras.layers.Layer):
    """Two dense layers with dropout, wrapped in a residual add and layer norm.

    Args:
        units: Width of the hidden (ReLU) dense layer.
        d_model: Width of the output dense layer.
        dropout_rate: Rate of the dropout applied after the second dense layer.
    """
    def __init__(self, units, d_model, dropout_rate = 0.1):
        super().__init__()
        self.d_model = d_model
        hidden = tf.keras.layers.Dense(units, activation = 'relu')
        projection = tf.keras.layers.Dense(d_model)
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, projection, drop])
        self.layer_norm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

    def call(self, X):
        """Return ``layer_norm(X + seq(X))``."""
        transformed = self.seq(X)
        return self.layer_norm(self.add([X, transformed]))

In [10]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: global self-attention followed by a feed-forward block.

    Args:
        heads: Number of attention heads.
        units: Hidden width of the feed-forward block.
        d_model: Model width; also used as the attention ``key_dim``.
        dropout_rate: Dropout rate used by both the attention layer and the
            feed-forward block.
    """
    def __init__(self, heads, units, d_model, dropout_rate):
        super().__init__()
        self.gsa = GlobalSelfAttention(num_heads = heads, key_dim = d_model, dropout = dropout_rate)
        # Pass the configured rate through instead of a hard-coded 0.1 so the
        # whole layer honours the caller's dropout setting.
        self.feed_forward = FeedForward(units, d_model, dropout_rate = dropout_rate)

    def call(self, X):
        """Apply self-attention, then the feed-forward block."""
        out = self.gsa(X)
        return self.feed_forward(out)

In [11]:
class Encoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of ``EncoderLayer`` blocks.

    Args:
        num_layers: Number of encoder layers in the stack.
        vocab_size: Vocabulary size for the input embedding.
        heads: Number of attention heads per layer.
        units: Hidden width of each feed-forward block.
        d_model: Model width.
        dropout_rate: Dropout rate for the embedding dropout and the layers.
    """
    def __init__(self, num_layers, vocab_size, heads, units, d_model, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.pos_emb = PositionalEmbedding(vocab_size, d_model)
        self.enc_seq = [
            EncoderLayer(heads, units, d_model, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, X):
        """Embed ``X``, apply dropout and run it through every encoder layer in order."""
        hidden = self.dropout(self.pos_emb(X))
        for layer in self.enc_seq:
            hidden = layer(hidden)
        return hidden

In [12]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Args:
        heads: Number of attention heads.
        units: Hidden width of the feed-forward block.
        d_model: Model width; also used as the attention ``key_dim``.
        dropout_rate: Dropout rate used by both attention layers and the
            feed-forward block.
    """
    def __init__(self, heads, units, d_model, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.csa = CasualSelfAttention(num_heads = heads, key_dim = d_model, dropout = dropout_rate)
        self.ca = CrossAttention(num_heads = heads, key_dim = d_model, dropout = dropout_rate)
        # Pass the configured rate through instead of a hard-coded 0.1 so the
        # whole layer honours the caller's dropout setting.
        self.feed_forward = FeedForward(units, d_model, dropout_rate = dropout_rate)

    def call(self, X, Context):
        """Run ``X`` through causal self-attention, attend to ``Context``, then feed-forward."""
        out_1 = self.csa(X)
        out_2 = self.ca(out_1, Context)
        return self.feed_forward(out_2)

In [13]:
class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of ``DecoderLayer`` blocks.

    Args:
        num_layers: Number of decoder layers in the stack.
        vocab_size: Vocabulary size for the input embedding.
        heads: Number of attention heads per layer.
        units: Hidden width of each feed-forward block.
        d_model: Model width.
        dropout_rate: Dropout rate for the embedding dropout and the layers.
    """
    def __init__(self, num_layers, vocab_size, heads, units, d_model, dropout_rate):
        super().__init__()
        self.num_layers = num_layers
        self.pos_emb = PositionalEmbedding(vocab_size, d_model)
        self.dec_seq = [
            DecoderLayer(heads, units, d_model, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, X, Context):
        """Embed ``X``, apply dropout and run every decoder layer against ``Context``."""
        hidden = self.dropout(self.pos_emb(X))
        for layer in self.dec_seq:
            hidden = layer(hidden, Context)
        return hidden

In [14]:
class Transformer(tf.keras.Model):
    """Encoder-decoder Transformer that outputs logits over the target vocabulary.

    Args:
        num_layers: Number of layers in both the encoder and the decoder.
        vocab_size: Vocabulary size for the encoder (source) embedding.
        target_vocab_size: Vocabulary size for the decoder (target) embedding
            and for the final projection.
        heads: Number of attention heads.
        units: Hidden width of the feed-forward blocks.
        d_model: Model width.
        dropout_rate: Dropout rate passed to the encoder and decoder.
    """
    def __init__(self, num_layers, vocab_size, target_vocab_size, heads, units, d_model, dropout_rate):
        super().__init__()
        self.enc = Encoder(num_layers, vocab_size, heads, units, d_model, dropout_rate)
        # The decoder embeds target tokens, so it must be sized by the target
        # vocabulary, not the source vocabulary.
        self.dec = Decoder(num_layers, target_vocab_size, heads, units, d_model, dropout_rate)
        self.final = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        """Compute logits for a ``(Context, X)`` pair.

        ``Context`` is fed to the encoder and ``X`` to the decoder.
        """
        Context, X = inputs
        out_1 = self.enc(Context)
        out_2 = self.dec(X, out_1)
        out_3 = self.final(out_2)
        # Best-effort removal of any Keras mask attached to the logits;
        # masked_loss / masked_accuracy apply their own padding mask.
        try:
            del out_3._keras_mask
        except AttributeError:
            # No mask was attached, so there is nothing to remove.
            pass
        return out_3

In [15]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up then inverse-square-root learning-rate schedule.

    The rate at ``step`` is
    ``d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)``.

    Args:
        d_model: Model width used to scale the rate.
        warmup_steps: Number of steps over which the rate grows linearly.
    """
    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        # Keep the caller's value for get_config(); self.d_model is a tensor.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step ``step``."""
        step = tf.cast(step, dtype=tf.float32)
        arg1 = tf.math.rsqrt(step)                    # decay branch
        arg2 = step * (self.warmup_steps ** -1.5)     # linear warm-up branch
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialized."""
        return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}

In [None]:
# Learning rate follows the schedule implemented by CustomSchedule.
learning_rate = CustomSchedule(D_MODEL)

# Adam with explicit beta/epsilon values, driven by the schedule above.
optimizer = tf.keras.optimizers.Adam(
    learning_rate,
    beta_1 = 0.9,
    beta_2 = 0.98,
    epsilon = 1e-9,
)

In [17]:
def masked_loss(label, pred):
    """Sparse categorical cross-entropy averaged over non-zero labels.

    Args:
        label: Integer class ids; positions equal to 0 are ignored.
        pred: Logits, one vector per label position.

    Returns:
        Sum of the per-position losses at non-zero labels divided by the
        number of non-zero labels.
    """
    criterion = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction='none')
    per_position = criterion(label, pred)
    # 1.0 where the label is a real token, 0.0 where it is 0.
    keep = tf.cast(label != 0, dtype=per_position.dtype)
    return tf.reduce_sum(per_position * keep) / tf.reduce_sum(keep)
def masked_accuracy(label, pred):
    """Fraction of non-zero label positions whose arg-max prediction is correct.

    Args:
        label: Integer class ids; positions equal to 0 are ignored.
        pred: Scores whose axis 2 is reduced with arg-max.

    Returns:
        Number of correct predictions at non-zero labels divided by the
        number of non-zero labels.
    """
    predicted_ids = tf.argmax(pred, axis=2)
    label = tf.cast(label, predicted_ids.dtype)
    is_token = label != 0
    hits = (label == predicted_ids) & is_token
    n_hits = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
    n_tokens = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
    return n_hits / n_tokens

In [None]:
# NOTE: this rebinds the name `Transformer` from the class to an instance, so
# the class-definition cell has to be re-run before this cell can run again.
Transformer = Transformer(
    num_layers = NUM_LAYERS,
    vocab_size = VOCAB_SIZE,
    target_vocab_size = VOCAB_SIZE,
    heads = NUM_HEADS,
    units = UNITS,
    d_model = D_MODEL,
    dropout_rate = DROPOUT,
)

In [None]:
# Train with the padding-aware loss and accuracy defined in this notebook.
Transformer.compile(
    optimizer = optimizer,
    loss = masked_loss,
    metrics = [masked_accuracy],
)

In [None]:
# NOTE(review): placeholder cell — the training dataset has not been filled in.
# As written there is no first argument before the comma, so this cell is not
# valid Python until a dataset is passed as the first argument to fit().
Transformer.fit(# dataset
    , epochs = 20)

In [18]:
class Translator(tf.Module):
    """Greedy decoder that generates an output sentence with the transformer.

    Args:
        tokenizer: Object providing ``encode(text)`` and ``decode(ids)``.
        transformer: Callable taking ``[source_ids, target_ids]`` and
            returning logits; called with ``training = False``.
    """
    def __init__(self, tokenizer, transformer):
        self.tokenizer = tokenizer
        self.transformer = transformer

    def __call__(self, sentence, START_TOKEN, END_TOKEN, MAX_LENGTH):
        """Generate a reply for ``sentence`` one token at a time.

        Args:
            sentence: Raw input text; it is passed through
                ``preprocess_sentence`` (not defined in the visible cells).
            START_TOKEN: Start marker, concatenated in front of the encoded
                sentence — assumes a one-element list of token ids, TODO confirm.
            END_TOKEN: End marker, concatenated after the encoded sentence —
                assumes a one-element list of token ids, TODO confirm.
            MAX_LENGTH: Maximum number of tokens to generate.

        Returns:
            The decoded output text without the start token and, if one was
            generated, without the end token.
        """
        sentence = preprocess_sentence(sentence) #Preprocess the input sentence
        sentence = START_TOKEN + self.tokenizer.encode(sentence) + END_TOKEN #Add start and end tokens
        sentence = tf.convert_to_tensor(sentence, dtype = tf.int32)
        sentence = sentence[np.newaxis, :]  # add a leading batch axis

        output_array = tf.TensorArray(dtype = tf.int64, size = 0, dynamic_size = True)
        output_array = output_array.write(0, START_TOKEN)
        for i in tf.range(MAX_LENGTH):
            output = tf.transpose(output_array.stack())
            prediction = self.transformer([sentence, output], training = False)
            # Only the logits for the last position are needed.
            pred_index = prediction[:, -1:, :]
            pred_id = tf.argmax(pred_index, axis = -1)
            # write() returns the updated TensorArray; its result must be kept.
            output_array = output_array.write(i + 1, pred_id[0])
            if pred_id == END_TOKEN:
                break

        token_ids = tf.transpose(output_array.stack()).numpy().reshape(-1)
        output_array.close()

        token_ids = token_ids[1:]  # drop the start token
        # Drop the trailing end token only if decoding actually produced one;
        # when MAX_LENGTH is reached first, the last token is real output.
        end_id = np.asarray(END_TOKEN).reshape(-1)[0]
        if token_ids.size and token_ids[-1] == end_id:
            token_ids = token_ids[:-1]
        # Use the tokenizer held by this instance, not a notebook global.
        return self.tokenizer.decode(token_ids) #Detokenize the output

In [None]:
# NOTE: this rebinds the name `Translator` from the class to an instance, so
# the class-definition cell has to be re-run before this cell can run again.
Translator = Translator(
    tokenizer = tokenizer,
    transformer = Transformer,
)

In [None]:
# Run the translator on a sample sentence and show the generated text.
output = Translator(
    'I am fine. What about you?',
    START_TOKEN,
    END_TOKEN,
    MAX_LENGTH,
)
print(output)