In [2]:
!pip install -q dlai-grader==1.20.0 tensorflow-text==2.17.0 tensorflow==2.17.0 numpy==1.26.4

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m601.3/601.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.2/154.2 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Setting this env variable prevents TF warnings from showing up

import numpy as np
import tensorflow as tf
from collections import Counter

from utils import (train_data, val_data, portuguese_vectorizer,
                         english_vectorizer, masked_loss, masked_acc, tokens_to_text)

# Data Preparation

In [5]:
# Inverted StringLookup (invert = True): maps Portuguese token ids back to the
# vocabulary strings of `portuguese_vectorizer`. The empty string is the mask
# token and "[UNK]" the out-of-vocabulary token.
id_to_word = tf.keras.layers.StringLookup(
    vocabulary = portuguese_vectorizer.get_vocabulary(),
    mask_token = "",
    oov_token = "[UNK]",
    invert = True)

In [6]:
# Take one batch from the training set and print its first example.
# Each element is ((context, right-shifted target), target). The loop variables
# remain defined after the loop and are reused by later cells as sample inputs.
for (trans_sample_cont, trans_sample_pre), trans_sample_post in train_data.take(1):

    print(f"Tokenized english sentence:\n{trans_sample_cont[0, :].numpy()}\n\n")

    print(f"Tokenized portuguese sentence (shifted to the right):\n{trans_sample_pre[0, :].numpy()}\n\n")

    print(f"Tokenized portuguese sentence:\n{trans_sample_post[0, :].numpy()}\n\n")

Tokenized english sentence:
[  2  13 300  59 130   8   7   9 952   4   3   0   0]


Tokenized portuguese sentence (shifted to the right):
[   2  237  243   47   57  299   35 1024    4    0    0    0]


Tokenized portuguese sentence:
[ 237  243   47   57  299   35 1024    4    3    0    0    0]




# NMT model with attention

In [7]:
class Encoder(tf.keras.layers.Layer):
    """Embeds the tokenized context and runs it through a bidirectional LSTM.

    The forward and backward sequences are merged with ``merge_mode = 'sum'``,
    so the last dimension of the output stays equal to ``units``.
    """

    def __init__(self, vocab_size: int, units: int):
        """Create the sublayers.

        Args:
            vocab_size: Input dimension of the embedding layer.
            units: Embedding dimension and LSTM hidden size.
        """

        super().__init__()

        # mask_zero = True: token id 0 is treated as padding by the embedding.
        self.embedding = tf.keras.layers.Embedding(vocab_size, units, mask_zero = True)
        # return_sequences = True: one output vector per input position.
        self.lstm = tf.keras.layers.LSTM(units = units, return_sequences = True)
        self.rnn = tf.keras.layers.Bidirectional(layer = self.lstm, merge_mode = 'sum')

    def call(self, context):
        """Encode a batch of token ids.

        Args:
            context: Token ids to encode; the sample batch used in this
                notebook is a 2-D integer tensor (batch, seq_len).

        Returns:
            The per-position output of the bidirectional LSTM.
        """

        x = self.embedding(context)
        x = self.rnn(x)

        return x

In [8]:
# Hidden size shared by the embeddings, LSTMs and attention layers built below.
units = 256
# Vocabulary size reported by the English vectorizer. Later cells also pass it to
# the Decoder, which presumably assumes the Portuguese vocabulary has the same
# size -- TODO confirm against utils.
vocab_size = english_vectorizer.vocabulary_size()

In [9]:
# Smoke test: run the sample batch through a standalone, untrained Encoder.
encoder = Encoder(vocab_size, units)
encoder_out = encoder(trans_sample_cont)

print(f'Encoder output has shape: {encoder_out.shape}') # (batch_size , seq_len , hidden_units)



Encoder output has shape: (64, 13, 256)


**A couple of things to notice:**


*   The decoder needs to combine the output of the attention with the shifted-to-the-right translation (since this cross attention happens on the decoder side). An `Add` layer is used for this so that the original dimension is preserved, which would not be the case with something like a `Concatenate` layer.
*   Layer normalization is also applied, using a `LayerNormalization` layer, for better stability of the network.

In [10]:
class CrossAttention(tf.keras.layers.Layer):
    """Single-head cross attention followed by a residual add and layer norm.

    The target sequence supplies the queries and the context supplies the
    values; the attention output is added back to the target and normalized.
    """

    def __init__(self, units: int):
        """Create the sublayers.

        Args:
            units: ``key_dim`` of the attention head.
        """

        super().__init__()

        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads = 1,
            key_dim = units)

        self.layernorm = tf.keras.layers.LayerNormalization()

        self.add = tf.keras.layers.Add()

    def call(self, context, target):
        """Attend from ``target`` over ``context``.

        Args:
            context: Sequence to attend over (passed as ``value``).
            target: Sequence that produces the queries.

        Returns:
            ``layernorm(target + attention(target, context))``.
        """

        # No explicit `key` is passed, so the attention layer works from `value` alone.
        attn_output = self.mha(value = context, query = target)

        # Residual connection: Add keeps the target's dimensions unchanged.
        x = self.add([target, attn_output])

        x =  self.layernorm(x)

        return x

In [11]:
# Smoke test: embed the right-shifted sample translation with a throwaway
# Embedding layer and attend from it over the encoder output.
attention_layer = CrossAttention(units)
trans_sample_pre_embed = tf.keras.layers.Embedding(vocab_size, output_dim=units, mask_zero=True)(trans_sample_pre)

attention_result = attention_layer(encoder_out, trans_sample_pre_embed)



In [12]:
# Shape check. `attention_result` is the output of CrossAttention (after the
# residual add and layer normalization), which the last message labels "attention scores".
print(f'Tensor of contexts has shape: {encoder_out.shape}')
print(f'Tensor of translations has shape: {trans_sample_pre_embed.shape}')
print(f'Tensor of attention scores has shape: {attention_result.shape}')

Tensor of contexts has shape: (64, 13, 256)
Tensor of translations has shape: (64, 12, 256)
Tensor of attention scores has shape: (64, 12, 256)


In [13]:
class Decoder(tf.keras.layers.Layer):
    """Decoder: embedding -> LSTM -> cross attention -> LSTM -> log-softmax.

    The output layer uses ``tf.nn.log_softmax`` as its activation, so the
    values named ``logits`` here are log-probabilities over the vocabulary.
    """

    def __init__(self, vocab_size: int, units: int):
        """Create the sublayers.

        Args:
            vocab_size: Embedding input dimension and size of the output layer.
            units: Embedding dimension, LSTM hidden size and attention key size.
        """

        super().__init__()

        self.embedding = tf.keras.layers.Embedding(
            vocab_size, units, mask_zero = True
        )

        # return_state = True so the hidden/cell states can be handed back to the
        # caller and fed in again on the next decoding step.
        self.pre_attention_rnn = tf.keras.layers.LSTM(
            units,
            return_sequences = True,
            return_state = True
        )

        self.attention = CrossAttention(units)

        self.post_attention_rnn = tf.keras.layers.LSTM(
            units,
            return_sequences = True
        )

        self.output_layer = tf.keras.layers.Dense(
            vocab_size,
            activation = tf.nn.log_softmax
        )

    def call(self, context, target, state=None, return_state=False):
        """Compute next-token log-probabilities for every target position.

        Args:
            context: Encoded source sequence to attend over.
            target: Right-shifted target token ids.
            state: Optional initial state for the pre-attention LSTM.
            return_state: When true, also return the pre-attention LSTM state.

        Returns:
            ``logits``, or ``(logits, [hidden_states, cell_states])`` when
            ``return_state`` is true.
        """

        x = self.embedding(target)

        x, hidden_states, cell_states = self.pre_attention_rnn(x, initial_state = state)

        x = self.attention(context, x)

        # NOTE(review): only the pre-attention LSTM receives/returns state; this
        # LSTM is called without an initial state on every invocation.
        x = self.post_attention_rnn(x)

        logits = self.output_layer(x)

        if return_state:
            return  logits, [hidden_states, cell_states]

        return logits

In [14]:
# Smoke test: run a standalone, untrained Decoder on the encoder output and the
# right-shifted sample translation.
decoder = Decoder(vocab_size, units)
logits = decoder(context = encoder_out, target = trans_sample_pre)



In [15]:
# Shape check for the decoder inputs and output.
print(f'Tensor of contexts has shape: {encoder_out.shape}')
print(f'Tensor of right-shifted translations has shape: {trans_sample_pre.shape}')
print(f'Tensor of logits has shape: {logits.shape}')

Tensor of contexts has shape: (64, 13, 256)
Tensor of right-shifted translations has shape: (64, 12)
Tensor of logits has shape: (64, 12, 12000)


In [16]:
class Translator(tf.keras.Model):
    """End-to-end translation model: an Encoder feeding a Decoder."""

    def __init__(self, vocab_size: int, units: int):
        """Build the encoder and decoder with the same sizes.

        Args:
            vocab_size: Passed to both the Encoder and the Decoder.
            units: Passed to both the Encoder and the Decoder.
        """

        super().__init__()

        self.encoder = Encoder(vocab_size, units)

        self.decoder = Decoder(vocab_size, units)

    def call(self, inputs):
        """Run the full model.

        Args:
            inputs: A ``(context, target)`` pair: the sentence to translate and
                the right-shifted translation.

        Returns:
            The decoder output for every target position.
        """

        context, target = inputs

        encoded_context = self.encoder(context)

        logits = self.decoder(encoded_context, target)

        return logits

In [17]:
# Build the full model and run the sample batch through it once.
translator = Translator(vocab_size, units)

logits = translator((trans_sample_cont, trans_sample_pre))

print(f'Tensor of sentences to translate has shape: {trans_sample_cont.shape}')
print(f'Tensor of right-shifted translations has shape: {trans_sample_pre.shape}')
print(f'Tensor of logits has shape: {logits.shape}')



Tensor of sentences to translate has shape: (64, 13)
Tensor of right-shifted translations has shape: (64, 12)
Tensor of logits has shape: (64, 12, 12000)




# Training

In [18]:
def compile_and_train(model, train_data, validation_data, optimizer, loss_function, metrics: list,
                      epochs = 20, steps_per_epoch = 500, validation_steps = 50):
    """Compile ``model`` and fit it with early stopping.

    Args:
        model: Model to compile and train (modified in place).
        train_data: Training dataset; it is repeated so every epoch can draw
            ``steps_per_epoch`` batches.
        validation_data: Dataset evaluated after each epoch.
        optimizer: Optimizer handed to ``model.compile``.
        loss_function: Loss handed to ``model.compile``.
        metrics: Metrics handed to ``model.compile``.
        epochs: Maximum number of epochs.
        steps_per_epoch: Training batches per epoch.
        validation_steps: Validation batches per evaluation.

    Returns:
        ``(model, history)``: the same model object and the object returned by ``model.fit``.
    """
    model.compile(optimizer = optimizer, loss = loss_function, metrics = metrics)

    # Early stopping with a patience of three epochs (default monitored quantity).
    early_stopping = tf.keras.callbacks.EarlyStopping(patience = 3)

    fit_options = {
        "epochs": epochs,
        "steps_per_epoch": steps_per_epoch,
        "validation_data": validation_data,
        "validation_steps": validation_steps,
        "callbacks": [early_stopping],
    }
    training_history = model.fit(train_data.repeat(), **fit_options)

    return model, training_history

In [19]:
# Train the translator. compile_and_train returns the model it was given, so
# `trained_translator` and `translator` refer to the same object afterwards.
trained_translator, history = compile_and_train(
    model = translator,
    train_data = train_data,
    validation_data = val_data,
    optimizer = tf.keras.optimizers.Adam(),
    loss_function = masked_loss,
    metrics = [masked_acc, masked_loss],
    epochs = 10,
    steps_per_epoch = 500,
    validation_steps = 50
    )

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 68ms/step - loss: 5.7520 - masked_acc: 0.1774 - masked_loss: 5.7520 - val_loss: 4.3547 - val_masked_acc: 0.3285 - val_masked_loss: 4.3547
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 66ms/step - loss: 4.0982 - masked_acc: 0.3659 - masked_loss: 4.0982 - val_loss: 3.2017 - val_masked_acc: 0.4786 - val_masked_loss: 3.2017
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 66ms/step - loss: 3.0342 - masked_acc: 0.5026 - masked_loss: 3.0342 - val_loss: 2.4403 - val_masked_acc: 0.5820 - val_masked_loss: 2.4403
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 67ms/step - loss: 2.3615 - masked_acc: 0.5972 - masked_loss: 2.3615 - val_loss: 2.0118 - val_masked_acc: 0.6369 - val_masked_loss: 2.0118
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - loss: 1.9935 - masked_acc: 0.6489 - masked_los

In [20]:
# Despite its name, `save_dir` is the path of a single weights file.
save_dir = "model.weights.h5"
trained_translator.save_weights(save_dir)

In [21]:
# Reload the weights that were just saved into the translator model.
translator.load_weights(save_dir)

In [22]:
def generate_next_token(decoder, context, next_token, done, state, temperature=0.0):
    """Generate the next token of the sequence with one decoder step.

    Args:
        decoder: Callable invoked as ``decoder(context, next_token, state=..., return_state=True)``.
        context: Encoded source sentence.
        next_token: Previously generated token, fed as the decoder target.
        done: Current "finished" flag; returned unchanged unless EOS is produced.
        state: Decoder state carried over from the previous step.
        temperature: 0.0 selects the argmax token; any other value divides the
            decoder output by it and samples with ``tf.random.categorical``.

    Returns:
        ``(next_token, logit, state, done)``: the new token reshaped to (1, 1),
        the decoder value at that token, the new decoder state, and the flag.
    """

    logits, state = decoder(context, next_token, state=state, return_state=True)
    # Keep only the prediction for the last time step.
    logits = logits[:,-1,:]

    # If temp is 0 then next_token is the argmax of logits
    if temperature == 0.0:
        next_token = tf.argmax(logits, axis = -1)

    # Otherwise scale by the temperature and sample from the resulting distribution
    else:
        logits = logits / temperature
        next_token = tf.random.categorical(logits, num_samples = 1)

    # Squeezing without an axis drops every size-1 dimension; the indexing below
    # then needs a 1-D `logits` and a scalar `next_token`.
    # NOTE(review): this appears to assume a batch of exactly one sentence -- confirm.
    logits = tf.squeeze(logits)
    next_token = tf.squeeze(next_token)

    # get the logit of the selected next_token
    # NOTE(review): when temperature != 0 this is read from the temperature-scaled
    # values, not from the raw decoder output -- confirm that this is intended.
    logit = logits[next_token].numpy()

    next_token = tf.reshape(next_token, shape = (1,1))

    # If next_token is End-of-Sentence token you are done
    if next_token == 3: # EOS id
        done = True

    return next_token, logit, state, done

In [23]:
# Smoke test of generate_next_token. This uses the standalone `encoder` and
# `decoder` layers created in the earlier smoke-test cells (not the layers inside
# `translator`) together with a random initial state.
context = english_vectorizer(['i love languages']).to_tensor()
next_token = tf.convert_to_tensor([[2]]) # SOS id

context = encoder(context)
state = [tf.random.uniform((1, units)), tf.random.uniform((1, units))]

next_token, logit, state, done = generate_next_token(decoder, context, next_token, False, state)

In [24]:
# Show the result of the single decoding step above.
print(f"Next token: {next_token}\nLogit: {logit:.4f}\nDone? {done}")

Next token: [[9529]]
Logit: -9.2947
Done? False


# Using the model for inference

In [25]:
def translate(model, text, max_length = 50, temperature=0.0):
    """Translate a given sentence from English to Portuguese.

    Args:
        model: Object exposing ``encoder`` and ``decoder`` attributes, where the
            decoder has a ``pre_attention_rnn`` LSTM (e.g. a ``Translator``).
        text: English sentence to translate.
        max_length: Maximum number of tokens to generate; must be at least 1.
        temperature: Forwarded to ``generate_next_token`` (0.0 means greedy).

    Returns:
        ``(translation, logit, tokens)``: the decoded text, the value reported by
        ``generate_next_token`` for the last generated token only, and the list
        of generated token ids.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        # Without at least one step there is no last logit to return.
        raise ValueError("max_length must be at least 1")

    tokens, logits = [], []

    context = english_vectorizer(tf.constant([text])).to_tensor()
    context = model.encoder(context)

    next_token = tf.fill(dims = (1, 1), value = 2) # SOS id

    done = False

    # Size the zero initial state from the decoder's own LSTM rather than from a
    # notebook-level variable, so the function works for any model size.
    state_units = model.decoder.pre_attention_rnn.units
    state = [tf.zeros(shape = (1, state_units)), tf.zeros(shape = (1, state_units))]

    for _ in range(max_length):

        next_token, logit, state, done = generate_next_token(
            decoder = model.decoder,
            context = context,
            next_token = next_token,
            done = done,
            state = state,
            temperature = temperature
        )

        tokens.append(tf.squeeze(next_token).numpy())
        logits.append(logit)

        # Stop as soon as the end-of-sentence token has been produced.
        if done:
            break

    # Convert the translated tokens into text
    translation = tokens_to_text(tokens, id_to_word).numpy().decode()

    return translation, logits[-1], tokens

In [26]:
# Translate one sentence with sampling (temperature != 0), so repeated runs can differ.
temp = .5
original_sentence = "I love languages"

translation, logit, tokens = translate(trained_translator, original_sentence, temperature=temp)

print(f"Temperature: {temp}\n\nOriginal sentence: {original_sentence}\nTranslation: {translation}\nTranslation tokens:{tokens}\nLogit: {logit:.3f}")

Temperature: 0.5

Original sentence: I love languages
Translation: eu adoro linguas as linguas . [EOS]
Translation tokens:[9, 564, 1032, 38, 1032, 4, 3]
Logit: -0.015


# Minimum Bayes-Risk Decoding

In [27]:
def generate_samples(model, text, n_samples = 4, temperature = 0.6):
    """Draw ``n_samples`` candidate translations of ``text``.

    Each candidate comes from an independent call to ``translate`` with the
    given ``temperature``.

    Returns:
        ``(samples, log_probs)``: the token-id list of every candidate and, in
        the same order, the score value that ``translate`` returned for it.
    """
    runs = [translate(model, text, temperature = temperature) for _ in range(n_samples)]

    # translate returns (text, score, tokens); the decoded text is not needed here.
    samples = [candidate_tokens for _, _, candidate_tokens in runs]
    log_probs = [score for _, score, _ in runs]

    return samples, log_probs

In [28]:
# Draw candidate translations. The next cell repeats this call and rebinds both names.
samples, log_probs = generate_samples(trained_translator, 'I love languages')

In [29]:
# Draw a fresh set of candidates and print each one with its score.
samples, log_probs = generate_samples(trained_translator, 'I love languages')

for s, l in zip(samples, log_probs):
    print(f"Translated tensor: {s} has logit: {l:.3f}")

Translated tensor: [9, 564, 1032, 38, 1032, 4, 3] has logit: -0.012
Translated tensor: [9, 564, 1032, 11, 1032, 4, 3] has logit: -0.012
Translated tensor: [9, 564, 1032, 38, 1032, 4, 3] has logit: -0.012
Translated tensor: [9, 564, 1032, 18, 1032, 4, 3] has logit: -0.017


In [30]:
def jaccard_similarity(candidate, reference):
    """Jaccard similarity between a candidate and a reference translation.

    Both sequences are reduced to sets of tokens, so repeated tokens and token
    order are ignored.

    Args:
        candidate: Iterable of hashable tokens.
        reference: Iterable of hashable tokens.

    Returns:
        ``|candidate ∩ reference| / |candidate ∪ reference|`` as a float, or
        ``0.0`` when both inputs are empty (the ratio is undefined in that case).
    """
    candidate_set = set(candidate)
    reference_set = set(reference)

    all_tokens = candidate_set | reference_set

    # Two empty inputs have an empty union; avoid dividing by zero.
    if not all_tokens:
        return 0.0

    common_tokens = candidate_set & reference_set

    return len(common_tokens) / len(all_tokens)

In [31]:
# Worked example: 3 shared tokens out of 4 distinct tokens.
l1 = [1, 2, 3]
l2 = [1, 2, 3, 4]

js = jaccard_similarity(l1, l2)

print(f"jaccard similarity between lists: {l1} and {l2} is {js:.3f}")

jaccard similarity between lists: [1, 2, 3] and [1, 2, 3, 4] is 0.750


In [32]:
def rouge1_similarity(candidate, reference):
    """ROUGE-1 F1 score between a candidate and a reference translation.

    The unigram overlap counts every token at most as often as it appears in
    both sequences. Precision divides the overlap by the candidate length,
    recall by the reference length.

    Args:
        candidate: Sequence of hashable tokens.
        reference: Sequence of hashable tokens.

    Returns:
        The F1 score as a float; ``0.0`` when either sequence is empty or when
        the sequences share no token.
    """
    # An empty sequence would make precision or recall divide by zero.
    if len(candidate) == 0 or len(reference) == 0:
        return 0.0

    candidate_word_counts = Counter(candidate)
    reference_word_counts = Counter(reference)

    # Counter intersection keeps, per token, the minimum of the two counts.
    overlap = sum((candidate_word_counts & reference_word_counts).values())

    precision = overlap / len(candidate)
    recall = overlap / len(reference)

    # No shared token: precision and recall are both zero.
    if precision + recall == 0:
        return 0.0

    # ROUGE-1 F1
    return 2 * (precision * recall) / (precision + recall)

In [33]:
# Worked example: overlap 3, precision 3/3, recall 3/4.
l1 = [1, 2, 3]
l2 = [1, 2, 3, 4]

r1s = rouge1_similarity(l1, l2)

print(f"rouge 1 similarity between lists: {l1} and {l2} is {r1s:.3f}")

rouge 1 similarity between lists: [1, 2, 3] and [1, 2, 3, 4] is 0.857


In [34]:
def average_overlap(samples, similarity_fn):
    """Score every sample by its mean similarity to all the other samples.

    Args:
        samples: Sequence of candidate translations.
        similarity_fn: Callable ``similarity_fn(candidate, reference)`` returning a number.

    Returns:
        Dict mapping each sample index to its mean similarity with the other
        samples, rounded to 3 decimals. With a single sample there is nothing
        to compare against and its score is ``0.0``; with no samples the dict
        is empty.
    """
    n_samples = len(samples)
    scores = {}

    for candidate_idx in range(n_samples):

        # A lone sample has no peers; avoid dividing by (n_samples - 1) == 0.
        if n_samples < 2:
            scores[candidate_idx] = 0.0
            continue

        similarity = 0

        for sample_idx in range(n_samples):

            # A candidate is never compared with itself.
            if sample_idx != candidate_idx:
                similarity += similarity_fn(samples[candidate_idx], samples[sample_idx])

        scores[candidate_idx] = round(similarity / (n_samples - 1), 3)

    return scores

In [35]:
# Worked example with three toy token lists.
l1 = [1, 2, 3]
l2 = [1, 2, 4]
l3 = [1, 2, 4, 5]

avg_ovlp = average_overlap([l1, l2, l3], jaccard_similarity)

print(f"average overlap between lists: {l1}, {l2} and {l3} using Jaccard similarity is:\n\n{avg_ovlp}")

average overlap between lists: [1, 2, 3], [1, 2, 4] and [1, 2, 4, 5] using Jaccard similarity is:

{0: 0.45, 1: 0.625, 2: 0.575}


In [36]:
def weighted_avg_overlap(samples, log_probs, similarity_fn):
    """Score every sample by its weighted mean similarity to the other samples.

    Each comparison against sample ``j`` is weighted by ``exp(log_probs[j])``,
    and the total is divided by the sum of those weights (instead of by the
    number of comparisons, as an arithmetic mean would).

    Args:
        samples: Sequence of candidate translations.
        log_probs: One log-scale score per sample, in the same order.
        similarity_fn: Callable ``similarity_fn(candidate, reference)`` returning a number.

    Returns:
        Dict mapping each sample index to its weighted mean similarity, rounded
        to 3 decimals. The score is ``0.0`` when the weights of the other
        samples sum to zero (a single sample, or all weights underflowing).
    """
    n_samples = len(samples)
    scores = {}

    for candidate_idx in range(n_samples):

        similarity = 0
        weights_sum = 0

        for sample_idx in range(n_samples):

            # A candidate is never compared with itself.
            if sample_idx != candidate_idx:

                # Convert log probability to linear scale
                sample_p = float(np.exp(log_probs[sample_idx]))

                similarity += sample_p * similarity_fn(samples[candidate_idx], samples[sample_idx])

                weights_sum += sample_p

        # Without any positive weight the weighted mean is undefined; report 0.0
        # instead of dividing by zero.
        if weights_sum == 0:
            scores[candidate_idx] = 0.0
        else:
            scores[candidate_idx] = round(similarity / weights_sum, 3)

    return scores

In [37]:
# Worked example with toy values. Note that this rebinds `log_probs`, which an
# earlier cell filled with the scores returned by generate_samples.
l1 = [1, 2, 3]
l2 = [1, 2, 4]
l3 = [1, 2, 4, 5]
log_probs = [0.4, 0.2, 0.4]

w_avg_ovlp = weighted_avg_overlap([l1, l2, l3], log_probs, jaccard_similarity)

print(f"weighted average overlap using Jaccard similarity is:\n\n{w_avg_ovlp}")

weighted average overlap using Jaccard similarity is:

{0: 0.445, 1: 0.625, 2: 0.558}


In [38]:
def mbr_decode(model, text, n_samples = 5, temperature = 0.6, similarity_fn = jaccard_similarity):
    """Translate ``text`` with Minimum Bayes-Risk decoding.

    Samples ``n_samples`` candidate translations, scores each one by its
    weighted average overlap with the others, and picks the highest-scoring one.

    Returns:
        ``(best_translation, translations)``: the decoded text of the selected
        candidate and the decoded text of every candidate, in sampling order.
    """
    candidates, candidate_log_probs = generate_samples(model, text, n_samples, temperature)

    # Weighted overlap score per candidate index
    overlap_scores = weighted_avg_overlap(candidates, candidate_log_probs, similarity_fn)

    # Index of the best-scoring candidate (the first one wins on ties)
    best_index = max(overlap_scores, key = overlap_scores.get)

    # Decode every candidate from token ids back to text
    translations = []
    for candidate in candidates:
        decoded = tokens_to_text(candidate, id_to_word).numpy().decode('utf-8')
        translations.append(decoded)

    return translations[best_index], translations

In [39]:
# MBR-decode one sentence from 10 sampled candidates. `translator` is the same
# object that compile_and_train returned as `trained_translator`.
english_sentence = "do you like playing football"

translation, candidates = mbr_decode(translator, english_sentence, n_samples=10, temperature=0.6)

In [40]:
# Display the translation selected by MBR decoding.
translation

'voce gosta de jogar futebol . [EOS]'

In [41]:
# List every sampled candidate, followed by the one MBR decoding selected.
print("Translation candidates:")

for c in candidates:
    print(c)

print(f"\nSelected translation: {translation}")

Translation candidates:
voce gosta de jogar futebol . [EOS]
voce gosta de jogar futebol . [EOS]
voces gostam de jogar futebol de futebol . [EOS]
voce gosta de jogar futebol . [EOS]
voce gosta de jogar futebol . [EOS]
voce gosta de jogar futebol . [EOS]
voce gosta de jogar futebol . [EOS]
voce gosta de jogar futebol . [EOS]
voces gostam de jogar futebol . [EOS]
voce gosta de jogar futebol ! [EOS]

Selected translation: voce gosta de jogar futebol . [EOS]
