<a href="https://colab.research.google.com/github/FranciscoBPereira/AnaliseDados_2425_MEI_ISEC/blob/main/AD2425_P9B_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setup, version check and common imports

# Python ≥3.8 is required (the check now matches the stated requirement)
import sys
assert sys.version_info >= (3, 8), "This notebook requires Python 3.8 or newer"


# TensorFlow ≥2.0 is required
import tensorflow as tf
# Compare numerically: a plain string comparison would rank "10.0" below "2.0"
assert tuple(int(part) for part in tf.__version__.split(".")[:2]) >= (2, 0), \
    "This notebook requires TensorFlow 2.0 or newer"

# Common imports
import numpy as np
import os
import random

from tensorflow import keras
from tensorflow.keras import layers

# to make this notebook's output stable across runs
# (seed every random number generator the notebook relies on, not only NumPy)
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

import matplotlib.pyplot as plt

plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

print('Python version: ', sys.version_info)
print('TF version: ', tf.__version__)
print('Keras version: ', keras.__version__)
print('GPU is', 'available' if tf.config.list_physical_devices('GPU') else 'NOT AVAILABLE')

**1. Dataset Loading, Preprocessing and Vectorization**

**1.1. Loading, Slicing and Dividing**

In [None]:
# Upload file "por.txt" to the working directory
# If you place the file in another directory, adjust the path in the first line of code
# Every non-empty line holds tab-separated fields: English sentence, Portuguese sentence
# (any further field, such as an attribution column, is ignored)
# The markers "[start]" and "[end]" are added around each target (Portuguese) string

text_file = 'por.txt'
# Explicit encoding: the Portuguese text has accented characters, and the platform
# default encoding is not UTF-8 everywhere (assumes the file is UTF-8 - TODO confirm)
with open(text_file, encoding="utf-8") as f:
    # Iterating the file keeps the last sentence even without a trailing newline
    lines = [raw_line.rstrip("\n") for raw_line in f]
or_text_pairs = []
for line in lines:
    if not line.strip():
        # Skip blank lines instead of failing on the unpacking below
        continue
    english, port = line.split("\t")[:2]
    port = "[start] " + port + " [end]"
    or_text_pairs.append((english, port))

In [None]:
# Print 10 randomly chosen (English, Portuguese) pairs from the full dataset


import random
for sample_number in range(1, 11):
  sampled_pair = random.choice(or_text_pairs)
  print(sample_number, ": ", sampled_pair)

In [None]:
# Keep just 15% of the original dataset (for efficiency reasons). This will hurt performance
# Split the remaining dataset in training (80%) and validation (20%)

import random

# Shuffle a copy with a dedicated, seeded generator:
#  - or_text_pairs keeps its original order, so re-running this cell is idempotent
#  - the same sentences land in the training/validation sets on every run
shuffled_pairs = list(or_text_pairs)
random.Random(42).shuffle(shuffled_pairs)

# Named so that it does not shadow the builtin `slice`
num_sliced_samples = int(len(shuffled_pairs) * 0.15)

text_pairs = shuffled_pairs[:num_sliced_samples]

num_val_samples = int(0.20 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples

# The first 80% of the kept pairs are for training, the remaining 20% for validation
train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:]


In [None]:
# Report how many sentence pairs each subset contains

print('Original: ', len(or_text_pairs), '\n')

for subset_label, subset_pairs in (
        ('Sliced: ', text_pairs),
        ('Training: ', train_pairs),
        ('Validation: ', val_pairs)):
    print(subset_label, len(subset_pairs))


**1.2. Preprocessing and Vectorization**

In [None]:
# Text vectorization
# Custom standardization for the Portuguese text: lowercase everything and strip
# punctuation, but keep the square brackets so that "[start]" and "[end]" survive

import string
import re

# Every punctuation character except "[" and "]"
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")


def custom_standardization(input_string):
    """Lowercase `input_string` and delete every character listed in `strip_chars`."""
    punctuation_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")

# Vectorization parameters: tokens kept per sentence and vocabulary size per language
sequence_length = 20
vocab_size = 5000

# English (source) sentences: integer token ids, fixed length of sequence_length
source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size, output_mode="int", output_sequence_length=sequence_length)

# Portuguese (target) sentences: one extra position, because the pipeline later
# slices the sequence into decoder inputs ([:-1]) and next-token targets ([1:]);
# the custom standardization preserves the "[start]" / "[end]" markers
target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode="int",
    output_sequence_length=sequence_length + 1,
    standardize=custom_standardization)

# Learn each language's vocabulary from the training sentences only
train_english_texts = [english_sentence for english_sentence, _ in train_pairs]
train_pt_texts = [pt_sentence for _, pt_sentence in train_pairs]

for vectorizer, training_texts in (
        (source_vectorization, train_english_texts),
        (target_vectorization, train_pt_texts)):
    vectorizer.adapt(training_texts)

In [None]:
# English vocabulary: print its size and show only the first 30 entries
# (the full list can hold up to vocab_size tokens, too many to display)

english_vocab = source_vectorization.get_vocabulary()
print('English vocabulary size: ', len(english_vocab))
english_vocab[:30]

In [None]:
# Portuguese vocabulary: print its size and show only the first 30 entries
# (the full list can hold up to vocab_size tokens, too many to display)

portuguese_vocab = target_vectorization.get_vocabulary()
print('Portuguese vocabulary size: ', len(portuguese_vocab))
portuguese_vocab[:30]

**1.3. Pipelining**

In [None]:
# Create Dataset objects for training
# https://www.tensorflow.org/api_docs/python/tf/data

# Each dataset element is a tuple (inputs, target)
# inputs: dictionary with 2 keys, "english" and "portuguese", holding the vectorized
#         English sentence and the Portuguese sentence without its last position
# target: the same Portuguese sentence without its first position, i.e. the token
#         that follows each position of the "portuguese" input

batch_size = 64


def format_dataset(eng, pt):
    """Vectorize a batch of sentence pairs and build the (inputs, target) tuple."""
    eng_tokens = source_vectorization(eng)
    pt_tokens = target_vectorization(pt)
    model_inputs = {
        "english": eng_tokens,
        "portuguese": pt_tokens[:, :-1],
    }
    next_tokens = pt_tokens[:, 1:]
    return (model_inputs, next_tokens)

def make_dataset(pairs, shuffle=True):
    """Build a batched, vectorized tf.data pipeline from (English, Portuguese) pairs.

    Args:
        pairs: non-empty sequence of (english_sentence, portuguese_sentence) tuples.
        shuffle: when True (default), the batches are reshuffled on every pass
            over the dataset; pass False for evaluation data.

    Returns:
        A tf.data.Dataset yielding the (inputs, target) tuples produced by
        format_dataset, in batches of batch_size sentences.
    """
    eng_texts, pt_texts = zip(*pairs)
    dataset = tf.data.Dataset.from_tensor_slices((list(eng_texts), list(pt_texts)))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache BEFORE shuffling: caching after shuffle() stores one fixed order and
    # replays it every epoch, which defeats the purpose of shuffling
    dataset = dataset.cache()
    if shuffle:
        # Applied after batch(), so this permutes whole batches (up to 2048 buffered)
        dataset = dataset.shuffle(2048)
    return dataset.prefetch(16)

train_ds = make_dataset(train_pairs)
# Validation metrics do not depend on the order of the batches
val_ds = make_dataset(val_pairs, shuffle=False)

In [None]:
# Inspect one training batch: tensor shapes and the vectorized English sentences

for inputs, targets in train_ds.take(1):
    for input_name in ("english", "portuguese"):
        print(f"inputs['{input_name}'].shape: {inputs[input_name].shape}")
    print(f"targets.shape: {targets.shape}")
    print(inputs['english'])

Quiz:

Identify all the preprocessing operations that are applied to the text, from the moment it is loaded from the file until it is ready to enter the neural network.
For each operation, explain its goal in one simple sentence.





**2. The Seq2Seq Model**

In [None]:
# GRU-based encoder
# A single bidirectional GRU layer reads the embedded English sentence;
# the outputs of both directions are summed and handed to the decoder

embed_dim = 256
latent_dim = 1024

source = keras.Input(shape=(None,), dtype="int64", name="english")
# mask_zero=True: token id 0 is treated as padding and masked out
source_embedding = layers.Embedding(vocab_size, embed_dim, mask_zero=True)
encoder_gru = layers.GRU(latent_dim)
encoder_bigru = layers.Bidirectional(encoder_gru, merge_mode="sum")
encoded_source = encoder_bigru(source_embedding(source))

In [None]:
# GRU-based decoder
# A single unidirectional GRU layer, initialised with the encoder output
# A final Dense layer of vocabulary size with softmax activation predicts the next word

past_target = keras.Input(shape=(None,), dtype="int64", name="portuguese")
target_embedding = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
decoder_states = decoder_gru(target_embedding, initial_state=encoded_source)
regularized_states = layers.Dropout(0.5)(decoder_states)
target_next_step = layers.Dense(vocab_size, activation="softmax")(regularized_states)

# Full encoder-decoder model: (English, past Portuguese tokens) -> next-token distribution
seq2seq_rnn = keras.Model(inputs=[source, past_target], outputs=target_next_step)


In [None]:
# Print the summary of the full encoder-decoder model
seq2seq_rnn.summary()

Quiz:

1. In the encoder the GRU layer is bidirectional, whereas in the decoder it is a standard unidirectional recurrent layer. Explain the advantages/constraints that justify this difference.

2. Specify:

  a) The dimensions of the embeddings in the encoder and in the decoder

  b) The number of GRU cells in the encoder and in the decoder

  c) The dimensions of the tensor that contains the information passed from the encoder to the decoder


In [None]:
# Compile and train
# Targets are integer token ids, hence the sparse categorical cross-entropy

seq2seq_rnn.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
# Keep the returned History object: it holds the per-epoch loss/accuracy values
# (history.history), and assigning it avoids echoing the object as the cell output
history = seq2seq_rnn.fit(train_ds, epochs=15, validation_data=val_ds)

**3. Translating new Sentences**

In [None]:
# Translate sentences from English to Portuguese, one token at a time

pt_vocab = target_vectorization.get_vocabulary()
# Maps a token id back to the corresponding Portuguese token
pt_index_lookup = dict(enumerate(pt_vocab))
# Generate at most as many tokens as the decoder input holds
max_decoded_sentence_length = sequence_length

def decode_sequence(input_sentence):
    """Translate one English sentence with greedy decoding.

    Starting from "[start]", the model is queried repeatedly; at each step the most
    probable next token is appended to the partial translation, until "[end]" is
    produced or max_decoded_sentence_length tokens have been generated.

    Args:
        input_sentence: English sentence as a Python string.

    Returns:
        The decoded Portuguese sentence as a string; it begins with "[start]" and
        ends with "[end]" when the model emitted it within the length limit.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Drop the last position so that the decoder input has the same length
        # as during training (format_dataset feeds pt[:, :-1])
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        # verbose=0: otherwise a progress bar is printed for every generated token
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
        # Position i holds the prediction for the token that follows the i-th input token
        sampled_token_index = int(np.argmax(next_token_predictions[0, i, :]))
        sampled_token = pt_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

# Read 5 English sentences typed by the user and print each one with its translation
num_sentences_to_translate = 5
for _ in range(num_sentences_to_translate):
    input_sentence = input()
    print("-", input_sentence, sep="\n")
    translation = decode_sequence(input_sentence)
    print(translation, '\n')