<a href="https://colab.research.google.com/github/FranciscoBPereira/DeepLearning-SeAMK/blob/main/SeAMK2223_Ex2_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setup, Version check and Common imports

# Python ≥3.10 is required
import sys
assert sys.version_info >= (3, 10)


# TensorFlow ≥2.10 is required
import tensorflow as tf
# Compare numeric (major, minor) components: comparing the raw version strings
# is lexicographic, so e.g. "2.9.1" >= "2.10" would wrongly evaluate to True.
tf_major_minor = tuple(int(part) for part in tf.__version__.split(".")[:2])
assert tf_major_minor >= (2, 10)

# Common imports
import numpy as np
import os

from tensorflow import keras
from tensorflow.keras import layers

# to make this notebook's output stable across runs
# A single call seeds Python's `random` module, NumPy and TensorFlow.
# Seeding NumPy alone leaves `random.shuffle` (used for the dataset split)
# and the TensorFlow weight initialisation different on every run.
keras.utils.set_random_seed(42)

import matplotlib.pyplot as plt

# Default font sizes for every figure drawn by this notebook
plt.rc('font', size=14)
plt.rc('axes', labelsize=14, titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=10)
plt.rc('ytick', labelsize=10)

In [None]:
# Upload file "pt.txt" to the working directory

# Open file and add "start" and "end" to the target strings
# Each non-empty line must hold "<english>\t<portuguese>"

text_file = 'pt.txt'
# Explicit UTF-8 so accented Portuguese characters are decoded the same way on
# every platform, instead of depending on the locale's default encoding.
with open(text_file, encoding="utf-8") as f:
    # Drop empty lines rather than blindly discarding the last element, so the
    # final pair is kept even when the file does not end with a newline.
    lines = [line for line in f.read().split("\n") if line]
or_text_pairs = []
for line in lines:
    english, port = line.split("\t")
    # The markers tell the decoder where a target sentence begins and ends
    port = "[start] " + port + " [end]"
    or_text_pairs.append((english, port))

In [None]:
# Visualize a random sample of 10 examples

import random
# Each draw is independent, so the same pair may be printed more than once
for position in range(1, 11):
    print(position, ": ", random.choice(or_text_pairs))

In [None]:
# To speed-up training, we can consider just a fraction of the original dataset
# Performance will degrade with the lack of data

# Fraction of the original dataset that is kept (change it here only)
fraction = 0.15

# Shuffle in place so that the kept slice is a random sample of the file
random.shuffle(or_text_pairs)

# Named `num_sliced` rather than `slice` to avoid shadowing the builtin
num_sliced = int(len(or_text_pairs) * fraction)

text_pairs = or_text_pairs[:num_sliced]

# 80% for training and 20% for validation
num_val_samples = int(0.20 * len(text_pairs))
num_train_samples = len(text_pairs) - num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:]

print('Original: ', len(or_text_pairs))

print('Sliced: ', len(text_pairs))

print('Training: ', len(train_pairs))

print('Validation: ', len(val_pairs))


In [None]:
# Preprocessing, Vectorization and Pipelining (you can skip these details)

import string
import re

# Every ASCII punctuation character except the square brackets, which must
# survive standardization because they delimit the "[start]"/"[end]" markers.
strip_chars = "".join(ch for ch in string.punctuation if ch not in "[]")


def custom_standardization(input_string):
    """Lowercase the input and delete every character listed in `strip_chars`.

    Used as the `standardize` callable of the Portuguese TextVectorization
    layer; the square brackets are not in `strip_chars`, so the "[start]" and
    "[end]" markers are left intact.
    """
    # Character class matching any single character to strip
    strip_pattern = f"[{re.escape(strip_chars)}]"
    lowered = tf.strings.lower(input_string)
    return tf.strings.regex_replace(lowered, strip_pattern, "")

# Vectorization settings
sequence_length = 20  # tokens kept per sentence
vocab_size = 5000     # maximum vocabulary size for each language

# English Vectorization (relies on the layer's default standardization)
source_vectorization = layers.TextVectorization(
    output_sequence_length=sequence_length,
    output_mode="int",
    max_tokens=vocab_size,
)

# Portuguese Vectorization
# One extra position is produced because format_dataset derives two sequences,
# shifted by one token, from the same vectorized sentence.
target_vectorization = layers.TextVectorization(
    standardize=custom_standardization,
    output_sequence_length=sequence_length + 1,
    output_mode="int",
    max_tokens=vocab_size,
)

# Get vocabulary (learned from the training sentences only)
train_english_texts = [english_text for english_text, _ in train_pairs]
train_pt_texts = [portuguese_text for _, portuguese_text in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_pt_texts)

# Create Dataset object for training (https://www.tensorflow.org/api_docs/python/tf/data)

# Each dataset element is a tuple (inputs, target)
# Inputs: Dictionary with 2 keys: "english" and "portuguese",
# holding the vectorized English sentence and the vectorized Portuguese
# sentence without its last token
# Target: The vectorized Portuguese sentence without its first token, i.e. the
# next token to predict at every position of the "portuguese" input

# Number of sentence pairs per batch
batch_size = 64


def format_dataset(eng, pt):
    """Vectorize a batch of sentence pairs into (inputs, target).

    `inputs` is a dict with the vectorized English batch under "english" and
    the vectorized Portuguese batch minus its last position under
    "portuguese"; `target` is the vectorized Portuguese batch minus its first
    position, so it is the "portuguese" input advanced by one token.
    """
    source_ids = source_vectorization(eng)
    target_ids = target_vectorization(pt)
    decoder_inputs = target_ids[:, :-1]
    next_tokens = target_ids[:, 1:]
    features = {
        "english": source_ids,
        "portuguese": decoder_inputs,
    }
    return features, next_tokens

def make_dataset(pairs):
    """Build a batched, vectorized tf.data pipeline from sentence pairs.

    `pairs` is a non-empty sequence of (english, portuguese) string tuples.
    Returns a dataset whose elements are the (inputs, target) tuples produced
    by `format_dataset` for batches of `batch_size` pairs.
    """
    eng_texts, pt_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    pt_texts = list(pt_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, pt_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls=4)
    # Cache the vectorized batches first, then shuffle, then prefetch.
    # Caching after the shuffle would store one fixed order and replay it on
    # every epoch; prefetch goes last so it overlaps with the training step.
    # Note that whole batches are shuffled, because batching happens above.
    return dataset.cache().shuffle(2048).prefetch(16)

# Same pipeline for both splits; the training one is built first
train_ds, val_ds = make_dataset(train_pairs), make_dataset(val_pairs)

In [None]:
# The Seq2Seq Model

embed_dim = 256    # size of the word embeddings
latent_dim = 1024  # number of units in the encoder and decoder GRUs

# Encoder: embedding followed by a bidirectional GRU.
# merge_mode="sum" adds the two directions together, so the encoder output
# keeps latent_dim units and can be used as the decoder's initial state.
source = keras.Input(shape=(None,), dtype="int64", name="english")
source_embedded = layers.Embedding(
    vocab_size, embed_dim, mask_zero=True)(source)
encoder_gru = layers.Bidirectional(layers.GRU(latent_dim), merge_mode="sum")
encoded_source = encoder_gru(source_embedded)

# Decoder: embedding followed by a unidirectional GRU that starts from the
# encoder output, dropout, and a softmax layer over the vocabulary applied at
# every time step to predict the next word.
past_target = keras.Input(shape=(None,), dtype="int64", name="portuguese")
target_embedded = layers.Embedding(
    vocab_size, embed_dim, mask_zero=True)(past_target)
decoder_gru = layers.GRU(latent_dim, return_sequences=True)
decoder_states = decoder_gru(target_embedded, initial_state=encoded_source)
regularized_states = layers.Dropout(0.5)(decoder_states)
next_word_classifier = layers.TimeDistributed(
    layers.Dense(vocab_size, activation="softmax"))
target_next_step = next_word_classifier(regularized_states)

# Complete Model: (english, portuguese so far) -> next-word probabilities
seq2seq_rnn = keras.Model(
    inputs=[source, past_target], outputs=target_next_step)

In [None]:
# Print the model's layers with their output shapes and parameter counts
seq2seq_rnn.summary()

In [None]:
# Compile and Train

# The targets are integer token ids, hence the sparse cross-entropy loss
seq2seq_rnn.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="rmsprop",
    metrics=["accuracy"],
)

# `history.history` keeps the per-epoch metrics that are plotted afterwards
history = seq2seq_rnn.fit(
    x=train_ds,
    validation_data=val_ds,
    epochs=15,
)



In [None]:
# Plot the evolution of the accuracy metrics

import pandas as pd

# Keep only the training and validation accuracy curves (one row per epoch)
accuracy_columns = ['accuracy', 'val_accuracy']
accuracy_history = pd.DataFrame(history.history, columns=accuracy_columns)
accuracy_history.plot(figsize=(8, 5))
plt.grid(True)
plt.show()

In [None]:
# Try translate a few English sentences to Portuguese

# Map each token index back to its Portuguese vocabulary entry
pt_vocab = target_vectorization.get_vocabulary()
pt_index_lookup = dict(enumerate(pt_vocab))
# Upper bound on the number of tokens generated for one translation
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    """Greedily translate one English sentence into Portuguese.

    Starting from "[start]", the model is queried once per generated token:
    the most probable next token is appended to the partial translation until
    "[end]" is produced or `max_decoded_sentence_length` tokens were added.

    Returns the decoded string, including the "[start]" marker and, when it
    was generated, the "[end]" marker.
    """
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        # Re-vectorize the partial translation produced so far
        tokenized_target_sentence = target_vectorization([decoded_sentence])
        # verbose=0: otherwise a progress bar is printed for every token
        next_token_predictions = seq2seq_rnn.predict(
            [tokenized_input_sentence, tokenized_target_sentence], verbose=0)
        # Position i holds the prediction for the token that follows the
        # i + 1 tokens decoded so far
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        sampled_token = pt_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

# Read five sentences from the user and print each one with its translation
for _ in range(5):
    input_sentence = input()
    print("-", input_sentence, sep="\n")
    translation = decode_sequence(input_sentence)
    print(translation, '\n')