# Autores: 
**Gabriel Roberto (221020870) e Jean Soares (241033810)**

In [150]:
from d2l import tensorflow as d2l
import requests
import tarfile
import re
import collections
import pandas as pd
from nltk.translate import IBMModel1
from nltk.translate.bleu_score import sentence_bleu


In [151]:
def baixar_arquivo(url, endereco):
    resposta = requests.get(url)
    if resposta.status_code == requests.codes.OK:
        with open(endereco, 'wb') as novo_arquivo:
                novo_arquivo.write(resposta.content)
        print("Download finalizado. Arquivo salvo em: {}".format(endereco))
    else:
        resposta.raise_for_status()

In [152]:
URL_arquivo = 'https://www.statmt.org/europarl/v7/pt-en.tgz'

baixar_arquivo(URL_arquivo, './raw_dataset.tgz')

Download finalizado. Arquivo salvo em: ./raw_dataset.tgz


In [153]:
# Caminho para o arquivo .tgz
caminho_arquivo = './raw_dataset.tgz'

# Abrir o arquivo .tgz
with tarfile.open(caminho_arquivo, 'r:gz') as arquivo_tgz:
    # Extrair todos os arquivos do .tgz
    arquivo_tgz.extractall(path='./')

    # Listar os arquivos extraídos
    for membro in arquivo_tgz.getmembers():
        print(membro.name)

europarl-v7.pt-en.en
europarl-v7.pt-en.pt


# Leitura do Dataset

In [154]:
class ParallelCorpus(d2l.DataModule):
    """The Parallel Corpus Portuguese-English dataset."""
    def _download(self, fname):
        with open(fname, encoding="utf-8") as f:
            return f.read()

In [155]:
data = ParallelCorpus()
raw_text_en = data._download('./europarl-v7.pt-en.en')
raw_text_en[:60]

'Resumption of the session\nI declare resumed the session of t'

In [156]:
raw_text_pt = data._download('./europarl-v7.pt-en.pt')
raw_text_pt[:60]

'Reinício da sessão\nDeclaro reaberta a sessão do Parlamento E'

In [157]:
@d2l.add_to_class(ParallelCorpus)
def _preprocess(self, text):
    return re.sub(r'[^\w\s]', '', text).lower()

In [None]:
# Removendo dez mil senteças para facilitar o processamento
text_en = data._preprocess(" ".join(raw_text_en.split()[:10000]))

text_en[:60]

'resumption of the session i declare resumed the session of t'

In [None]:
# Removendo dez mil senteça para facilitar o processamento
text_pt = data._preprocess(' '.join(raw_text_pt.split()[:10000]))

text_pt[:60]

'reinício da sessão declaro reaberta a sessão do parlamento e'

# Tokenização

In [160]:
source_sentences = text_en.split()
target_sentences = text_pt.split()

print(len(source_sentences), len(target_sentences))

9956 9954


In [161]:
def preprocess2(text):
  return re.sub(r'[^a-z]+', ' ', text).split()

def get_diff_words(text):
  set_words = set()
  text_list = preprocess2(text)

  for word in text_list:
    set_words.add(word)

  return set_words

In [162]:
text_en_diff = get_diff_words(text_en)
text_en_diff

{'contribute',
 'greater',
 'lament',
 'further',
 'tabled',
 'population',
 'details',
 'high',
 'the',
 'value',
 'councils',
 'millennium',
 'nations',
 'home',
 'use',
 'wording',
 'acceptable',
 'finding',
 'sectors',
 'limit',
 'introduced',
 'request',
 'issues',
 'courts',
 'most',
 'item',
 'provides',
 'looking',
 'colleges',
 'applause',
 'happy',
 'quaestors',
 'zimeray',
 'document',
 'paragraph',
 'seeking',
 'agenda',
 'hell',
 'transitional',
 'attend',
 'say',
 'spent',
 'shown',
 'amount',
 'card',
 'perceive',
 'considerably',
 'amendment',
 'eyes',
 'demographic',
 'regional',
 'execution',
 'period',
 'be',
 'portuguese',
 'intended',
 'retained',
 'outcome',
 'whom',
 'conferred',
 'fields',
 'whole',
 'abc',
 'responsibility',
 'ec',
 'seem',
 'press',
 'problem',
 'regulatory',
 'believes',
 'partially',
 'sealing',
 'indicated',
 'explanations',
 'increased',
 'financially',
 'begin',
 'unnecessary',
 'propose',
 'somewhat',
 'leave',
 'resulting',
 'fundamenta

In [163]:
@d2l.add_to_class(ParallelCorpus)
def _tokenize(self, text):
    return list(text)

tokens = data._tokenize(text_en)
','.join(tokens[:30])

'r,e,s,u,m,p,t,i,o,n, ,o,f, ,t,h,e, ,s,e,s,s,i,o,n, ,i, ,d,e'

# Vocabulário

In [164]:
class Vocab:
    """Vocabulary for text."""
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        # Flatten a 2D list if needed
        if tokens and isinstance(tokens[0], list):
            tokens = [token for line in tokens for token in line]
        # Count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1],
                                  reverse=True)
        # The list of unique tokens
        self.idx_to_token = list(sorted(set(['<unk>'] + reserved_tokens + [
            token for token, freq in self.token_freqs if freq >= min_freq])))
        self.token_to_idx = {token: idx
                             for idx, token in enumerate(self.idx_to_token)}

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        if hasattr(indices, '__len__') and len(indices) > 1:
            return [self.idx_to_token[int(index)] for index in indices]
        return self.idx_to_token[indices]

    @property
    def unk(self):  # Index for the unknown token
        return self.token_to_idx['<unk>']

In [165]:
vocab = Vocab(tokens)
indices = vocab[tokens[:20]]
print('indices:', indices)
print('words:', vocab.to_tokens(indices))

indices: [29, 16, 30, 32, 24, 27, 31, 20, 26, 25, 0, 26, 17, 0, 31, 19, 16, 0, 30, 16]
words: ['r', 'e', 's', 'u', 'm', 'p', 't', 'i', 'o', 'n', ' ', 'o', 'f', ' ', 't', 'h', 'e', ' ', 's', 'e']


# Definindo o Modelo

In [166]:
@d2l.add_to_class(ParallelCorpus)
def build(self, raw_text, vocab=None):
    tokens = self._tokenize(self._preprocess(raw_text))
    if vocab is None: vocab = Vocab(tokens)
    corpus = [vocab[token] for token in tokens]
    return corpus, vocab

corpus, vocab = data.build(raw_text_en)
len(corpus), len(vocab)

(289559458, 180)

In [167]:
words = text_en.split()
vocab = Vocab(words)
vocab.token_freqs[:10]

[('the', 734),
 ('of', 360),
 ('to', 343),
 ('and', 229),
 ('in', 228),
 ('that', 185),
 ('is', 177),
 ('a', 175),
 ('this', 160),
 ('i', 144)]

In [168]:
bigram_tokens = ['--'.join(pair) for pair in zip(words[:-1], words[1:])]
bigram_vocab = Vocab(bigram_tokens)
bigram_vocab.token_freqs[:10]

[('of--the', 112),
 ('the--commission', 61),
 ('in--the', 57),
 ('on--the', 53),
 ('i--would', 47),
 ('like--to', 44),
 ('for--the', 43),
 ('to--the', 39),
 ('that--the', 38),
 ('it--is', 31)]

In [169]:
trigram_tokens = ['--'.join(triple) for triple in zip(
    words[:-2], words[1:-1], words[2:])]
trigram_vocab = Vocab(trigram_tokens)
trigram_vocab.token_freqs[:10]

[('i--would--like', 23),
 ('would--like--to', 21),
 ('of--dangerous--goods', 19),
 ('the--transport--of', 15),
 ('transport--of--dangerous', 15),
 ('madam--president--i', 13),
 ('that--the--commission', 13),
 ('president--i--would', 12),
 ('the--committee--on', 12),
 ('i--should--like', 11)]

In [170]:
source_sentences = text_en.split()
target_sentences = text_pt.split()

print(len(source_sentences), len(target_sentences))

9956 9954


In [171]:
from nltk import AlignedSent

bitext = []
bitext.append(AlignedSent(source_sentences, target_sentences))
ibm_model = IBMModel1(bitext, 5)  # 5 iterations for training


## Codificador-Decodificador

In [172]:
import tensorflow as tf

class Seq2SeqEncoder(d2l.Encoder):  #@save
    """The RNN encoder for sequence-to-sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(num_hiddens, num_layers, dropout)

    def call(self, X, *args):
        # X shape: (batch_size, num_steps)
        embs = self.embedding(tf.transpose(X))
        # embs shape: (num_steps, batch_size, embed_size)
        outputs, state = self.rnn(embs)
        # outputs shape: (num_steps, batch_size, num_hiddens)
        # state shape: (num_layers, batch_size, num_hiddens)
        return outputs, state

In [173]:
vocab_size, embed_size, num_hiddens, num_layers = 10, 8, 16, 2
batch_size, num_steps = 4, 9
encoder = Seq2SeqEncoder(vocab_size, embed_size, num_hiddens, num_layers)
X = tf.zeros((batch_size, num_steps))
enc_outputs, enc_state = encoder(X)
d2l.check_shape(enc_outputs, (num_steps, batch_size, num_hiddens))

In [174]:
d2l.check_len(enc_state, num_layers)
d2l.check_shape(enc_state[0], (batch_size, num_hiddens))

In [175]:
class Seq2SeqDecoder(d2l.Decoder):
    """The RNN decoder for sequence to sequence learning."""
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocab_size, embed_size)
        self.rnn = d2l.GRU(num_hiddens, num_layers, dropout)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def init_state(self, enc_all_outputs, *args):
        return enc_all_outputs

    def call(self, X, state):
        # X shape: (batch_size, num_steps)
        # embs shape: (num_steps, batch_size, embed_size)
        embs = self.embedding(tf.transpose(X))
        enc_output, hidden_state = state
        # context shape: (batch_size, num_hiddens)
        context = enc_output[-1]
        # Broadcast context to (num_steps, batch_size, num_hiddens)
        context = tf.tile(tf.expand_dims(context, 0), (embs.shape[0], 1, 1))
        # Concat at the feature dimension
        embs_and_context = tf.concat((embs, context), -1)
        outputs, hidden_state = self.rnn(embs_and_context, hidden_state)
        outputs = tf.transpose(self.dense(outputs), (1, 0, 2))
        # outputs shape: (batch_size, num_steps, vocab_size)
        # hidden_state shape: (num_layers, batch_size, num_hiddens)
        return outputs, [enc_output, hidden_state]

In [176]:
decoder = Seq2SeqDecoder(vocab_size, embed_size, num_hiddens, num_layers)
state = decoder.init_state(encoder(X))
dec_outputs, state = decoder(X, state)
d2l.check_shape(dec_outputs, (batch_size, num_steps, vocab_size))
d2l.check_len(state[1], num_layers)
d2l.check_shape(state[1][0], (batch_size, num_hiddens))

In [177]:
class Seq2Seq(d2l.EncoderDecoder):  #@save
    """The RNN encoder--decoder for sequence to sequence learning."""
    def __init__(self, encoder, decoder, tgt_pad, lr):
        super().__init__(encoder, decoder)
        self.save_hyperparameters()

    def validation_step(self, batch):
        Y_hat = self(*batch[:-1])
        self.plot('loss', self.loss(Y_hat, batch[-1]), train=False)

    def configure_optimizers(self):
        # Adam optimizer is used here
        return tf.keras.optimizers.Adam(learning_rate=self.lr)

In [178]:
@d2l.add_to_class(Seq2Seq)
def loss(self, Y_hat, Y):
    l = super(Seq2Seq, self).loss(Y_hat, Y, averaged=False)
    mask = tf.cast(tf.reshape(Y, -1) != self.tgt_pad, tf.float32)
    return tf.reduce_sum(l * mask) / tf.reduce_sum(mask)

In [186]:
@d2l.add_to_class(ParallelCorpus)
def get_dataloader(self, train):
    return d2l.load_data_nmt(batch_size=2, num_steps=10, num_examples=1000)

In [187]:
embed_size, num_hiddens, num_layers, dropout = 256, 256, 2, 0.2
with d2l.try_gpu():
    encoder = Seq2SeqEncoder(
        len(source_sentences), embed_size, num_hiddens, num_layers, dropout)
    decoder = Seq2SeqDecoder(
        len(target_sentences), embed_size, num_hiddens, num_layers, dropout)
    model = Seq2Seq(encoder, decoder, tgt_pad=vocab['<pad>'], lr=0.005)
trainer = d2l.Trainer(max_epochs=30, gradient_clip_val=1)

trainer.fit(model, data)

TypeError: '_BatchDataset' object is not subscriptable

In [34]:
@d2l.add_to_class(d2l.EncoderDecoder)  #@save
def predict_step(self, batch, device, num_steps,
                 save_attention_weights=False):
    src, tgt, src_valid_len, _ = batch
    enc_all_outputs = self.encoder(src, src_valid_len, training=False)
    dec_state = self.decoder.init_state(enc_all_outputs, src_valid_len)
    outputs, attention_weights = [tf.expand_dims(tgt[:, 0], 1), ], []
    for _ in range(num_steps):
        Y, dec_state = self.decoder(outputs[-1], dec_state, training=False)
        outputs.append(tf.argmax(Y, 2))
        # Save attention weights (to be covered later)
        if save_attention_weights:
            attention_weights.append(self.decoder.attention_weights)
    return tf.concat(outputs[1:], 1), attention_weights

In [None]:
engs = ['go .', 'i lost .', 'he\'s calm .', 'i\'m home .']
ports = ['vai .', 'eu perdi .', 'ele está calmo .', 'estou em casa .']
preds, _ = model.predict_step(
    data.build(engs, ports), d2l.try_gpu(), data.num_steps)
for en, pt, p in zip(engs, ports, preds):
    translation = []
    for token in vocab.to_tokens(p):
        if token == '<eos>':
            break
        translation.append(token)
    print(f'{en} => {translation}, bleu,'
          f'{sentence_bleu([pt.split()], translation, weights=(1, 0, 0, 0))}')