In [1]:
!pip install d2l




 Importar Bibliotecas e Configurar o Dataset

In [2]:
!pip uninstall numpy -y
!pip install numpy==1.21.6


Found existing installation: numpy 1.23.5
Uninstalling numpy-1.23.5:
  Successfully uninstalled numpy-1.23.5
[31mERROR: Ignored the following versions that require a different python version: 1.21.2 Requires-Python >=3.7,<3.11; 1.21.3 Requires-Python >=3.7,<3.11; 1.21.4 Requires-Python >=3.7,<3.11; 1.21.5 Requires-Python >=3.7,<3.11; 1.21.6 Requires-Python >=3.7,<3.11[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement numpy==1.21.6 (from versions: 1.3.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0, 1.6.1, 1.6.2, 1.7.0, 1.7.1, 1.7.2, 1.8.0, 1.8.1, 1.8.2, 1.9.0, 1.9.1, 1.9.2, 1.9.3, 1.10.0.post2, 1.10.1, 1.10.2, 1.10.4, 1.11.0, 1.11.1, 1.11.2, 1.11.3, 1.12.0, 1.12.1, 1.13.0, 1.13.1, 1.13.3, 1.14.0, 1.14.1, 1.14.2, 1.14.3, 1.14.4, 1.14.5, 1.14.6, 1.15.0, 1.15.1, 1.15.2, 1.15.3, 1.15.4, 1.16.0, 1.16.1, 1.16.2, 1.16.3, 1.16.4, 1.16.5, 1.16.6, 1.17.0, 1.17.1, 1.17.2, 1.17.3, 1.17.4, 1.17.5, 1.18.0, 1.18.1, 1.18.2, 1.18.3, 1.18.4, 1.18.5, 1.19.0, 1.19.1, 1.19.2, 1.19.3, 1.19.4, 

In [4]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from d2l import torch as d2l  # Use a versão PyTorch do d2l

# Configurar o dataset do Projeto Tatoeba
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + 'fra-eng.zip',
                           '94646ad1522d915e7b0f9296181140edcf86a4f5')


 Download e Pré‑processamento do Conjunto de Dados

In [5]:
def read_data_nmt():
    """Carrega o conjunto de dados inglês-francês."""
    data_dir = d2l.download_extract('fra-eng')
    with open(os.path.join(data_dir, 'fra.txt'), 'r') as f:
        return f.read()

raw_text = read_data_nmt()
print(raw_text[:75])


Downloading ../data/fra-eng.zip from http://d2l-data.s3-accelerate.amazonaws.com/fra-eng.zip...
Go.	Va !
Hi.	Salut !
Run!	Cours !
Run!	Courez !
Who?	Qui ?
Wow!	Ça alors !



Pré‑processamento e Tokenização

In [6]:
def preprocess_nmt(text):
    """Pré-processa o conjunto de dados: substitui espaços não separáveis, converte para minúsculas
    e insere espaços entre palavras e pontuações."""
    def no_space(char, prev_char):
        return char in set(',.!?') and prev_char != ' '
    text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower()
    out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char
           for i, char in enumerate(text)]
    return ''.join(out)

text = preprocess_nmt(raw_text)
print(text[:80])

def tokenize_nmt(text, num_examples=None):
    """Tokeniza o conjunto de dados em pares de frases (origem e destino)."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            target.append(parts[1].split(' '))
    return source, target

source, target = tokenize_nmt(text)
print(source[:6], target[:6])


go .	va !
hi .	salut !
run !	cours !
run !	courez !
who ?	qui ?
wow !	ça alors !
[['go', '.'], ['hi', '.'], ['run', '!'], ['run', '!'], ['who', '?'], ['wow', '!']] [['va', '!'], ['salut', '!'], ['cours', '!'], ['courez', '!'], ['qui', '?'], ['ça', 'alors', '!']]


Construção do Vocabulário e Preparação dos Dados

In [7]:
# Criação dos vocabulários para os dois idiomas
src_vocab = d2l.Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
tgt_vocab = d2l.Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
print("Tamanho do vocabulário de origem:", len(src_vocab))
print("Tamanho do vocabulário de destino:", len(tgt_vocab))

def truncate_pad(line, num_steps, padding_token):
    """Trunca ou preenche uma sequência para ter comprimento fixo."""
    if len(line) > num_steps:
        return line[:num_steps]
    return line + [padding_token] * (num_steps - len(line))

def build_array_nmt(lines, vocab, num_steps):
    """Converte sequências de texto em arrays, adicionando o token <eos> e aplicando truncamento/padding."""
    lines = [vocab[l] for l in lines]
    lines = [l + [vocab['<eos>']] for l in lines]
    array = [truncate_pad(l, num_steps, vocab['<pad>']) for l in lines]
    valid_len = [sum(token != vocab['<pad>'] for token in l) for l in array]
    return torch.tensor(array), torch.tensor(valid_len)

def load_data_nmt(batch_size, num_steps, num_examples=600):
    """Retorna o iterador de dados e os vocabulários para a tradução automática."""
    text = preprocess_nmt(read_data_nmt())
    source, target = tokenize_nmt(text, num_examples)
    src_vocab = d2l.Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    tgt_vocab = d2l.Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>'])
    src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps)
    tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps)
    data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len)
    return d2l.load_array(data_arrays, batch_size), src_vocab, tgt_vocab

batch_size = 2
num_steps = 8
train_iter, src_vocab, tgt_vocab = load_data_nmt(batch_size, num_steps, num_examples=600)

for X, X_valid_len, Y, Y_valid_len in train_iter:
    print('X:', X)
    print('Comprimentos válidos de X:', X_valid_len)
    print('Y:', Y)
    print('Comprimentos válidos de Y:', Y_valid_len)
    break


Tamanho do vocabulário de origem: 10012
Tamanho do vocabulário de destino: 17851
X: tensor([[ 58, 158,   2,   4,   5,   5,   5,   5],
        [ 59,  75,   2,   4,   5,   5,   5,   5]])
Comprimentos válidos de X: tensor([4, 4])
Y: tensor([[188,  43, 179,   2,   4,   5,   5,   5],
        [155, 197, 105, 115,   2,   4,   5,   5]])
Comprimentos válidos de Y: tensor([5, 6])


Definição do Modelo Seq2Seq (Encoder, Decoder e Modelo Completo)

In [8]:
# Encoder: codifica a sequência de entrada
class Encoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)

    def forward(self, x, valid_len):
        # x: (batch, seq_len)
        embedded = self.embedding(x)
        # Usa pack_padded_sequence para ignorar os tokens de padding
        packed = nn.utils.rnn.pack_padded_sequence(embedded, valid_len.cpu(), batch_first=True, enforce_sorted=False)
        _, hidden = self.gru(packed)
        return hidden

# Decoder: gera a sequência de saída
class Decoder(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        # x: (batch, 1) – o token atual de entrada
        embedded = self.embedding(x)
        output, hidden = self.gru(embedded, hidden)
        output = self.fc(output.squeeze(1))  # (batch, vocab_size)
        return output, hidden

# Modelo Seq2Seq que une Encoder e Decoder
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, src_valid_len, tgt, teacher_forcing_ratio=0.5):
        batch_size, tgt_len = tgt.size()
        vocab_size = self.decoder.embedding.num_embeddings
        outputs = torch.zeros(batch_size, tgt_len, vocab_size).to(self.device)

        hidden = self.encoder(src, src_valid_len)
        # O primeiro token do target (<bos>) é a entrada inicial do decoder
        input = tgt[:, 0].unsqueeze(1)  # shape (batch, 1)

        for t in range(1, tgt_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1).unsqueeze(1)
            input = tgt[:, t].unsqueeze(1) if teacher_force else top1
        return outputs

# Configurar dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hiperparâmetros
ENC_EMB_DIM = 32
DEC_EMB_DIM = 32
HID_DIM = 64

encoder = Encoder(len(src_vocab), ENC_EMB_DIM, HID_DIM).to(device)
decoder = Decoder(len(tgt_vocab), DEC_EMB_DIM, HID_DIM).to(device)
model = Seq2Seq(encoder, decoder, device).to(device)


 Treinamento do Modelo

In [10]:
# Função de perda ignorando os tokens de padding (<pad>)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab['<pad>'])
optimizer = optim.Adam(model.parameters())

def train_epoch(model, data_iter, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0
    correct_tokens = 0
    total_tokens = 0

    for src, src_valid_len, tgt, _ in data_iter:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        outputs = model(src, src_valid_len, tgt, teacher_forcing_ratio=0.5)
        # outputs: (batch, tgt_len, vocab_size) e tgt: (batch, tgt_len)
        output_dim = outputs.shape[-1]
        outputs_reshaped = outputs[:, 1:].reshape(-1, output_dim)
        tgt_reshaped = tgt[:, 1:].reshape(-1)

        loss = criterion(outputs_reshaped, tgt_reshaped)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

        # Cálculo da acurácia dos tokens (ignorando padding)
        predictions = outputs_reshaped.argmax(1)
        mask = tgt_reshaped != tgt_vocab['<pad>']
        correct_tokens += (predictions[mask] == tgt_reshaped[mask]).sum().item()
        total_tokens += mask.sum().item()

    avg_loss = epoch_loss / len(data_iter)
    accuracy = correct_tokens / total_tokens if total_tokens > 0 else 0
    return avg_loss, accuracy

EPOCHS = 20
CLIP = 1.0

for epoch in range(EPOCHS):
    loss, accuracy = train_epoch(model, train_iter, optimizer, criterion, CLIP)
    print(f'Época {epoch+1}, Loss: {loss:.4f}, Accuracy: {accuracy:.4f}')


Época 1, Loss: 0.8841, Accuracy: 0.7608
Época 2, Loss: 0.7650, Accuracy: 0.7830
Época 3, Loss: 0.7564, Accuracy: 0.7866
Época 4, Loss: 0.6718, Accuracy: 0.8062
Época 5, Loss: 0.6000, Accuracy: 0.8289
Época 6, Loss: 0.5616, Accuracy: 0.8268
Época 7, Loss: 0.5344, Accuracy: 0.8438
Época 8, Loss: 0.4787, Accuracy: 0.8552
Época 9, Loss: 0.4483, Accuracy: 0.8624
Época 10, Loss: 0.4131, Accuracy: 0.8680
Época 11, Loss: 0.3844, Accuracy: 0.8727
Época 12, Loss: 0.3722, Accuracy: 0.8742
Época 13, Loss: 0.3449, Accuracy: 0.8861
Época 14, Loss: 0.3389, Accuracy: 0.8840
Época 15, Loss: 0.2773, Accuracy: 0.8974
Época 16, Loss: 0.2928, Accuracy: 0.8887
Época 17, Loss: 0.3108, Accuracy: 0.8923
Época 18, Loss: 0.2918, Accuracy: 0.8959
Época 19, Loss: 0.2763, Accuracy: 0.8954
Época 20, Loss: 0.2861, Accuracy: 0.8876
