In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [16]:
from datasets import load_dataset
nmt_dataset = load_dataset("Helsinki-NLP/opus_books", "en-es")  # only if you trust the user
split = nmt_dataset["train"].train_test_split(train_size=0.8, seed=42)
nmt_train_set_temp, nmt_test_set = split["train"], split["test"]
split = nmt_train_set_temp.train_test_split(train_size=0.8, seed=42)
nmt_train_set, nmt_valid_set = split["train"], split["test"]

print(nmt_train_set)
print(nmt_valid_set)
print(nmt_test_set)

Dataset({
    features: ['id', 'translation'],
    num_rows: 59820
})
Dataset({
    features: ['id', 'translation'],
    num_rows: 14956
})
Dataset({
    features: ['id', 'translation'],
    num_rows: 18694
})


In [17]:
nmt_train_set[0]

{'id': '73782',
 'translation': {'en': '"Yes, sir," the captain replied, "and if I have no hesitation in treading this polar soil, it\'s because no human being until now has left a footprint here."',
  'es': '-Sí, señor, en efecto -respondió el capitán-, y lo hago sin vacilación porque ningún ser humano ha plantado hasta ahora el pie en esta tierra del Polo.'}}

In [18]:
import tokenizers

def train_eng_spa():  # a generator function to iterate over all training text
    for pair in nmt_train_set["translation"]:
        yield pair["en"]
        yield pair["es"]

max_length = 500
vocab_size = 10_000
nmt_tokenizer_model = tokenizers.models.BPE(unk_token="<unk>")
nmt_tokenizer = tokenizers.Tokenizer(nmt_tokenizer_model)
nmt_tokenizer.enable_padding(pad_id=0, pad_token="<pad>")
nmt_tokenizer.enable_truncation(max_length=max_length)
nmt_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Whitespace()
nmt_tokenizer_trainer = tokenizers.trainers.BpeTrainer(vocab_size=vocab_size, special_tokens=["<pad>", "<unk>", "<s>", "</s>"])
nmt_tokenizer.train_from_iterator(train_eng_spa(), nmt_tokenizer_trainer)






In [19]:
nmt_tokenizer.encode("I like soccer").ids

[42, 652, 232, 66, 322]

In [45]:
print([nmt_tokenizer.id_to_token(id) for id in [42, 652, 232, 66, 322]])

['I', 'like', 'so', 'c', 'cer']


In [46]:
from collections import namedtuple
from torch.utils.data import DataLoader

fields = ["src_token_ids", "src_mask", "tgt_token_ids", "tgt_mask"]
class NmtPair(namedtuple("NmtPairBase", fields)):
    def to(self, device):
        return NmtPair(self.src_token_ids.to(device), self.src_mask.to(device),
                       self.tgt_token_ids.to(device), self.tgt_mask.to(device))
        
def nmt_collate_fn(batch):
    src_texts = [pair['translation']['en'] for pair in batch]
    tgt_texts = [f"<s> {pair['translation']['es']} </s>" for pair in batch]
    src_encodings = nmt_tokenizer.encode_batch(src_texts)
    tgt_encodings = nmt_tokenizer.encode_batch(tgt_texts)
    src_token_ids = torch.tensor([enc.ids for enc in src_encodings])
    tgt_token_ids = torch.tensor([enc.ids for enc in tgt_encodings])
    src_mask = torch.tensor([enc.attention_mask for enc in src_encodings])
    tgt_mask = torch.tensor([enc.attention_mask for enc in tgt_encodings])
    inputs = NmtPair(src_token_ids, src_mask, tgt_token_ids[:, :-1], tgt_mask[:, :-1])
    labels = tgt_token_ids[:, 1:]
    return inputs, labels

batch_size = 32
nmt_train_loader = DataLoader(nmt_train_set, batch_size=batch_size, collate_fn=nmt_collate_fn, shuffle=True)
nmt_valid_loader = DataLoader(nmt_valid_set, batch_size=batch_size, collate_fn=nmt_collate_fn)
nmt_test_loader = DataLoader(nmt_test_set, batch_size=batch_size, collate_fn=nmt_collate_fn)

In [21]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [47]:
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

class NmtModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=512, pad_id=0, hidden_dim=512, n_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_id)
        self.encoder = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.decoder = nn.GRU(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True)
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, pair):
        src_embeddings = self.embed(pair.src_token_ids)
        tgt_embeddings = self.embed(pair.tgt_token_ids)
        src_lengths = pair.src_mask.sum(dim=1)
        src_packed = pack_padded_sequence(src_embeddings, lengths=src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, hidden_states = self.encoder(src_packed)
        outputs, _ = self.decoder(tgt_embeddings, hidden_states)
        return self.output(outputs).permute(0, 2, 1)

torch.manual_seed(42)
vocab_size = nmt_tokenizer.get_vocab_size()
nmt_model = NmtModel(vocab_size).to(device)
print(nmt_model)

NmtModel(
  (embed): Embedding(10000, 512, padding_idx=0)
  (encoder): GRU(512, 512, num_layers=2, batch_first=True)
  (decoder): GRU(512, 512, num_layers=2, batch_first=True)
  (output): Linear(in_features=512, out_features=10000, bias=True)
)


In [48]:
xentropy = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(nmt_model.parameters(), lr=0.005)

In [51]:
num_epochs = 2
torch.manual_seed(1)
for epoch in range(num_epochs):
    nmt_model.train()
    running_loss = 0.0
    idx = 0
    for seq_batch_pair, target_batch in nmt_train_loader:
        seq_batch_pair, target_batch = seq_batch_pair.to(device), target_batch.to(device)
        
        optimizer.zero_grad()
        pred_logits = nmt_model(seq_batch_pair)
        loss = xentropy(pred_logits, target_batch)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        idx += 1
        if idx >= 500:
            break
    
    epoch_loss = running_loss / len(nmt_train_loader)
    print(f'Epoch {epoch} loss: {loss:.4f}')

Epoch 0 loss: 5.4754
Epoch 1 loss: nan


In [52]:
def translate(model, src_text, max_length=20, pad_id=0, eos_id=3):
    tgt_text = ""
    token_ids = []
    for index in range(max_length):
        batch, _ = nmt_collate_fn([{"translation": {"en": src_text, "es": tgt_text}}])
        with torch.no_grad():
            Y_logits = model(batch.to(device))
            Y_token_ids = Y_logits.argmax(dim=1)  # find the best token IDs
            next_token_id = Y_token_ids[0, index]  # take the last token ID

        next_token = nmt_tokenizer.id_to_token(next_token_id)
        tgt_text += " " + next_token
        if next_token_id == eos_id:
            break
    return tgt_text

nmt_model.eval()
print(translate(nmt_model, "I like soccer."))

 <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
