In [1]:
import torch
from torch import nn
import numpy as np
from pathlib import Path

# Seed NumPy and PyTorch (CPU generator and the current CUDA device) with one
# shared value so repeated runs start from the same random state.
_RNG_SEED = 42
np.random.seed(_RNG_SEED)
torch.manual_seed(_RNG_SEED)
torch.cuda.manual_seed(_RNG_SEED)

# Let cuDNN benchmark and pick convolution algorithms per input shape.
# NOTE(review): per the PyTorch reproducibility notes this can make GPU runs
# non-deterministic despite the seeds above — confirm that is acceptable.
torch.backends.cudnn.benchmark = True

import sys
# Add the parent directory to the import path so the `nmt` package imported
# below resolves (assumes the notebook runs from a directory one level under
# the package root — TODO confirm).
sys.path.append("..")

from nmt.models import NMT
from nmt.datasets import read_corpus, batch_iter, Vocab
import tqdm

In [2]:
# Dataset root, relative to the notebook's working directory.
data_loc = Path("..", "nmt", "datasets", "data")
en_es_data_loc = data_loc.joinpath("en_es_data")

# Source files carry the .es suffix, target files the .en suffix.
train_data_src_path = en_es_data_loc.joinpath("train_tiny.es")
train_data_tgt_path = en_es_data_loc.joinpath("train_tiny.en")
dev_data_src_path = en_es_data_loc.joinpath("dev_tiny.es")
dev_data_tgt_path = en_es_data_loc.joinpath("dev_tiny.en")

# Vocabulary file stored next to the datasets.
vocab_path = data_loc.joinpath("vocab_tiny_q2.json")

In [3]:
# Read the training corpora; source and target sides live in separate files.
train_src = read_corpus(train_data_src_path)
# NOTE(review): is_target=True presumably adds sentence boundary markers to
# each target sentence — confirm against nmt.datasets.read_corpus.
train_tgt = read_corpus(train_data_tgt_path, is_target=True)
# Load the vocabulary from the JSON file rather than deriving it from the corpus here.
vocab = Vocab.load(vocab_path)

In [4]:
# Number of entries in the source and target vocabularies.
len(vocab.src), len(vocab.tgt)

(26, 32)

In [5]:
# Read the dev corpora the same way as the training data; the first dev
# sentence is used later as the decoding example.
valid_src = read_corpus(dev_data_src_path)
valid_tgt = read_corpus(dev_data_tgt_path, is_target=True)

In [6]:
# --- Training schedule ---
BATCH_SIZE = 2          # sentences per batch passed to batch_iter
MAX_EPOCH = 201         # number of passes over the training data
LEARNING_RATE = 0.001   # learning rate handed to the optimizer
GRAD_CLIP = 5.0         # max gradient norm used by clip_grad_norm_

# --- Model ---
EMBEDDING_SIZE = 256
HIDDEN_SIZE = 256
USE_CHAR_DECODER = True

# --- Initialisation / reproducibility ---
UNIFORM_INIT = 0.1      # parameters are drawn from [-UNIFORM_INIT, +UNIFORM_INIT]
SEED = 42               # NOTE(review): not referenced by the visible cells, which seed with a literal 42

In [7]:
# Build the model entirely from the hyperparameter constants so that toggling
# USE_CHAR_DECODER in the configuration cell actually takes effect here
# (it was previously hard-coded to True and the constant was ignored).
model = NMT(
    vocab=vocab,
    embedding_dim=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    use_char_decoder=USE_CHAR_DECODER,
)

In [8]:
# Put the model in training mode so the Dropout layers are active; the call
# returns the module, which is why the cell echoes its repr.
model.train()

NMT(
  (encoder): Encoder(
    (embedding): CharEmbedding(
      (char_embed): Embedding(97, 50, padding_idx=0)
      (cnn_embed): CharCNNEmbedding(
        (conv): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
        (maxpool): AdaptiveMaxPool1d(output_size=1)
      )
      (highway): Highway(
        (linear): Linear(in_features=256, out_features=256, bias=True)
        (gate): Linear(in_features=256, out_features=256, bias=True)
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): LSTM(256, 256, num_layers=2, bidirectional=True)
    (hidden_projection): Linear(in_features=512, out_features=256, bias=False)
    (cell_projection): Linear(in_features=512, out_features=256, bias=False)
  )
  (decoder): Decoder(
    (embedding): CharEmbedding(
      (char_embed): Embedding(97, 50, padding_idx=0)
      (cnn_embed): CharCNNEmbedding(
        (conv): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
        (maxpool): AdaptiveMaxPool1d(output_size=1)
      )
      (hig

In [9]:
# Re-draw every model parameter from U(-UNIFORM_INIT, +UNIFORM_INIT).
# NOTE(review): this also overwrites the embedding rows at padding_idx=0 shown
# in the model repr, so padding vectors are no longer zero — confirm intended.
uniform_init = UNIFORM_INIT
if abs(uniform_init) > 0.0:
    bounds = (uniform_init, uniform_init)
    print('uniformly initialize parameters [-%f, +%f]' % bounds, file=sys.stderr)
    # In-place initialisation must not be recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            param.uniform_(-uniform_init, uniform_init)

uniformly initialize parameters [-0.100000, +0.100000]


In [10]:
# Adam over all model parameters with a fixed learning rate; created after the
# uniform re-initialisation cell so it optimises the re-initialised weights.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [11]:
%%time
for epoch in range(MAX_EPOCH):
    cum_loss = 0
    for i, (src_sents, tgt_sents) in enumerate(batch_iter((train_src, train_tgt), batch_size=BATCH_SIZE, shuffle=True)):
        optimizer.zero_grad()
        batch_size = len(src_sents)

        batch_loss = -model(src_sents, tgt_sents).sum()
        batch_loss /= batch_size
        cum_loss += batch_loss
        batch_loss.backward()

         # clip gradient
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
    cum_loss /= len(train_src)
    print(f"Epoch: {str(epoch).zfill(3)} - Cumulative loss: {cum_loss}")

CPU times: user 9min 39s, sys: 5 s, total: 9min 44s
Wall time: 51 s


In [13]:
# Switch to evaluation mode (equivalent to train(False)) so Dropout is
# disabled for the decoding cells below; the returned module is echoed.
model.eval()

NMT(
  (encoder): Encoder(
    (embedding): CharEmbedding(
      (char_embed): Embedding(97, 50, padding_idx=0)
      (cnn_embed): CharCNNEmbedding(
        (conv): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
        (maxpool): AdaptiveMaxPool1d(output_size=1)
      )
      (highway): Highway(
        (linear): Linear(in_features=256, out_features=256, bias=True)
        (gate): Linear(in_features=256, out_features=256, bias=True)
      )
      (dropout): Dropout(p=0.3, inplace=False)
    )
    (encoder): LSTM(256, 256, num_layers=2, bidirectional=True)
    (hidden_projection): Linear(in_features=512, out_features=256, bias=False)
    (cell_projection): Linear(in_features=512, out_features=256, bias=False)
  )
  (decoder): Decoder(
    (embedding): CharEmbedding(
      (char_embed): Embedding(97, 50, padding_idx=0)
      (cnn_embed): CharCNNEmbedding(
        (conv): Conv1d(50, 256, kernel_size=(5,), stride=(1,))
        (maxpool): AdaptiveMaxPool1d(output_size=1)
      )
      (hig

In [14]:
# Wrap the first dev sentence in a list so it forms a batch of one.
example = [valid_src[0]]
# Per-sentence token counts; passed to the encoder together with the tensor.
example_length = [len(e) for e in example]
print(example, example_length)


[['Muchas', 'gracias', 'Chris.', 'Y', 'es', 'en', 'verdad', 'un', 'gran', 'honor', 'tener', 'la', 'oportunidad', 'de', 'venir', 'a', 'este', 'escenario', 'por', 'segunda', 'vez.', 'Estoy', 'extremadamente', 'agradecido.']] [24]


In [15]:
# Two encodings of the same sentence. In the recorded run the tokens=True
# tensor has shape (24, 1) and the tokens=False tensor has shape (24, 1, 21),
# i.e. the latter carries an extra per-word axis (presumably character ids
# padded to the longest word — TODO confirm in Vocab.to_tensor).
example_token_tensor = vocab.src.to_tensor(example, tokens=True)
# Only the tokens=False tensor is fed to the encoder in the cells below.
example_char_tensor = vocab.src.to_tensor(example, tokens=False)

In [16]:
# Compare the shapes of the two encodings; the leading 24 matches the
# sentence length printed for the example.
example_char_tensor.shape, example_token_tensor.shape

(torch.Size([24, 1, 21]), torch.Size([24, 1]))

In [17]:
# Run the encoder on the example: enc_out is later handed to the decoder at
# every step, enc_state is used as the decoder's initial state.
enc_out, enc_state = model.encoder(example_char_tensor, example_length)

In [18]:
# Recorded shape is (1, 24, 512): presumably (batch, src_len, 2 * hidden_size)
# from the bidirectional LSTM — TODO confirm in the Encoder.
enc_out.shape

torch.Size([1, 24, 512])

In [19]:
# The decoder starts from the state returned by the encoder.
dec_state = enc_state

In [20]:
# First decoder input: a one-sentence, one-token batch holding the start
# token "<s>", encoded with tokens=False like the encoder input.
y_t = vocab.tgt.to_tensor([["<s>"]], tokens=False)

In [21]:
# Zero tensor of shape (1, hidden_size) passed as the decoder's fourth
# argument on the first step (presumably the previous combined output used
# for input feeding — TODO confirm in the Decoder).
out = torch.zeros(1, model.hidden_size)

In [22]:
# One decoder step: updates the output and the decoder state; the third
# return value is discarded (presumably attention weights — TODO confirm).
out, dec_state, _ = model.decoder(y_t, enc_out, dec_state, out)

In [23]:
# Project the decoder output with the model's target layer; the argmax of the
# result is looked up in the target vocabulary below.
# NOTE(review): this rebinds `out`, so it no longer holds the decoder output.
out = model.target_layer(out)

In [24]:
# Greedy choice: argmax() without `dim` indexes the flattened tensor, which is
# the vocabulary index here only because the batch holds a single sentence.
index = out.argmax().item()

In [26]:
# Display the index chosen for the first decoding step.
index

3

In [25]:
# Map the chosen index back to its target-vocabulary token.
vocab.tgt.to_tokens(index)

'<unk>'

## Bringing it all together

In [27]:
# Same conversion as in the step-by-step walkthrough, repeated so the full
# decoding section starts from the raw example.
example_char_tensor = vocab.src.to_tensor(example, tokens=False)

In [28]:
# Re-encode the example to obtain fresh encoder outputs and a fresh encoder
# state for the complete decoding loop.
enc_out, enc_state = model.encoder(example_char_tensor, example_length)

In [29]:
# Greedy decoding of the encoded example: feed the previously predicted token
# back in until the end token is produced or the step budget is exhausted.
MAX_DECODE_STEPS = 40  # upper bound on the number of generated tokens

token = "<s>"
# Start from the encoder state computed for this example. Without this reset
# the loop would continue from whatever `dec_state` an earlier cell left behind.
dec_state = enc_state
out = torch.zeros(1, model.hidden_size)
sent = []          # decoded tokens, end token excluded
combined_out = []  # decoder output of every step, kept for inspection

# Inference only: skip autograd bookkeeping for the decoder steps.
with torch.no_grad():
    for step in range(MAX_DECODE_STEPS):
        y_t = vocab.tgt.to_tensor([[token]], tokens=False)
        out, dec_state, _ = model.decoder(y_t, enc_out, dec_state, out)
        combined_out.append(out)
        logit = model.target_layer(out)
        # argmax() over the flattened scores is the vocabulary index because
        # the batch holds a single sentence.
        index = logit.argmax().item()
        if index == vocab.tgt.end_token_idx:
            break
        token = vocab.tgt.to_tokens(index)
        sent.append(token)

In [30]:
# Display the decoded tokens (the end token is never appended, since the loop
# breaks before the append when it is predicted).
sent

['<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>',
 '<unk>']