In [13]:
import torch
from torch import nn
from pathlib import Path
import numpy as np

# Fix the PyTorch RNG seeds (CPU generator first, then the CUDA one) so that
# random draws made later in the notebook start from a known state.
torch.manual_seed(42)
torch.cuda.manual_seed(42)

# Switch on cuDNN benchmark mode.
# NOTE(review): benchmark mode may trade run-to-run determinism for speed —
# confirm that is acceptable alongside the fixed seeds above.
torch.backends.cudnn.benchmark = True

import sys
sys.path.append("..")

from nmt.models import NMT
from nmt.datasets import read_corpus, batch_iter, Vocab
import tqdm

In [3]:
# Dataset root, expressed relative to the current working directory.
data_loc = Path("..", "nmt", "datasets", "data")
en_es_data_loc = data_loc.joinpath("en_es_data")

# Tiny parallel corpus: the ".es" files are used as the source side and the
# ".en" files as the target side.
train_data_src_path = en_es_data_loc.joinpath("train_tiny.es")
train_data_tgt_path = en_es_data_loc.joinpath("train_tiny.en")
dev_data_src_path = en_es_data_loc.joinpath("dev_tiny.es")
dev_data_tgt_path = en_es_data_loc.joinpath("dev_tiny.en")

# Vocabulary file; it lives next to (not inside) the en_es_data directory.
vocab_path = data_loc.joinpath("vocab_tiny_q2.json")

In [4]:
# Read both sides of the training corpus; the target side is flagged with
# is_target=True (presumably so read_corpus treats it differently, e.g. by
# adding sentence boundary tokens — confirm in nmt.datasets).
train_src, train_tgt = (
    read_corpus(train_data_src_path),
    read_corpus(train_data_tgt_path, is_target=True),
)

# Load the vocabulary from its JSON file.
vocab = Vocab.load(vocab_path)

In [5]:
# Show the sizes of the source and target vocabularies as a (src, tgt) pair.
tuple(map(len, (vocab.src, vocab.tgt)))

(26, 32)

In [6]:
# Read both sides of the dev corpus, mirroring how the training corpus is read
# (source side plain, target side with is_target=True).
valid_src, valid_tgt = (
    read_corpus(dev_data_src_path),
    read_corpus(dev_data_tgt_path, is_target=True),
)

In [7]:
# --- Run configuration -------------------------------------------------------

# Data / schedule
BATCH_SIZE = 2          # sentences per mini-batch
MAX_EPOCH = 201         # number of passes over the training data
SEED = 42               # random seed value

# Model
EMBEDDING_SIZE = 256    # embedding dimensionality
HIDDEN_SIZE = 256       # hidden state size
USE_CHAR_DECODER = True # whether the character-level decoder is enabled

# Optimisation
LEARNING_RATE = 0.001   # optimizer step size
GRAD_CLIP = 5.0         # maximum gradient norm
UNIFORM_INIT = 0.1      # half-width of the uniform parameter initialisation

In [8]:
# Build the translation model from the loaded vocabulary and the sizes chosen
# in the configuration cell.
model = NMT(
    vocab=vocab,
    embedding_dim=EMBEDDING_SIZE,
    hidden_size=HIDDEN_SIZE,
    # Honour the configuration constant rather than a hard-coded literal, so
    # that changing USE_CHAR_DECODER actually changes the model.
    use_char_decoder=USE_CHAR_DECODER,
)

In [11]:
# Put the model into training mode before optimisation starts.
# (Presumably the standard nn.Module.train(), which returns the module itself —
# confirm that NMT subclasses nn.Module.)
model = model.train()

In [14]:
# Re-initialise every model parameter uniformly in [-bound, +bound].
uniform_init = UNIFORM_INIT
# Work with the magnitude so a negative setting still yields a well-formed
# range (low <= high) and a readable log line.
init_bound = abs(uniform_init)
if init_bound > 0.:
    print('uniformly initialize parameters [-%f, +%f]' %
          (init_bound, init_bound), file=sys.stderr)
    for p in model.parameters():
        # nn.init.uniform_ fills the tensor in place without recording the
        # operation in autograd, replacing direct mutation through `.data`.
        nn.init.uniform_(p, -init_bound, init_bound)

uniformly initialize parameters [-0.100000, +0.100000]


In [15]:
# Adam over all model parameters, using the configured learning rate.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=LEARNING_RATE,
)

In [16]:
%%time
for epoch in range(MAX_EPOCH):
    cum_loss = 0
    for i, (src_sents, tgt_sents) in enumerate(batch_iter((train_src, train_tgt), batch_size=BATCH_SIZE, shuffle=True)):
        optimizer.zero_grad()
        batch_size = len(src_sents)

        batch_loss = -model(src_sents, tgt_sents).sum()
        batch_loss /= batch_size
        cum_loss += batch_loss
        batch_loss.backward()

         # clip gradient
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP)
        optimizer.step()
    cum_loss /= len(train_src)
    print(f"Epoch: {str(epoch).zfill(3)} - Cumulative loss: {cum_loss}")

Epoch: 000 - Cumulative loss: 76.59217834472656
Epoch: 001 - Cumulative loss: 73.57061767578125
Epoch: 002 - Cumulative loss: 70.48408508300781
Epoch: 003 - Cumulative loss: 69.05148315429688
Epoch: 004 - Cumulative loss: 66.44526672363281
Epoch: 005 - Cumulative loss: 63.42730712890625
Epoch: 006 - Cumulative loss: 60.61967086791992
Epoch: 007 - Cumulative loss: 57.5542106628418
Epoch: 008 - Cumulative loss: 54.16157150268555
Epoch: 009 - Cumulative loss: 51.2515869140625
Epoch: 010 - Cumulative loss: 49.34196853637695
Epoch: 011 - Cumulative loss: 48.279747009277344
Epoch: 012 - Cumulative loss: 47.420223236083984
Epoch: 013 - Cumulative loss: 46.780296325683594
Epoch: 014 - Cumulative loss: 45.952064514160156
Epoch: 015 - Cumulative loss: 45.76457977294922
Epoch: 016 - Cumulative loss: 45.33715057373047
Epoch: 017 - Cumulative loss: 45.0976676940918
Epoch: 018 - Cumulative loss: 44.86919403076172
Epoch: 019 - Cumulative loss: 44.57324981689453
Epoch: 020 - Cumulative loss: 44.372322