In [65]:
# Text text processing library
import torchtext
from torchtext import data
from torchtext import datasets
from torchtext.vocab import Vectors
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import itertools as it
# from models import *
# from helpers import *
# import main
import matplotlib.pyplot as plt
import spacy
import time
MAX_LEN = 20
MIN_FREQ = 5

In [6]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(text)]

def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

BOS_WORD = '<s>'
EOS_WORD = '</s>'
DE = data.Field(tokenize=tokenize_de)

# only target needs BOS/EOS:
EN = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, eos_token = EOS_WORD) 

train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN), 
                                         filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                         len(vars(x)['trg']) <= MAX_LEN)

In [13]:
DE.build_vocab(train.src, min_freq=MIN_FREQ)
EN.build_vocab(train.trg, min_freq=MIN_FREQ)

train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=32, device=-1,
                                                  repeat=False, sort_key=lambda x: len(x.src))

In [44]:
batch = next(iter(train_iter))
idx = 1
print("Source")
print(' '.join([DE.vocab.itos[w] for w in batch.src.data[:,idx]]))
print("Target")
print(' '.join([EN.vocab.itos[w] for w in batch.trg.data[:,idx]]))

Source
Ein Stück <unk> an einer Baustelle in Italien .
Target
<s> Just a piece of barrier tape construction stuff in Italy . </s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>


In [58]:
len(EN.vocab), len(DE.vocab)

(11560, 13353)

In [101]:
train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=32, device=-1,
                                                  repeat=False, sort_key=lambda x: len(x.src))
bs_encoder = BaseEncoder(DE, hidden_size=100, num_layers=2, word_features=100)
bs_decoder = BaseDecoder(EN, hidden_size=100, num_layers=2, word_features=100)
trainer = NMTTrainer([bs_encoder, bs_decoder], DE, EN, lrn_rate=0.7)
trainer.train(train_iter, verbose=True)

CUDA is unavailable...
TEMP:  186.827 10.651452105461175
TEMP:  180.888 6.730905208584138
TEMP:  183.468 44.46183448259382
TEMP:  126.666 7.345000082843346
TEMP:  130.154 4.384713661335517
TEMP:  155.419 8.643194848437062
TEMP:  217.35 74.59255437048532
TEMP:  110.027 25.673868561420853
TEMP:  80.7346 13.579186832739499
TEMP:  147.5 12.46200157549876
TEMP:  144.159 24.01481337725272
TEMP:  164.856 5.799859776384146
TEMP:  149.503 22.16246598622003
TEMP:  147.356 14.671047749314548
TEMP:  129.251 22.714226378569684
TEMP:  126.423 12.826643204674701
TEMP:  64.7871 9.312281941284146
TEMP:  173.61 28.47889382610767
TEMP:  97.2872 8.316522312129257
TEMP:  65.3315 5.6288058660925016
TEMP:  106.872 28.752997502102012
TEMP:  111.227 8.045418292919813
TEMP:  152.58 20.316198119053688
TEMP:  149.412 5.2775916057064665
TEMP:  100.491 11.567283108117406
TEMP:  89.2133 18.87042391690916
TEMP:  139.211 9.859042851986292
TEMP:  118.618 22.723975982103127
TEMP:  148.696 6.534788367145107
TEMP:  111.71

KeyboardInterrupt: 

In [100]:
class EmbeddingsLM(nn.Module):
    def __init__(self, TEXT, **kwargs):
        super(EmbeddingsLM, self).__init__()
        # Initialize dropout
        self.dropout_prob = kwargs.get('dropout', 0.0)
        self.dropout = nn.Dropout(self.dropout_prob)
        
        # V is size of vocab, D is dim of embedding
        self.V = len(TEXT.vocab)
        max_embed_norm = kwargs.get('max_embed_norm', None)
        self.D = kwargs.get('word_features', 1000)
        self.embeddings = nn.Embedding(self.V, self.D, max_norm=max_embed_norm)

class BaseEncoder(EmbeddingsLM):
    def __init__(self, TEXT, **kwargs):
        super(BaseEncoder, self).__init__(TEXT, **kwargs)
        self.hidden_size = kwargs.get('hidden_size', 1000)
        self.num_layers = kwargs.get('num_layers', 4)
        self.lstm = nn.LSTM(input_size=self.D, hidden_size=self.hidden_size,
                            num_layers=self.num_layers,
                            dropout=self.dropout_prob, batch_first=True)
        
    def forward(self, input_tsr, hidden):
        # [batch_sz, sent_len, D]:
        embedded_tsr = self.embeddings(input_tsr)

        # output is [batch, sent_len, hidden_size]
        output, hidden = self.lstm(embedded_tsr, hidden)
        
        # TODO: perhaps add dropout to output
        return output, hidden

class BaseDecoder(BaseEncoder):
    def __init__(self, TEXT, **kwargs):
        super(BaseDecoder, self).__init__(TEXT, **kwargs)
        # V is the size of the vocab, which is what we're predicting
        # (it's also used as input through the embedding)
        self.num_context = kwargs.get('num_context', 1)
        # For now assume that encoder and decoder have same hidden size
        blowup = self.num_context * self.num_layers + 1
        self.out_linear = nn.Linear(blowup * self.hidden_size, self.V)

    # Context is a tuple (h_T, c_T) of hidden and cell states from
    # last time step of encoder
    def forward(self, input_tsr, hidden, context):
        # [batch_sz, sent_len, D] : note that sent_len may be 1 if we
        # feed in each word at a time!
        embedding = self.embeddings(input_tsr)
        embedding = F.relu(embedding)
        output, hidden = self.lstm(embedding, hidden)

        if self.num_context:
            # We get lucky that hidden is stored as (h,c), 
            # so hidden (not cell) first
            context_tsr = torch.cat(context[:self.num_context])
            batch_sz = context_tsr.size(1)
            sent_len = output.size(1)
            # [batch_sz, 1, hidden_size * num_context]
            context_tsr = context_tsr.permute(1,0,2).contiguous().view(batch_sz, 1, -1)
            context_tsr = context_tsr.expand(-1, sent_len, -1)
            # [batch_sz, sent_len, hidden_sz * (num_context + 1)]
            output = torch.cat((output, context_tsr), dim=2)

        # output is now [batch, sent_len, V]:
        output = self.out_linear(output)
        output = F.log_softmax(output, dim=2)
        return output, hidden



In [102]:
class NMTModelUser(object):
    # Models is a list [Encoder, Decoder]
    def __init__(self, models, TEXT_SRC, TEXT_TRG, **kwargs):
        self._TEXT_SRC = TEXT_SRC
        self._TEXT_TRG = TEXT_TRG
        self.models = models
        self.cuda = kwargs.get('cuda', True) and \
                    torch.cuda.is_available()
        if self.cuda:
            print('Using CUDA...')
        else:
            print('CUDA is unavailable...')

    def get_src_and_trg(self, batch):
        src = torch.t(batch.src.data).contiguous()
        trg = torch.t(batch.trg.data).contiguous()
        return (src, trg)

    def zeros_hidden(self, batch_sz, model_num):
        return torch.zeros(self.models[model_num].num_layers, batch_sz,
                           self.models[model_num].hidden_size)

    # Ok to have self.prev_hidden apply to encoder then decoder since
    # encoder all ends before decoder starts
    def prepare_hidden(self, batch_sz, zero_out=True, model_num=0):
        if (not self.prev_hidden is None) and (not zero_out):
            pre_hidden = self.prev_hidden
        else:
            pre_hidden = (self.zeros_hidden(batch_sz, model_num) \
                          for i in range(2))
        if self.cuda:
            pre_hidden = tuple(t.cuda() for t in pre_hidden)
        return tuple(autograd.Variable(t) for t in pre_hidden)

    # kwargs can contain zero_out, model_num for prepare_hidden
    def prepare_model_inputs(self, batch, **kwargs):
        if self.cuda:
            src, trg = tuple(t.cuda() for t in self.get_src_and_trg(batch))
        else:
            src, trg = self.get_src_and_trg(batch)

        # TODO: can comment this out (assuming it passes)
        assert batch.src.size(1) == batch.trg.size(1)
        var_hidden = self.prepare_hidden(batch.src.size(1), **kwargs)

        var_src = autograd.Variable(src)
        var_trg = autograd.Variable(trg)

        return (var_src, var_trg, var_hidden)

    def init_epoch(self):
        self.prev_hidden = None

    # Assume log_probs is [batch_sz, sent_len, V], output is
    # [batch_sz, sent_len]
    @staticmethod
    def nll_loss(log_probs, output, **kwargs):
        sent_len = log_probs.size(1)
        log_probs_rshp = log_probs.view(-1, log_probs.size(2))
        output_rshp = output.view(-1)
        return F.nll_loss(log_probs_rshp, output_rshp, **kwargs) * \
            sent_len

class NMTEvaluator(NMTModelUser):
    def __init__(self, models, TEXT_SRC, TEXT_TRG, **kwargs):
        super(NMTEvaluator, self).__init__(models, TEXT_SRC, TEXT_TRG,
                                           **kwargs)

    def evaluate(self, test_iter, num_iter=None):
        start_time = time.time()
        for model in self.models:
            model.eval()

        for i,batch in enumerate(test_iter):
            # var_src, var_trg are [batch_sz, sent_len]
            var_src, var_trg, var_hidden = self.prepare_model_inputs(batch, zero_out=True,
                                                                     model_num=0)

            # TODO: implement beam search!

    
class NMTTrainer(NMTModelUser):
    def __init__(self, models, TEXT_SRC, TEXT_TRG, **kwargs):
        super(NMTTrainer, self).__init__(models, TEXT_SRC, TEXT_TRG, **kwargs)

        self.use_attention = kwargs.get('attention', False)
        self.base_lrn_rate = kwargs.get('lrn_rate', 0.1)
        self.optimizer_type = kwargs.get('optimizer', optim.SGD)
        self.optimizers = [self.optimizer_type(filter(lambda p : p.requires_grad,
                                                      model.parameters()),
                                               lr = self.base_lrn_rate) for \
                           model in self.models]

        self.lr_decay_opt = kwargs.get('lrn_decay', 'none')
        # TODO: setup for lr decay

        self.clip_norm = kwargs.get('clip_norm', 10)
        self.init_lists()
        if self.cuda:
            for model in self.models:
                model.cuda()

    def init_lists(self):
        self.training_losses = list()
        self.training_norms = list()
        self.val_prefs = list()

    def get_loss_data(self, loss):
        if self.cuda:
            return loss.data.cpu().numpy()[0]
        else:
            return loss.data.numpy()[0]

    def make_recordings(self, loss, norm):
        self.training_norms.append(norm)
        self.training_losses.append(loss)

    def clip_norms(self):
        # Clip grad norm after backward but before step
        if self.clip_norm > 0:
            parameters = tuple()
            for model in self.models:
                parameters += tuple(model.parameters())
                
            # Norm clipping: returns a float
            norm = nn.utils.clip_grad_norm(
                parameters, self.clip_norm)
        else:
            norm = -1
        return norm

    def train_batch(self, batch, **kwargs):
        for model in self.models:
            model.zero_grad()

        # var_src, var_trg are [batch_sz, sent_len]
        var_src, var_trg, var_hidden = self.prepare_model_inputs(batch, zero_out=True,
                                                                 model_num=0)

        # For attention, will use enc_output (not otherwise)
        enc_output, enc_hidden = self.models[0](var_src, var_hidden)
        self.prev_hidden = enc_hidden
        if self.use_attention:
            raise NotImplementedError('Attention not yet implemented!')
        else:
            # Using real words as input. Use prev_hidden both to
            # initialize hidden state (the first time) and as context
            # vector
            dec_output, dec_hidden = self.models[1](var_trg, self.prev_hidden,
                                                    enc_hidden)
            self.prev_hidden = dec_hidden

            loss = self.nll_loss(dec_output, var_trg)

        loss.backward()

        # norms must be clipped after backward but before step
        norm = self.clip_norms()

        loss_data = self.get_loss_data(loss)
        if kwargs.get('verbose', False):
            print('TEMP: ', loss_data, norm)
            self.make_recordings(loss_data, norm)

        for optimizer in self.optimizers:
            optimizer.step()

        # Return loss and norm (before gradient step)
        return loss_data, norm

    def init_parameters(self):
        for model in self.models:
            for p in model.parameters():
                p.data.uniform_(-0.05, 0.05)

    def train(self, torch_train_iter, le=None, val_iter=None, **kwargs):
        self.init_lists()
        start_time = time.time()
        self.init_parameters()
        torch_train_iter.init_epoch()
        for epoch in range(kwargs.get('num_iter', 100)):
            self.init_epoch()
            for model in self.models:
                model.train()

            # TODO: LR decay3
            train_iter = iter(torch_train_iter)

            for batch in train_iter:
                res_loss, res_norm = self.train_batch(batch, **kwargs)

            if epoch % kwargs.get('skip_iter', 1) == 0:
                if not kwargs.get('verbose', False):
                    self.make_recordings(res_loss, res_norm)

            print('Epoch %d, loss: %f, norm: %f, elapsed: %f, lrn_rate: %f' \
                  % (epoch, np.mean(self.training_losses[-10:]),
                     np.mean(self.training_norms[-10:]),
                     time.time() - start_time,
                     self.base_lrn_rate)) #  * self.lambda_lr(epoch)))
                    
            
            if (not le is None) and (not val_iter is None):
                self.val_perfs.append(le.evaluate(val_iter))
                print('Validation set metric: %f' % \
                      self.val_perfs[-1])

        if len(self.val_perfs) >= 1:
            print('FINAL VAL PERF', self.val_perfs[-1])
            return self.val_perfs[-1]
        return -1


In [79]:
a = (1,2)
a[:1] + (0,)

(1, 0)