# 7. Machine translation and advanced recurrent LSTMs and GRUs

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture9.pdf
* http://www.statmt.org/wmt15/translation-task.html
* https://arxiv.org/pdf/1406.1078.pdf

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        for item in sublist:
            flat.append(item)
    return flat

In [9]:
# Select the tensor constructors once, so the rest of the file can build
# tensors without checking for a GPU each time.
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [3]:
def getBatch(batch_size,train_data):
    """Yield shuffled mini-batches of ``train_data``.

    ``train_data`` is shuffled **in place** and then cut into consecutive
    slices of ``batch_size`` items. The final slice holds whatever is left
    over and may therefore be shorter than ``batch_size``.

    Args:
        batch_size: number of examples per batch; must be at least 1.
        train_data: mutable sequence of examples (reordered by the shuffle).

    Yields:
        Slices of ``train_data``. Nothing is yielded when ``train_data`` is
        empty, so consumers never receive an empty batch.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A non-positive step would never advance through the data.
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1, got %r" % (batch_size,))

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]

In [4]:
def pad_to_batch(batch):
    """Right-pad every source and target sequence in ``batch`` to a common width.

    ``batch`` is a sequence of ``(x, y)`` pairs whose tensors are
    concatenated along dimension 1, so each is expected to have shape
    ``(1, length)``. Sources are padded to the longest source in the batch
    and targets to the longest target, using ``word2index['<PAD>']`` as the
    filler index (``word2index`` must be defined at module level).

    Returns:
        A list of ``(padded_x, padded_y)`` tuples in the original order.
    """
    sources, targets = list(zip(*batch))
    widest_source = max([seq.size(1) for seq in sources])
    widest_target = max([seq.size(1) for seq in targets])

    def _right_pad(seq, width):
        # Sequences already at full width are passed through untouched.
        if not seq.size(1) < width:
            return seq
        filler = Variable(LongTensor([word2index['<PAD>']]*(width-seq.size(1)))).view(1,-1)
        return torch.cat([seq, filler], 1)

    padded_sources = [_right_pad(seq, widest_source) for seq in sources]
    padded_targets = [_right_pad(seq, widest_target) for seq in targets]
    return list(zip(padded_sources, padded_targets))

In [5]:
def prepare_sequence(seq, to_index):
    """Map each token of ``seq`` to its index and wrap the result in a Variable.

    Tokens missing from ``to_index`` are replaced by ``to_index["<UNK>"]``;
    that entry is only looked up when an unknown token is actually met.
    """
    indices = []
    for token in seq:
        if token in to_index.keys():
            indices.append(to_index[token])
        else:
            indices.append(to_index["<UNK>"])
    return Variable(LongTensor(indices))

### Modeling 

In [26]:
class Encoder(nn.Module):
    """GRU encoder: embeds a batch of token ids and summarises each sequence.

    Args:
        input_size: number of rows of the embedding table (vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: width of the GRU hidden state.
        n_layers: number of stacked GRU layers.
    """

    def __init__(self, input_size, embedding_size,hidden_size, n_layers=1):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True)

    def init_hidden(self,inputs):
        """Return an all-zero initial GRU state of shape (n_layers, B, H).

        The state is allocated from the embedding weights so that it always
        has the same device and dtype as the module's parameters, instead of
        depending on a module-level CUDA flag.
        """
        weight = self.embedding.weight.data
        hidden = weight.new(self.n_layers, inputs.size(0), self.hidden_size).zero_()
        return Variable(hidden)

    def init_weight(self):
        """Re-initialise the embedding and first-layer GRU weights (Xavier normal)."""
        self.embedding.weight = nn.init.xavier_normal(self.embedding.weight)
        self.gru.weight_hh_l0 = nn.init.xavier_normal(self.gru.weight_hh_l0)
        self.gru.weight_ih_l0 = nn.init.xavier_normal(self.gru.weight_ih_l0)

    def forward(self, inputs, input_masking):
        """Encode a padded batch.

        inputs : B,T (LongTensor)
        input_masking : B,T (ByteTensor); entries equal to 0 are counted as
            real tokens, everything else as padding.

        Returns:
            output : B,T,H GRU outputs for every timestep.
            context : B,1,H output taken at each sequence's last real
                timestep.
        """
        hidden = self.init_hidden(inputs)

        embedded = self.embedding(inputs)
        output, hidden = self.gru(embedded, hidden)

        real_context=[]

        for i,o in enumerate(output): # o : T,H
            # Number of zeros in the mask row == number of real tokens.
            # NOTE(review): indexing by this count assumes padding is
            # trailing; a row without any zero picks o[-1].
            real_length = input_masking[i].data.tolist().count(0)
            real_context.append(o[real_length-1])

        return output, torch.cat(real_context).view(inputs.size(0),-1).unsqueeze(1)

In [30]:
class Decoder(nn.Module):
    """GRU decoder conditioned on a fixed context vector at every step.

    Each step feeds the previous token embedding concatenated with the
    context into the GRU, then scores the vocabulary from the new hidden
    state, the previous embedding and the context. The next input is always
    the decoder's own most probable token.

    Args:
        input_size: vocabulary size (embedding rows and output classes).
        embedding_size: width of each embedding vector.
        hidden_size: width of the GRU hidden state and of the context.
        n_layers: number of stacked GRU layers.
        dropout_p: dropout probability applied to embeddings.
    """

    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1,dropout_p=0.3):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Define the layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)

        self.gru = nn.GRU(embedding_size+hidden_size, hidden_size, n_layers,batch_first=True)
        self.linear = nn.Linear(embedding_size+hidden_size*2, input_size)

    def init_hidden(self,inputs):
        """Return an all-zero initial GRU state of shape (n_layers, B, H).

        The state is allocated from the embedding weights so that it always
        has the same device and dtype as the module's parameters, instead of
        depending on a module-level CUDA flag.
        """
        weight = self.embedding.weight.data
        hidden = weight.new(self.n_layers, inputs.size(0), self.hidden_size).zero_()
        return Variable(hidden)

    def init_weight(self):
        """Re-initialise embedding, first-layer GRU and output weights (Xavier normal)."""
        self.embedding.weight = nn.init.xavier_normal(self.embedding.weight)
        self.gru.weight_hh_l0 = nn.init.xavier_normal(self.gru.weight_hh_l0)
        self.gru.weight_ih_l0 = nn.init.xavier_normal(self.gru.weight_ih_l0)
        self.linear.weight = nn.init.xavier_normal(self.linear.weight)

    def forward(self, inputs, context,max_length,training=False):
        """Greedily decode ``max_length`` steps.

        inputs : B,1 (LongTensor, START SYMBOL)
        context : B,1,H (FloatTensor, Last encoder hidden state)
        max_length : int, max length to decode
        training : bool, apply dropout to the input embeddings only when True.

        Returns:
            (B*max_length, V) log-probabilities; the row for batch item b at
            step t is ``b*max_length + t``.
        """
        # Get the embedding of the current input word
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        if training:
            embedded = self.dropout(embedded)

        decode=[]
        # NOTE: concatenating `hidden` with the (1,B,*) tensors below only
        # lines up when the GRU has a single layer.
        for _step in range(max_length):

            _, hidden = self.gru(torch.cat((embedded,context),2), hidden) # h_t = f(h_{t-1},y_{t-1},c)
            concated = torch.cat((hidden,embedded.transpose(0,1),context.transpose(0,1)),2) # y_t = g(h_t,y_{t-1},c)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=1)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            # Restore the B,1,E layout on every step (not only when training),
            # otherwise the next concat with the B,1,H context fails.
            embedded = self.embedding(decoded.view(-1,1)) # y_{t-1}
            if training:
                embedded = self.dropout(embedded)

        #  column-wise concat, reshape!!
        scores = torch.cat(decode,1)
        return scores.view(inputs.size(0)*max_length,-1)

### Train 

In [15]:
# Hyperparameters for the training run.
STEP=5  # presumably the number of training epochs — confirm against the training loop
BATCH_SIZE = 64  # presumably the mini-batch size passed to getBatch — confirm
EMBEDDING_SIZE = 100  # embedding width given to both Encoder and Decoder
HIDDEN_SIZE = 1000  # GRU hidden-state width given to both Encoder and Decoder
LR = 1e-6  # learning rate used for both Adam optimizers

In [32]:
# Build the encoder/decoder pair, sized by the source and target vocabularies
# (source2index / target2index must already be defined).
encoder = Encoder(len(source2index), EMBEDDING_SIZE, HIDDEN_SIZE)
decoder = Decoder(len(target2index), EMBEDDING_SIZE, HIDDEN_SIZE)
if USE_CUDA:
    encoder, decoder = encoder.cuda(), decoder.cuda()

# Targets equal to index 0 do not contribute to the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)

# One Adam optimizer per module, both using the same learning rate.
enc_optimizer, dec_optimizer = (
    optim.Adam(module.parameters(), lr=LR) for module in (encoder, decoder)
)

NameError: name 'source2index' is not defined

### Test 

BLEU