# 7. Neural Machine Translation and Models with Attention

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture9.pdf
* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture10.pdf
* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture11.pdf
* https://arxiv.org/pdf/1409.0473.pdf
* http://www.aclweb.org/anthology/P15-1001
* https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation-batched.ipynb
* http://www.manythings.org/anki/

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
import os
import re
import unicodedata
flatten = lambda l: [item for sublist in l for item in sublist]

from torch.nn.utils.rnn import PackedSequence,pack_padded_sequence

In [2]:
# Select GPU tensor constructors when CUDA is available, CPU ones otherwise,
# so the rest of the notebook can build tensors without caring about the device.
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    FloatTensor = torch.cuda.FloatTensor
    LongTensor = torch.cuda.LongTensor
    ByteTensor = torch.cuda.ByteTensor
else:
    FloatTensor = torch.FloatTensor
    LongTensor = torch.LongTensor
    ByteTensor = torch.ByteTensor

In [4]:
def getBatch(batch_size,train_data):
    """Yield shuffled mini-batches covering ``train_data`` exactly once.

    Args:
        batch_size: maximum number of examples per batch; must be positive.
        train_data: list of examples. It is shuffled IN PLACE before batching.

    Yields:
        Consecutive slices of the shuffled list. Every batch has ``batch_size``
        items except possibly the last, which holds the remainder. Nothing is
        yielded for empty ``train_data`` (no empty batch is produced).

    Raises:
        ValueError: if ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    if batch_size <= 0:
        # A non-positive step would never advance through the data.
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start+batch_size]

In [5]:
def pad_to_batch(batch,x_to_ix,y_to_ix):
    """Pad a batch of (source, target) index sequences to common lengths.

    Args:
        batch: list of ``(x, y)`` pairs, each element a ``1 x T`` LongTensor
            Variable (length is read from ``size(1)``).
        x_to_ix: source vocabulary; ``x_to_ix['<PAD>']`` is the source pad index.
        y_to_ix: target vocabulary; ``y_to_ix['<PAD>']`` is the target pad index.

    Returns:
        ``(input_var, target_var, input_len, target_len)`` where the two
        variables are ``B x max_len`` padded batches sorted by decreasing
        source length, and the two lists hold each row's unpadded length.
    """
    # Longest source first (the order the encoder's packing step is fed).
    sorted_batch = sorted(batch, key=lambda pair: pair[0].size(1), reverse=True)
    x,y = list(zip(*sorted_batch))

    # True lengths come straight from the unpadded tensors. This stays correct
    # whatever the PAD index is, and avoids scanning every element in Python.
    input_len = [s.size(1) for s in x]
    target_len = [s.size(1) for s in y]
    max_x = max(input_len)
    max_y = max(target_len)

    def _pad(seq, to_ix, width):
        # Right-pad one 1 x T sequence with <PAD> up to `width` columns.
        missing = width - seq.size(1)
        if missing <= 0:
            return seq
        filler = Variable(LongTensor([to_ix['<PAD>']]*missing)).view(1,-1)
        return torch.cat([seq, filler], 1)

    input_var = torch.cat([_pad(s, x_to_ix, max_x) for s in x])
    target_var = torch.cat([_pad(s, y_to_ix, max_y) for s in y])

    return input_var, target_var, input_len, target_len

In [6]:
def prepare_sequence(seq, to_index):
    """Convert a token sequence into a LongTensor Variable of vocabulary indices.

    Tokens missing from ``to_index`` are mapped to the index of ``"<UNK>"``.
    """
    idxs = [
        to_index[token] if token in to_index else to_index["<UNK>"]
        for token in seq
    ]
    return Variable(LongTensor(idxs))

### Data load and Preprocessing 

In [7]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
    """Strip combining marks (Unicode category 'Mn') from ``s`` after NFD decomposition."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
    """Return ``s`` lowercased, accent-stripped and reduced to letters and ``,.!?``.

    Each of the punctuation marks ``,.!?`` is surrounded by spaces so it
    becomes its own token; every other non-letter run becomes a single space.
    """
    text = unicode_to_ascii(s.lower().strip())
    substitutions = (
        (r"([,.!?])", r" \1 "),       # isolate punctuation
        (r"[^a-zA-Z,.!?]+", r" "),    # drop everything else that is not a letter
        (r"\s+", r" "),               # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

In [8]:
# Read the tab-separated parallel corpus, one sentence pair per line.
# The context manager closes the file handle; the explicit encoding keeps the
# accented French text from depending on the platform default
# (assumes the dataset file is UTF-8 -- TODO confirm for your copy).
with open('./dataset/eng-fra.txt','r',encoding='utf-8') as corpus_file:
    corpus = corpus_file.readlines()

In [9]:
# Inclusive bounds on the token count of a sentence; a pair is kept only when
# both its source and target side fall within this range.
MIN_LENGTH, MAX_LENGTH = 3, 25

In [10]:
%%time
X_r,y_r=[],[] # raw (tokenized, normalized) source / target sentences

for parallel in corpus:
    # Strip only the line terminator, so a final line without a trailing
    # newline does not lose its last real character.
    fields = parallel.rstrip('\r\n').split('\t')
    # Skip malformed lines; tolerate extra columns after the first two.
    if len(fields) < 2: continue
    so,ta = fields[0],fields[1]
    if so.strip()=="" or ta.strip()=="": continue

    normalized_so = normalize_string(so).split()
    normalized_ta = normalize_string(ta).split()

    # Keep the pair only if both sides are within the configured length range.
    if MIN_LENGTH <= len(normalized_so) <= MAX_LENGTH \
    and MIN_LENGTH <= len(normalized_ta) <= MAX_LENGTH:
        X_r.append(normalized_so)
        y_r.append(normalized_ta)


print(len(X_r),len(y_r))
if X_r:
    print(X_r[0],y_r[0])

26876 26876
['go', '.'] ['va', '!']
CPU times: user 4.48 s, sys: 4 ms, total: 4.49 s
Wall time: 4.49 s


### Build Vocab

In [11]:
# Sort the vocabularies so token -> index assignment is identical on every
# run. Plain set iteration order for strings can change between interpreter
# sessions, which would make the saved encoder/decoder weights disagree with
# the index maps after a kernel restart.
source_vocab = sorted(set(flatten(X_r)))
target_vocab = sorted(set(flatten(y_r)))
print(len(source_vocab),len(target_vocab))

4432 7178


In [12]:
# source_vocab_count = Counter(flatten(X_r))
# source_vocab, _ = list(zip(*source_vocab_count.most_common()))

# target_vocab_count = Counter(flatten(y_r))
# target_vocab, _  = list(zip(*target_vocab_count.most_common()))
# print(len(source_vocab),len(target_vocab))

4432 7178


In [13]:
# Source-side lookup tables: the four special tokens take ids 0-3, then every
# vocabulary word gets the next free id in iteration order.
source2index = {'<PAD>':0,'<UNK>':1,'<s>':2,'</s>':3}
for token in source_vocab:
    source2index.setdefault(token, len(source2index))
index2source = dict((idx, token) for token, idx in source2index.items())

# Target-side lookup tables, built the same way.
target2index = {'<PAD>':0,'<UNK>':1,'<s>':2,'</s>':3}
for token in target_vocab:
    target2index.setdefault(token, len(target2index))
index2target = dict((idx, token) for token, idx in target2index.items())

In [15]:
%%time
# Encode every sentence pair as 1 x T index tensors.
# Sources are wrapped in <s> ... </s>; targets only get the closing </s>.
train_data = [
    (
        prepare_sequence(['<s>']+so+['</s>'],source2index).view(1,-1),
        prepare_sequence(ta+['</s>'],target2index).view(1,-1),
    )
    for so,ta in zip(X_r,y_r)
]
X_p = [src for src,_ in train_data]
y_p = [tgt for _,tgt in train_data]

CPU times: user 1.96 s, sys: 252 ms, total: 2.21 s
Wall time: 2.23 s


### Modeling 

In [16]:
class Encoder(nn.Module):
    """Embedding + GRU sentence encoder (optionally multi-layer / bidirectional).

    Args:
        input_size: source vocabulary size.
        embedding_size: dimensionality of the token embeddings.
        hidden_size: GRU hidden size per direction.
        n_layers: number of stacked GRU layers.
        bidirec: if True, use a bidirectional GRU.
    """
    def __init__(self, input_size, embedding_size,hidden_size, n_layers=1,bidirec=False):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.n_direction = 2 if bidirec else 1

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers,
                          batch_first=True, bidirectional=bool(bidirec))

    def init_hidden(self,inputs):
        """Return a zero initial state of shape (n_layers*n_direction, B, hidden_size).

        The state is moved to the GPU when ``inputs`` lives there, so the
        module does not depend on a notebook-level device flag.
        """
        hidden = Variable(torch.zeros(self.n_layers*self.n_direction,inputs.size(0),self.hidden_size))
        return hidden.cuda() if inputs.is_cuda else hidden

    def init_weight(self):
        """Xavier-initialise the embedding matrix (GRU weights keep their defaults)."""
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)

    def forward(self, inputs, input_lengths):
        """
        inputs : B,T (LongTensor), rows ordered by decreasing true length
        input_lengths : real lengths of input batch (list)

        returns : (outputs, context)
            outputs : B,T,hidden_size*n_direction  (re-padded GRU outputs)
            context : B,1,hidden_size*n_direction  (final state of the top
                      layer, both directions concatenated when bidirectional)
        """
        hidden = self.init_hidden(inputs)

        embedded = self.embedding(inputs)
        packed = pack_padded_sequence(embedded, input_lengths,batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs,batch_first=True) # unpack (back to padded)

        # Keep only the top layer's final state(s). Slicing (rather than
        # indexing) preserves the leading axis, so the concatenation below
        # works for every combination of n_layers and n_direction.
        last_layer = hidden[-self.n_direction:]
        context = torch.cat([state for state in last_layer], 1).unsqueeze(1)

        return outputs, context

In [17]:
class Decoder(nn.Module):
    """GRU decoder with attention over the encoder outputs (greedy decoding).

    Args:
        input_size: target vocabulary size (also the output dimension).
        embedding_size: dimensionality of the target token embeddings.
        hidden_size: decoder hidden size; must equal the feature size of the
            encoder outputs / context it is given.
        n_layers: number of stacked GRU layers (the top layer drives attention
            and the output projection).
        dropout_p: dropout probability applied to the input embeddings.
    """
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1,dropout_p=0.1):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Define the layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)

        # GRU input is [embedded token ; context vector]
        self.gru = nn.GRU(embedding_size+hidden_size, hidden_size, n_layers,batch_first=True)
        # Output projection reads [top hidden state ; context vector]
        self.linear = nn.Linear(hidden_size*2, input_size)
        self.attn = nn.Linear(self.hidden_size,self.hidden_size) # Attention

    def init_hidden(self,inputs):
        """Return a zero initial state (n_layers, B, hidden_size) on the device of ``inputs``."""
        hidden = Variable(torch.zeros(self.n_layers,inputs.size(0),self.hidden_size))
        return hidden.cuda() if inputs.is_cuda else hidden

    def init_weight(self):
        """Xavier-initialise embedding, output and attention weights; zero the attention bias."""
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)
        self.linear.weight = nn.init.xavier_uniform(self.linear.weight)
        self.attn.weight = nn.init.xavier_uniform(self.attn.weight)
        self.attn.bias.data.fill_(0)

    def Attention(self, hidden, encoder_outputs, encoder_maskings):
        """
        hidden : L,B,D (GRU state; only the top layer is used as the query)
        encoder_outputs : B,T,D
        encoder_maskings : B,T mask, non-zero/True at PAD positions, or None

        returns : context vector, B,1,D
        """
        query = hidden[-1].unsqueeze(2)  # hidden: (L,B,D) -> top layer (B,D) -> (B,D,1)

        batch_size = encoder_outputs.size(0) # B
        max_len = encoder_outputs.size(1) # T
        energies = self.attn(encoder_outputs.contiguous().view(batch_size*max_len,-1)) # B*T,D -> B*T,D
        energies = energies.view(batch_size,max_len,-1) # B,T,D
        attn_energies = energies.bmm(query).squeeze(2) # B,T,D * B,D,1 -> B,T,1 -> B,T

        if encoder_maskings is not None:
            # `!= 0` turns the mask into whatever dtype comparisons produce on
            # the running PyTorch version before it is handed to masked_fill.
            attn_energies = attn_energies.masked_fill(encoder_maskings != 0,-1e12) # PAD masking

        alpha = F.softmax(attn_energies, dim=1) # B,T, normalised over source positions
        alpha = alpha.unsqueeze(1) # B,1,T
        context = alpha.bmm(encoder_outputs) # B,1,T * B,T,D => B,1,D

        return context # B,1,D

    def forward(self,inputs,context,max_length,encoder_outputs,encoder_maskings=None,training=False):
        """
        inputs : B,1 (LongTensor, START SYMBOL)
        context : B,1,D (FloatTensor, Last encoder hidden state)
        max_length : int, max length to decode
        encoder_outputs : B,T,D
        encoder_maskings : B,T mask of PAD positions (optional)
        training : bool, apply dropout to the input embeddings only when True

        returns : (B*max_length, vocab) log-probabilities; row b*max_length+t
                  holds the distribution for batch item b at step t.

        Each step feeds back the model's own greedy (argmax) prediction as the
        next input token.
        """
        # Get the embedding of the current input word
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        if training:
            embedded = self.dropout(embedded)

        decode=[]
        for i in range(max_length):

            _, hidden = self.gru(torch.cat((embedded,context),2), hidden) # h_t = f(h_{t-1},y_{t-1},c)
            top_hidden = hidden[-1:] # 1,B,D : top layer only, so n_layers > 1 works
            concated = torch.cat((top_hidden,context.transpose(0,1)),2) # y_t = g(h_t,y_{t-1},c)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=1)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1) # y_{t-1}
            if training:
                embedded = self.dropout(embedded)

            # compute next context vector using attention
            context = self.Attention(hidden, encoder_outputs,encoder_maskings)
        # column-wise concat (B, max_length*vocab), then reshape to one row per (item, step)
        scores = torch.cat(decode,1)
        return scores.view(inputs.size(0)*max_length,-1)

### Train 

Training takes a while...

In [30]:
# Training hyper-parameters.
STEP = 100                    # number of passes over train_data in the training loop
BATCH_SIZE = 64               # sentence pairs per mini-batch
EMBEDDING_SIZE = 100          # token embedding width (encoder and decoder)
HIDDEN_SIZE = 256             # encoder GRU hidden size; the decoder is built with HIDDEN_SIZE*2
LR = 0.0001                   # Adam learning rate for the encoder
DECODER_LEARNING_RATIO = 5.0  # decoder learning rate is LR * this ratio

In [31]:
# 3-layer bidirectional encoder; the decoder hidden size is doubled so it
# matches the concatenated forward/backward encoder states.
encoder = Encoder(len(source2index), EMBEDDING_SIZE, HIDDEN_SIZE, n_layers=3, bidirec=True)
decoder = Decoder(len(target2index), EMBEDDING_SIZE, HIDDEN_SIZE*2)
encoder.init_weight()
decoder.init_weight()

if USE_CUDA:
    encoder, decoder = encoder.cuda(), decoder.cuda()

# ignore_index=0 leaves <PAD> targets out of the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
# Separate optimizers so the decoder can train with a larger learning rate.
enc_optimizer = optim.Adam(encoder.parameters(), lr=LR)
dec_optimizer = optim.Adam(decoder.parameters(), lr=LR*DECODER_LEARNING_RATIO)

In [None]:
# encoder.load_state_dict(torch.load('../encoder.pkl'))
# decoder.load_state_dict(torch.load('../decoder.pkl'))

In [32]:
# Training loop: STEP passes over the shuffled training data. After every pass
# the mean loss is printed and both models are checkpointed.
for step in range(STEP):
    losses=[]
    for i,batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        inputs,targets,input_lengths,target_lengths = pad_to_batch(batch,source2index,target2index)

        # B,T mask that is set at <PAD> positions of the source batch; computed
        # with one tensor comparison instead of a Python loop over every token.
        input_masks = Variable(inputs.data.eq(source2index['<PAD>']))
        # B,1 column of <s> tokens to start decoding.
        start_decode = Variable(LongTensor([[target2index['<s>']]*targets.size(0)])).transpose(0,1)
        encoder.zero_grad()
        decoder.zero_grad()
        output, hidden_c = encoder(inputs,input_lengths)

        # Decode exactly as many steps as the padded target length.
        preds = decoder(start_decode,hidden_c,targets.size(1),output,input_masks,True)

        loss = loss_function(preds,targets.view(-1))
        # .cpu() is a no-op for CPU tensors, so no device branch is needed;
        # reshape(-1)[0] reads the value whether the loss array is 0-d or 1-d.
        losses.append(loss.data.cpu().numpy().reshape(-1)[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 0.5) # gradient clipping
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 0.5) # gradient clipping
        enc_optimizer.step()
        dec_optimizer.step()

    print("[%d/%d] mean_loss : %0.2f" %(step,STEP,np.mean(losses)))
    torch.save(decoder.state_dict(),os.path.join('../','decoder.pkl'))
    torch.save(encoder.state_dict(),os.path.join('../', 'encoder.pkl'))

#     if (step+1) % 10 == 0:
#         LR = LR/2
#         enc_optimizer = optim.Adam(encoder.parameters(),lr=LR)
#         dec_optimizer = optim.Adam(decoder.parameters(),lr=LR)

[0/100] mean_loss : 4.54
[1/100] mean_loss : 3.74
[2/100] mean_loss : 3.41
[3/100] mean_loss : 3.18
[4/100] mean_loss : 3.00
[5/100] mean_loss : 2.85
[6/100] mean_loss : 2.71
[7/100] mean_loss : 2.57
[8/100] mean_loss : 2.45
[9/100] mean_loss : 2.33
[10/100] mean_loss : 2.22
[11/100] mean_loss : 2.12
[12/100] mean_loss : 2.02
[13/100] mean_loss : 1.93
[14/100] mean_loss : 1.84
[15/100] mean_loss : 1.76
[16/100] mean_loss : 1.68
[17/100] mean_loss : 1.60
[18/100] mean_loss : 1.53
[19/100] mean_loss : 1.47
[20/100] mean_loss : 1.40
[21/100] mean_loss : 1.34
[22/100] mean_loss : 1.28
[23/100] mean_loss : 1.22
[24/100] mean_loss : 1.17
[25/100] mean_loss : 1.12
[26/100] mean_loss : 1.07
[27/100] mean_loss : 1.03
[28/100] mean_loss : 0.99
[29/100] mean_loss : 0.95
[30/100] mean_loss : 0.92
[31/100] mean_loss : 0.88
[32/100] mean_loss : 0.85
[33/100] mean_loss : 0.82
[34/100] mean_loss : 0.79
[35/100] mean_loss : 0.76
[36/100] mean_loss : 0.74
[37/100] mean_loss : 0.71
[38/100] mean_loss : 0

KeyboardInterrupt: 

### Test

In [41]:
# Qualitative check: translate one randomly chosen training pair and print
# source, reference and prediction.
test = random.choice(train_data)
input_ = test[0]
truth = test[1]
start_decode = Variable(LongTensor([[target2index['<s>']]*1])).transpose(0,1)

output, hidden = encoder(input_,[input_.size(1)])
# Targets hold at most MAX_LENGTH words plus the closing </s>, so decode that
# many steps instead of a fixed 10 that would cut longer sentences short.
pred = decoder(start_decode,hidden,MAX_LENGTH+1,output).max(1)[1]

# Look the special-token ids up by name rather than hard-coding 2 and 3.
source_specials = (source2index['<s>'], source2index['</s>'])
target_specials = (target2index['<s>'], target2index['</s>'])

# Anything generated after the first </s> is not part of the translation.
pred_ids = pred.data.tolist()
eos = target2index['</s>']
if eos in pred_ids:
    pred_ids = pred_ids[:pred_ids.index(eos)]

print(' '.join([index2source[i] for i in input_.data.tolist()[0] if i not in source_specials]))
print(' '.join([index2target[i] for i in truth.data.tolist()[0] if i not in target_specials]))
print(' '.join([index2target[i] for i in pred_ids]))

it looked funny .
ca avait l air drole .
ca avait l air drole .


# TODO 

* newstest data로 BLEU 측정
* Beam Search
* Sampled Softmax