# 7. Machine translation and advanced recurrent LSTMs and GRUs

* http://web.stanford.edu/class/cs224n/lectures/cs224n-2017-lecture9.pdf
* https://arxiv.org/pdf/1406.1078.pdf

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter, OrderedDict
import nltk
from copy import deepcopy
import os
def flatten(l):
    """Flatten one level of nesting: a list of lists becomes a single list."""
    return [item for sublist in l for item in sublist]

In [2]:
# Decide once whether to run on the GPU, then expose tensor constructors
# that live on the matching device.
USE_CUDA = torch.cuda.is_available()
_tensor_ns = torch.cuda if USE_CUDA else torch

FloatTensor = _tensor_ns.FloatTensor
LongTensor = _tensor_ns.LongTensor
ByteTensor = _tensor_ns.ByteTensor

In [3]:
def getBatch(batch_size,train_data):
    """Yield shuffled mini-batches of ``train_data``.

    ``train_data`` is shuffled IN PLACE, then sliced into consecutive chunks
    of ``batch_size`` items. The last chunk may be shorter. Nothing is
    yielded when ``train_data`` is empty (no empty batches).

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when the
            generator is first advanced).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    random.shuffle(train_data)
    for sindex in range(0, len(train_data), batch_size):
        yield train_data[sindex:sindex + batch_size]

In [4]:
def pad_to_batch(batch,x_to_ix,y_to_ix):
    """Right-pad every sequence in ``batch`` and stack them into two tensors.

    batch   : list of (x, y) pairs, each a 1 x T index tensor
    x_to_ix : source vocabulary; its '<PAD>' entry is used to pad x
    y_to_ix : target vocabulary; its '<PAD>' entry is used to pad y

    Returns (X, Y) where X is B x max_x and Y is B x max_y.
    """
    def _pad(seq, width, to_ix):
        # Append '<PAD>' indices on the right until seq is 1 x width.
        gap = width - seq.size(1)
        if gap > 0:
            filler = Variable(LongTensor([to_ix['<PAD>']] * gap)).view(1, -1)
            seq = torch.cat([seq, filler], 1)
        return seq

    sources, targets = zip(*batch)
    widest_x = max(s.size(1) for s in sources)
    widest_y = max(s.size(1) for s in targets)

    padded_x = [_pad(s, widest_x, x_to_ix) for s in sources]
    padded_y = [_pad(s, widest_y, y_to_ix) for s in targets]
    return torch.cat(padded_x), torch.cat(padded_y)

In [5]:
def prepare_sequence(seq, to_index):
    """Convert a token sequence into a 1-D LongTensor Variable of indices.

    Tokens missing from ``to_index`` are mapped to the "<UNK>" index.
    """
    idxs = [to_index[w] if w in to_index else to_index["<UNK>"] for w in seq]
    return Variable(LongTensor(idxs))

### Data load and Preprocessing 

The full translation dataset is too large to process on my computer, and a naive softmax over a large vocabulary takes too long. So I use the smallchat data collected from <a href="https://dbd-challenge.github.io/dbdc3/datasets">here</a> instead. Maybe we'll use the WMT16 data later, with attention and sampled softmax.

In [6]:
# source_corpus = open('../dataset/wmt16_de_en/train.tok.en','r',encoding='utf-8').readlines()
# target_corpus = open('../dataset/wmt16_de_en/train.tok.de','r',encoding='utf-8').readlines()

In [6]:
from nltk.tokenize import RegexpTokenizer
# Token = a run of word characters, OR a dollar amount like $12.50,
# OR any other run of non-whitespace characters.
# Raw string: the pattern is unchanged, but the backslashes are no longer
# parsed as (invalid) string escape sequences.
tknz = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

In [46]:
# Read the whole chat corpus, one raw line per entry; the context manager
# makes sure the file handle is closed afterwards.
with open('../dataset/smallchat.txt') as f:
    data = f.readlines()

In [47]:
# Each line is "<source>\t<target>\n"; both sides are lower-cased.
source_corpus = [d.split('\t')[0].lower() for d in data]
# rstrip('\n') instead of [:-1]: only the newline is removed, so a final
# line without a trailing newline does not lose its last real character.
target_corpus = [d.split('\t')[1].rstrip('\n').lower() for d in data]

In [48]:
# Sanity check: both sides of the parallel corpus have the same number of
# lines, followed by that number.
print(len(target_corpus) == len(source_corpus))
print(len(source_corpus))

True
4172


In [49]:
%%time
X_r, y_r = [], []  # raw: tokenized but not yet converted to indices

for so, ta in zip(source_corpus, target_corpus):
    # Skip any pair where either side is empty or whitespace-only
    # (this also covers a target that is just a newline).
    if not so.strip() or not ta.strip():
        continue
    X_r.append(tknz.tokenize(so))
    y_r.append(tknz.tokenize(ta))

CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 23.1 ms


### Build Vocab

In [50]:
# Keep at most this many of the most frequent words when building the vocabulary.
VOCAB_LIMIT = 15000

In the paper, the source and target vocabularies are limited to the 15,000 most frequent words. <br>
The softmax becomes expensive when the target vocabulary is large, so you should tune this hyperparameter in practice.
We'll deal with the large-vocabulary problem later.

In [12]:
# source_vocab_count = Counter(flatten(X_r))
# source_vocab, _ = list(zip(*source_vocab_count.most_common()[:VOCAB_LIMIT]))

# target_vocab_count = Counter(flatten(y_r))
# target_vocab, _  = list(zip(*target_vocab_count.most_common()[:VOCAB_LIMIT]))
# print(len(source_vocab),len(target_vocab))

In [13]:
# source2index = {'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
# for vo in source_vocab:
#     source2index[vo]=len(source2index)
# index2source = {v:k for k,v in source2index.items()}

# target2index = {'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
# for vo in target_vocab:
#     target2index[vo]=len(target2index)
# index2target = {v:k for k,v in target2index.items()}

In [51]:
# Joint vocabulary: word frequencies are counted over source and target together.
vocab_count = Counter(flatten(X_r)+flatten(y_r))
# most_common(n) returns the same (word, count) pairs, in the same order, as
# most_common()[:n]. Building the tuple with a comprehension also works for an
# empty corpus, where unpacking zip(*[]) would raise ValueError.
vocab = tuple(word for word, _ in vocab_count.most_common(VOCAB_LIMIT))

In [52]:
# Notebook display: size of the joint vocabulary, before the special tokens are added.
len(vocab)

4014

In [53]:
# The special tokens take the first four indices; every vocabulary word then
# gets the next free index. A word that is already present keeps its index.
word2index = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}
for vo in vocab:
    word2index.setdefault(vo, len(word2index))

# Reverse lookup: index -> word.
index2word = dict(zip(word2index.values(), word2index.keys()))

In [54]:
%%time
# Pair up the tokenized sentences, shuffle them, and hold out the last 10%.
temp = list(zip(X_r, y_r))
random.shuffle(temp)
n_train = int(len(temp) * 0.9)
train_p, test_data = temp[:n_train], temp[n_train:]

# Index the training pairs; every sequence becomes a 1 x T LongTensor Variable.
X_p = [prepare_sequence(so, word2index).view(1, -1) for so, _ in train_p]
y_p = [prepare_sequence(ta, word2index).view(1, -1) for _, ta in train_p]

train_data = list(zip(X_p, y_p))

CPU times: user 196 ms, sys: 12 ms, total: 208 ms
Wall time: 198 ms


### Modeling 

In [55]:
class Encoder(nn.Module):
    """Bidirectional GRU encoder.

    Embeds a batch of index sequences, runs them through a bidirectional GRU
    and returns the per-step outputs together with one context vector per
    sequence (size 2*hidden_size: forward and backward halves concatenated).
    """

    def __init__(self, input_size, embedding_size,hidden_size, n_layers=1):
        """
        input_size : vocabulary size (number of embedding rows)
        embedding_size : embedding dimension
        hidden_size : GRU hidden size PER DIRECTION
        n_layers : number of stacked GRU layers
        """
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True,bidirectional=True)

    def init_hidden(self,inputs):
        """Return a zero initial hidden state (n_layers*2, B, hidden_size) on the same device as ``inputs``."""
        hidden = Variable(torch.zeros(self.n_layers*2,inputs.size(0),self.hidden_size))
        # Follow the device of the inputs rather than a module-level flag.
        return hidden.cuda() if inputs.is_cuda else hidden

    def init_weight(self):
        """Xavier-initialize the embedding matrix (GRU weights keep their defaults)."""
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)
#         self.gru.weight_hh_l0 = nn.init.xavier_uniform(self.gru.weight_hh_l0)
#         self.gru.weight_ih_l0 = nn.init.xavier_uniform(self.gru.weight_ih_l0)

    def forward(self, inputs, input_masking=None):
        """
        inputs : B,T (LongTensor)
        input_masking : B,T mask that is non-zero at padded positions;
                        leave it as None if you don't use zero-padding

        Returns (output, context):
        output : B,T,2*hidden_size, the GRU outputs at every step
        context : B,1,2*hidden_size
            - with a mask: the output at the last non-padded step of each sequence
            - without a mask: the final forward and backward hidden states
              of the top GRU layer, concatenated
        """
        hidden = self.init_hidden(inputs)

        embedded = self.embedding(inputs)
        output, hidden = self.gru(embedded, hidden)

        # "is not None" rather than an exact type comparison, so the mask is
        # honoured whatever tensor class it happens to be.
        if input_masking is not None:
            real_context=[]
            for i,o in enumerate(output): # B,T,D
                real_length = input_masking[i].data.tolist().count(0) # number of unmasked steps
                real_context.append(o[real_length-1])
            context = torch.cat(real_context).view(inputs.size(0),-1).unsqueeze(1)
        else:
            # hidden is (n_layers*2, B, H), with each layer's forward state
            # followed by its backward state. Only the top layer's pair is
            # used, so the context is 2*H wide for any n_layers, the same
            # width as in the masked branch.
            top = 2*(self.n_layers-1)
            context = torch.cat((hidden[top],hidden[top+1]),1).unsqueeze(1)

        return output, context

In [56]:
class Decoder(nn.Module):
    """GRU decoder conditioned on a fixed context vector at every step.

    Decoding is greedy: the arg-max token of each step is embedded and fed
    back as the next input. The context is concatenated both to the GRU
    input and to the GRU state before the output projection.
    """

    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1,dropout_p=0.3):
        """
        input_size : vocabulary size (embedding rows and output classes)
        embedding_size : embedding dimension
        hidden_size : GRU hidden size; the context vector must have this size too
        n_layers : number of stacked GRU layers
        dropout_p : dropout probability applied to the embedded inputs
        """
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Define the layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)

        self.gru = nn.GRU(embedding_size+hidden_size, hidden_size, n_layers,batch_first=True)
        self.linear = nn.Linear(hidden_size*2, input_size)

    def init_hidden(self,inputs):
        """Return a zero initial hidden state (n_layers, B, hidden_size) on the same device as ``inputs``."""
        hidden = Variable(torch.zeros(self.n_layers,inputs.size(0),self.hidden_size))
        # Follow the device of the inputs rather than a module-level flag.
        return hidden.cuda() if inputs.is_cuda else hidden

    def init_weight(self):
        """Xavier-initialize the embedding and output projection (GRU weights keep their defaults)."""
        self.embedding.weight = nn.init.xavier_uniform(self.embedding.weight)
#         self.gru.weight_hh_l0 = nn.init.xavier_uniform(self.gru.weight_hh_l0)
#         self.gru.weight_ih_l0 = nn.init.xavier_uniform(self.gru.weight_ih_l0)
        self.linear.weight = nn.init.xavier_uniform(self.linear.weight)

    def forward(self, inputs, context,max_length,training=False):
        """
        inputs : B,1 (LongTensor, START SYMBOL)
        context : B,1,H (FloatTensor, encoder context; H == self.hidden_size)
        max_length : int, number of decoding steps
        training : bool, apply dropout to the embedded inputs only when True

        Returns a (B*max_length, vocab) tensor of log-probabilities; the rows
        are grouped by batch item (all steps of item 0, then item 1, ...).
        """
        # Get the embedding of the current input word
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        if training:
            embedded = self.dropout(embedded)

        flat_context = context.squeeze(1) # B,H

        decode=[]
        # Apply GRU to the output so far
        for _step in range(max_length):

            _, hidden = self.gru(torch.cat((embedded,context),2), hidden) # h_t = f(h_{t-1},y_{t-1},c)
            # Only the top layer's state feeds the output projection, so this
            # also works when n_layers > 1.
            concated = torch.cat((hidden[self.n_layers-1],flat_context),1) # y_t = g(h_t,y_{t-1},c)
            score = self.linear(concated) # B,V
            softmaxed = F.log_softmax(score)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1] # greedy choice, B
            embedded = self.embedding(decoded).unsqueeze(1) # y_{t-1}
            if training:
                embedded = self.dropout(embedded)

        #  column-wise concat (B, max_length*V), then reshape to (B*max_length, V)
        scores = torch.cat(decode,1)
        return scores.view(inputs.size(0)*max_length,-1)

### Train 

This takes a while.

In [63]:
# Number of passes over the training data (the outer training loop runs range(STEP)).
STEP=100
# Number of sentence pairs per mini-batch (passed to getBatch).
BATCH_SIZE = 64
# Word-embedding dimension shared by the encoder and the decoder.
EMBEDDING_SIZE = 50
# Encoder GRU hidden size per direction; the decoder is built with HIDDEN_SIZE*2.
HIDDEN_SIZE = 128
# Learning rate for both Adam optimizers.
LR = 1e-3

In [64]:
# Source and target share one vocabulary, so both models use the same size.
# The decoder's hidden size is doubled to match the bidirectional encoder.
encoder = Encoder(len(word2index), EMBEDDING_SIZE, HIDDEN_SIZE, n_layers=2)
decoder = Decoder(len(word2index), EMBEDDING_SIZE, HIDDEN_SIZE * 2)
for model in (encoder, decoder):
    model.init_weight()

if USE_CUDA:
    encoder, decoder = encoder.cuda(), decoder.cuda()

# Targets equal to index 0 do not contribute to the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
# One Adam optimizer per model, both with the same learning rate.
enc_optimizer = optim.Adam(encoder.parameters(), lr=LR)
dec_optimizer = optim.Adam(decoder.parameters(), lr=LR)

In [66]:
for step in range(STEP):
    losses=[]  # per-batch losses of this pass only
    for batch in getBatch(BATCH_SIZE,train_data):
        inputs,targets = pad_to_batch(batch,word2index,word2index)

        # B,T mask: non-zero exactly where the input is '<PAD>'. One
        # vectorized comparison instead of a Python-level loop per element.
        input_mask = Variable(inputs.data.eq(word2index['<PAD>']))
        # B,1 column of '<SOS>' indices: the first decoder input of each sequence.
        start_decode = Variable(LongTensor([[word2index['<SOS>']]*targets.size(0)])).transpose(0,1)

        encoder.zero_grad()
        decoder.zero_grad()
        output, hidden_c = encoder(inputs,input_mask)

        # Decode for as many steps as the padded target length; True enables dropout.
        preds = decoder(start_decode,hidden_c,targets.size(1),True)

        loss = loss_function(preds,targets.view(-1))
        # .cpu() is harmless on CPU tensors, and ravel() lets the same line
        # handle both a 1-element and a 0-dim loss array.
        losses.append(loss.data.cpu().numpy().ravel()[0])
        loss.backward()
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 0.5) # gradient clipping
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 0.5) # gradient clipping
        enc_optimizer.step()
        dec_optimizer.step()

    # Every 10th pass: report the mean loss of this pass and checkpoint both models.
    if step % 10==0:
        print("[%d/%d] mean_loss : %0.2f" %(step,STEP,np.mean(losses)))
        torch.save(decoder.state_dict(),os.path.join('../','decoder.pkl'))
        torch.save(encoder.state_dict(),os.path.join('../', 'encoder.pkl'))

[0/100] mean_loss : 6.03
[10/100] mean_loss : 4.33
[20/100] mean_loss : 3.19
[30/100] mean_loss : 2.37
[40/100] mean_loss : 1.73
[50/100] mean_loss : 1.25
[60/100] mean_loss : 0.93
[70/100] mean_loss : 0.74
[80/100] mean_loss : 0.62
[90/100] mean_loss : 0.55


### Test

In [68]:
# Interactive chat loop: read a line, encode it, decode 10 tokens greedily and
# print the reply. Stop on Ctrl-C / kernel interrupt or on end of input.
while True:
    try:
        test = input()
        tokens = tknz.tokenize(test.lower())
        if not tokens:
            # Nothing to encode (blank line): ask again instead of crashing.
            continue
        input_ = prepare_sequence(tokens,word2index).view(1,-1)
        # 1,T mask: non-zero where the input is '<PAD>'.
        input_mask = Variable(input_.data.eq(word2index['<PAD>']))
        start_decode = Variable(LongTensor([[word2index['<SOS>']]*1])).transpose(0,1)
        o, hidden_c = encoder(input_,input_mask)
        pred = decoder(start_decode,hidden_c,10)
        # .cpu() is harmless when the tensor is already on the CPU.
        pred = pred.max(1)[1].data.cpu().tolist()

        print('USER : '+ test)
        # print(' '.join(test[1]))
        print('BOT : '+' '.join([index2word[t] for t in pred]))
    except (KeyboardInterrupt, EOFError):
        break

hi
USER : hi
BOT : i can t know what i don talking about .
do you know kimchi?
USER : do you know kimchi?
BOT : hell ! ! ! the the kimchi !! victoria !
do you know Jisung park?
USER : do you know Jisung park?
BOT : yes . i idea to idea ? about idea ?
hmm
USER : hmm
BOT : it is a matter . i just 't saying .
what is your name
USER : what is your name
BOT : what me t understand :( . i me to discuss
fuck you
USER : fuck you
BOT : yes are are name ? are are are about discuss
hell
USER : hell
BOT : second , go thermodynamics ! the vain , of .
you stupid
USER : you stupid
BOT : i don t know what . i would like .
have a good night
USER : have a good night
BOT : how about we to something to both . about about
fuck..
USER : fuck..
BOT : yeah . i about . your . about . .
are you chatbot?
USER : are you chatbot?
BOT : no ,,!! i 'm... . they by students . i
mm
USER : mm
BOT : you you make the first to your . the iron
bye
USER : bye
BOT : here , ! the ! , . , . ,


Because the data is noisy, it is hard for the model to converge.