In [41]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re  
import random
import numpy as np
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import collections
from itertools import dropwhile

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [42]:
#!pip3 install sacrebleu
from sacrebleu import corpus_bleu

In [43]:
#read in chinese-english pairs
#read in chinese-english pairs
lines_zh = open('iwslt-zh-en/train.tok.zh',encoding = 'utf-8').read().strip().split('\n')
lines_en = open('iwslt-zh-en/train.tok.en',encoding = 'utf-8').read().strip().split('\n')
lines_zh_test = open('iwslt-zh-en/test.tok.zh',encoding = 'utf-8').read().strip().split('\n')
lines_en_test = open('iwslt-zh-en/test.tok.en',encoding = 'utf-8').read().strip().split('\n')
lines_zh_val = open('iwslt-zh-en/dev.tok.zh',encoding = 'utf-8').read().strip().split('\n')
lines_en_val = open('iwslt-zh-en/dev.tok.en',encoding = 'utf-8').read().strip().split('\n')

In [44]:
def delect_least_common_words(list_sent, threshold = 5):
    ret_list =[]
    for x in list_sent:
        ret_list += x.split()
    ret_dic = collections.Counter(ret_list)
    for key, count in dropwhile(lambda key_count: key_count[1] >= threshold, ret_dic.most_common()):
        del ret_dic[key]
    return list(ret_dic.keys())

In [45]:
zh_words = delect_least_common_words(lines_zh)
en_words = delect_least_common_words(lines_en)

In [46]:
words_to_load = 100000
PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

with open('cc.zh.300.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+3, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    ordered_words_ft.extend(['<pad>', '<unk>', '<s>'])
    loaded_embeddings_ft[0,:] = np.zeros(300)
    loaded_embeddings_ft[1,:] = np.random.normal(size = 300)
    loaded_embeddings_ft[2,:] = np.random.normal(size = 300)
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+3, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+3
        idx2words_ft[i+3] = s[0]
        ordered_words_ft.append(s[0])
    length = len(np.setdiff1d(zh_words, ordered_words_ft))
    tmp_embeddings = np.zeros((length, 300))
    for idx, word in enumerate(np.setdiff1d(zh_words, ordered_words_ft)):
        words_ft[word] = idx+words_to_load+3
        idx2words_ft[idx+words_to_load+3] = word
        tmp_embeddings[idx, :] = np.random.normal(size = 300)
    loaded_embeddings_ft = np.concatenate((loaded_embeddings_ft, tmp_embeddings), axis = 0)
    words_ft['<pad>'] = PAD_IDX
    words_ft['<unk>'] = UNK_IDX
    words_ft['<s>'] = SOS_IDX
    idx2words_ft[PAD_IDX] = '<pad>'
    idx2words_ft[UNK_IDX] = '<unk>'
    idx2words_ft[SOS_IDX] = '<s>'

In [47]:
#English embedding
with open('wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft_en = np.zeros((words_to_load+4, 300))
    words_ft_en = {}
    idx2words_ft_en = {}
    ordered_words_ft_en = []
    ordered_words_ft_en.extend(['<pad>', '<unk>', '<s>', '</s>'])
    loaded_embeddings_ft_en[0,:] = np.zeros(300)
    loaded_embeddings_ft_en[1,:] = np.random.normal(size = 300)
    loaded_embeddings_ft_en[2,:] = np.random.normal(size = 300)
    loaded_embeddings_ft_en[3,:] = np.random.normal(size = 300)
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft_en[i+4, :] = np.asarray(s[1:])
        words_ft_en[s[0]] = i+4
        idx2words_ft_en[i+4] = s[0]
        ordered_words_ft_en.append(s[0])
    length = len(np.setdiff1d(en_words, ordered_words_ft_en))
    tmp_embeddings = np.zeros((length, 300))
    for idx, word in enumerate(np.setdiff1d(en_words, ordered_words_ft_en)):
        words_ft_en[word] = idx+words_to_load+4
        idx2words_ft_en[idx+words_to_load+4] = word
        tmp_embeddings[idx, :] = np.random.normal(size = 300)
    loaded_embeddings_ft_en = np.concatenate((loaded_embeddings_ft_en, tmp_embeddings), axis = 0)
    words_ft_en['<pad>'] = PAD_IDX
    words_ft_en['<unk>'] = UNK_IDX
    words_ft_en['<s>'] = SOS_IDX
    words_ft_en['</s>'] = EOS_IDX
    idx2words_ft_en[PAD_IDX] = '<pad>'
    idx2words_ft_en[UNK_IDX] = '<unk>'
    idx2words_ft_en[SOS_IDX] = '<s>'
    idx2words_ft_en[EOS_IDX] = '</s>'

In [6]:
#add sos and eos in each sentence
def add_sos_eos(lines):
    
    train = []
    for l in lines:
        l = '<s> ' + l + '</s>'
        train.append(l)
    return train
zh_train = add_sos_eos(lines_zh)    
en_train = add_sos_eos(lines_en)
zh_test = add_sos_eos(lines_zh_test)
en_test = add_sos_eos(lines_en_test)
zh_val = add_sos_eos(lines_zh_val)
en_val = add_sos_eos(lines_en_val)

In [7]:
# convert token to id in the dataset
def token2index_dataset(tokens_data,eng = False):
    indices_data = []
    for tokens in tokens_data:
        index_list = []
        for token in tokens.split():
            if eng == False:
                try:
                    index_list.append(words_ft[token])
                except KeyError:
                    index_list.append(UNK_IDX)
            else:
                try:
                    index_list.append(words_ft_en[token])
                except KeyError:
                    index_list.append(UNK_IDX)
        indices_data.append(index_list)
    return indices_data

In [8]:
zh_train_indices = token2index_dataset(zh_train)
en_train_indices = token2index_dataset(en_train,eng = True)
zh_test_indices = token2index_dataset(zh_test)
en_test_indices = token2index_dataset(en_test,eng = True)

In [81]:
#max_sentence_length
length_of_en = [len(x.split()) for x in en_train]
max_sentence_length_en = sorted(length_of_en)[-int(len(length_of_en)*0.01)]
length_of_zh = [len(x.split()) for x in zh_train]
max_sentence_length_zh = sorted(length_of_zh)[-int(len(length_of_zh)*0.01)]
max_sentence_length_en = 50
max_sentence_length_zh = 50

In [82]:
max_sentence_length_zh

50

In [83]:
#Create Data Loader
import torch
from torch.utils.data import Dataset

class load_dataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list_s1,data_list_s2):
        """
        @param data_list_zh: list of Chinese tokens 
        @param data_list_en: list of English tokens as TARGETS
        """
        self.data_list_s1 = data_list_s1
        self.data_list_s2 = data_list_s2
        
        assert (len(self.data_list_s1) == len(self.data_list_s2))

    def __len__(self):
        return len(self.data_list_s1)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx_s1 = self.data_list_s1[key][:max_sentence_length_zh]
        token_idx_s2 = self.data_list_s2[key][:max_sentence_length_en]
        return [token_idx_s1, token_idx_s2, len(token_idx_s1), len(token_idx_s2)]

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list_s1 = []
    data_list_s2 = []
    length_list_s1 = []
    length_list_s2 = []
    for datum in batch:
        length_list_s1.append(datum[2])
        length_list_s2.append(datum[3])
        padded_vec_zh = np.pad(np.array(datum[0]), 
                                pad_width=((0,max_sentence_length_zh-datum[2])), 
                                mode="constant", constant_values=0)
        padded_vec_en = np.pad(np.array(datum[1]), 
                                pad_width=((0,max_sentence_length_en-datum[3])), 
                                mode="constant", constant_values=0)
        data_list_s1.append(padded_vec_zh[:max_sentence_length_zh])
        data_list_s2.append(padded_vec_en[:max_sentence_length_en])
    #print(type(data_list_s1[0]))
    if torch.cuda.is_available and torch.has_cudnn:
        return [torch.from_numpy(np.array(data_list_s1)).cuda(), torch.from_numpy(np.array(data_list_s2)).cuda(),
                torch.LongTensor(length_list_s1).cuda(), torch.LongTensor(length_list_s2).cuda()]
    else:    
        return [torch.from_numpy(np.array(data_list_s1)), torch.from_numpy(np.array(data_list_s2)),
                torch.LongTensor(length_list_s1), torch.LongTensor(length_list_s2)]
    


In [84]:
BATCH_SIZE = 100
EMBEDDING_SIZE = 300 # fixed as from the input embedding data

train_dataset = load_dataset(zh_train_indices, en_train_indices)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_func,
                                           shuffle=True)

val_dataset = load_dataset(zh_test_indices, en_test_indices)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_func,
                                           shuffle=False)

### With Attention

In [127]:
class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, hidden_size, embed= torch.from_numpy(loaded_embeddings_ft).float(),num_layers=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.num_layers = num_layers 
        
        # freeze needs to set to be false as we need the random embeddings to train with the pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(embed, freeze=False)
        self.gru = nn.GRU(emb_dim, hidden_size,num_layers=num_layers,batch_first=True,bidirectional = True)

    def forward(self, data, hidden):
        
        batch_size, seq_len = data.size()
        
        embed = self.embedding(data)
        output, hidden = self.gru(embed,hidden)
        hidden = torch.cat((hidden[0:1,:,:], hidden[1:2,:,:]), 2)

        #hidden = [n layers * n directions =1 , batch_size, hidden_size ]
        
        return output, hidden

    # initialize the hidden with random numbers
    def initHidden(self,batch_size):
        return torch.randn(2*self.num_layers, batch_size, self.hidden_size,device=device)

In [122]:
class AttnDecoderRNN(nn.Module):
    def __init__(self,emb_dim,hidden_size, output_size, embed= torch.from_numpy(loaded_embeddings_ft_en).float(),num_layers=1,
                 dropout_p=0.1, max_length=max_sentence_length_zh):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size*2
        self.num_layers = num_layers 
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding.from_pretrained(embed, freeze=False)
        self.attn = nn.Linear(self.hidden_size + emb_dim, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size + emb_dim, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, data, hidden,encoder_outputs):
        
        ### embed: [1 * batch size * emb_dim = 300 ] ###
        ### hidden: [1 * batch size * hidden_size = 300 ] ###
        ### encoder_outputs: [batch size * max_sentence_length_zh * hidden_size = 300 ] ###
        ### 因为这里concat之后，attn layer 他给的是 hidden size *2 
        ### 所以我这儿的hidden size就只能写300了 
        
        embed = self.embedding(data)
        embed = self.dropout(embed)   
 
        ### torch.cat((embed, hidden), 2)  
        ### [1 * batch size * (emb_dim + hidden_size) ]
        
        ### attn_weights: [1 * batch size * max_sentence_length_zh ]###
        ### attn_weights[0].unsqueeze(1): [batch size * 1 * max_sentence_length_zh ]###
        
        ### softmax dim=2 因为最后一个dimension是 词组什么的，不能是1，1的话就是
        ### 不同batch间这样比较了？
        #hidden = [1 * batch_size * emb_dim]
        
        attn_weights = F.softmax(
            self.attn(torch.cat((embed[0], hidden[0]), 1)), dim=1).unsqueeze(1)

        ### torch.bmm(attn_weights[0].unsqueeze(1),encoder_outputs).squeeze(1) :
        ### [batch size * 1 * hidden_size ]###
        ### attn_applied: [batch size * hidden_size (= 300) ] ###
        attn_applied = torch.bmm(attn_weights,
                                 encoder_outputs).squeeze(1)
        ### output: [batch size * hidden_size (= 300) ] ###
        ### embed[0]: [batch size * hidden_size (= 300) ] ###
        output = torch.cat((embed[0], attn_applied), 1)
        ### output: [1 * batch size * hidden_size (= 300) ] ###
        output = self.attn_combine(output).unsqueeze(0)
        
        ### output: [1 * batch size * hidden_size (= 300) ] ###
        output = F.relu(output)
        
        #print(hidden.size())
        #print(output.size())

        output, hidden = self.gru(output, hidden)
        
        
        output = self.softmax(self.out(output[0]))
        
        return output, hidden, attn_weights

    def initHidden(self,batch_size):
        return torch.randn(self.num_layers, batch_size, self.hidden_size,device=device)

In [123]:
teacher_forcing_ratio = 1
#input_tensor: list of sentence tensor
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion,eee):
    
    ### target_tensor [batch size, max_sentence_length_en = 73] ###
    ### target_tensor [batch size, max_sentence_length_zh = 62] ###
    batch_size_1, input_length = input_tensor.size()
    batch_size_2, target_length = target_tensor.size()
    
    
    encoder_hidden = encoder.initHidden(batch_size_1)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    ### encoder_hidden: 1 * batch * hidden size ### 
    ### encoder_output: batch size * max_sentence_length_zh * hidden size ### 
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)

    decoder_input = torch.tensor(np.array([[SOS_IDX]]*batch_size_1).reshape(1,batch_size_1),device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    #print(use_teacher_forcing)
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            
            ### decoder_output: [batchsize,5000] ###
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden,encoder_output)
        
            
            loss += criterion(decoder_output, target_tensor[:,di])
            decoder_input = target_tensor[:,di].unsqueeze(0)  # Teacher forcing
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden,encoder_output)
                        
            ### decoder_output [batch size, 50003]  ###
            
            ### topi is a [batch size, 1] tensor first we remove the size 1
            ### demension then we add it at the beginning using squeeze
            ### 有点脑残诶，做个转置不就好了？
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            
            ### decoder_input [1, batch size]  ###
            decoder_input = decoder_input.unsqueeze(0)
 
            loss += criterion(decoder_output, target_tensor[:,di])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [124]:
def trainIters(encoder, decoder, n_iters, folder, print_every=1, plot_every=100, learning_rate=0.001):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    
    #--------------------------------------------	
    #	
    #    LOAD MODELS	
    #	
    #--------------------------------------------	
    	
    if not os.path.exists(folder):	
        os.makedirs(folder)	

    if os.path.exists('./attentation_model/encoder_b'):	
        print('---------------------------------------------------------------------')	
        print('----------------Readind trained model---------------------------------')	
        print('---------------------------------------------------------------------')	
        	
        #read trained models	
        encoder.load_state_dict(torch.load(folder+"/encoder_b"))
        decoder.load_state_dict(torch.load(folder+"/decoder_b"))	
    
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    criterion = nn.CrossEntropyLoss()

    for iter in range(1, n_iters + 1):
        for i, (data_s1, data_s2, lengths_s1, lengths_s2) in enumerate(train_loader):
            input_tensor = data_s1
            target_tensor = data_s2
            #print("train",target_tensor.size())
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion,i)
            print_loss_total += loss
            plot_loss_total += loss

            if i % print_every == 0:
                print_loss_avg = print_loss_total / print_every
                print_loss_total = 0
                print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                             iter, iter / n_iters * 100, print_loss_avg))

            if i % plot_every == 0:
                plot_loss_avg = plot_loss_total / plot_every
                plot_losses.append(plot_loss_avg)
                plot_loss_total = 0
                
        # Save the model for every epoch
        print('---------------------------------------------------------------------')	
        print('----------------Saving trained model---------------------------------')	
        print('---------------------------------------------------------------------')	
      
        torch.save(encoder.state_dict(),folder +"/encoder_b")
        torch.save(decoder.state_dict(),folder +"/decoder_b")

    
    return plot_losses




In [125]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


In [None]:
hidden_size = 300
encoder1 = EncoderRNN(EMBEDDING_SIZE,hidden_size).to(device)
decoder1 = AttnDecoderRNN(EMBEDDING_SIZE,hidden_size, len(ordered_words_ft)).to(device)
epoch_size = 20
# ##UNCOMMENT TO TRAIN THE MODEL
# trainIters(encoder1, decoder1, epoch_size, './attention_model/GRU_TF1_H300_ES20',print_every=50)

0m 0s (- 0m 10s) (1 5%) 0.2166
0m 27s (- 8m 35s) (1 5%) 3.4213
0m 53s (- 16m 59s) (1 5%) 2.6148
1m 20s (- 25m 24s) (1 5%) 2.5240
1m 46s (- 33m 48s) (1 5%) 2.4414
2m 13s (- 42m 13s) (1 5%) 2.4618
2m 39s (- 50m 38s) (1 5%) 2.3513
3m 6s (- 59m 2s) (1 5%) 2.3600
3m 33s (- 67m 27s) (1 5%) 2.3488
3m 59s (- 75m 51s) (1 5%) 2.2940
4m 26s (- 84m 16s) (1 5%) 2.2474
4m 52s (- 92m 40s) (1 5%) 2.1991
5m 19s (- 101m 5s) (1 5%) 2.1781
5m 45s (- 109m 30s) (1 5%) 2.1724
6m 12s (- 117m 54s) (1 5%) 2.0973
6m 38s (- 126m 19s) (1 5%) 2.0806
7m 5s (- 134m 44s) (1 5%) 2.0396
7m 32s (- 143m 9s) (1 5%) 2.0493
7m 58s (- 151m 33s) (1 5%) 2.0511
8m 25s (- 159m 58s) (1 5%) 2.0343
8m 51s (- 168m 23s) (1 5%) 1.9840
9m 18s (- 176m 47s) (1 5%) 2.0068
9m 44s (- 185m 12s) (1 5%) 1.9850
10m 11s (- 193m 37s) (1 5%) 1.9651
10m 37s (- 202m 1s) (1 5%) 1.9547
11m 4s (- 210m 26s) (1 5%) 1.9098
11m 31s (- 218m 51s) (1 5%) 1.9362
11m 57s (- 227m 15s) (1 5%) 1.9437
12m 24s (- 235m 40s) (1 5%) 1.9062
12m 50s (- 244m 5s) (1 5%) 1.9

In [None]:
### Load the best models
hidden_size=300

folder = './attentation_model'	

encoder2 = EncoderRNN(EMBEDDING_SIZE,hidden_size).to(device)
decoder2 = AttnDecoderRNN(EMBEDDING_SIZE,hidden_size, len(ordered_words_ft)).to(device)

encoder2.load_state_dict(torch.load(folder+"/Encoder_b"))
decoder2.load_state_dict(torch.load(folder+"/Decoder_b"))

In [26]:
#loader can be test_loader or val_loader
def evaluate(loader, encoder, decoder, beam = False, beam_k = 1):
    out_predictions = ''
    out_references = ''
    with torch.no_grad():
        for i, (data_s1, data_s2, lengths_s1, lengths_s2) in enumerate(loader):
            input_tensor = data_s1
            input_length = input_tensor.size()[0]
            #sentence_length to the output length
            sentence_length = data_s2.size()[1]
            encoder_hidden = encoder.initHidden(input_length)

            encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
            
            #decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
            decoder_input = torch.tensor(np.array([[SOS_IDX]]*input_length).reshape(1,input_length),device=device)

            decoder_hidden = encoder_hidden

            decoder_attentions = torch.zeros(sentence_length, sentence_length)
            decoded_words_eval = []
            sequences = [[list(), 1.0]]*input_length
            for di in range(sentence_length):
                decoded_words_sub = []
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_output)
                # decoder_attentions[di] = decoder_attention.data
                # topk(1) - softmax probability maximum
                if beam == True:
                    pass
#                     topv, topi = decoder_output.data.topk(beam_k)
#                     #batch loop
#                     C = []
#                     for idx, ind in enumerate(topi):
#                         H, _ = sequences[idx]
#                         for ele in ind:
#                             if ele.item() == EOS_IDX:
#                                 H.append('<EOS>')
#                             else:
#                                 H.append(idx2words_ft_en[ele.item()])
                         
                else:
                    topv, topi = decoder_output.data.topk(1) 
                #batch loop
                
                eos_flag = False
                for ind in topi:
                    if eos_flag == False:
                        if ind.item() == EOS_IDX:
                            decoded_words_sub.append('</s>')
                            eos_flag = True
                            #break
                        else:
                            decoded_words_sub.append(idx2words_ft_en[ind.item()])
                    else:
                        decoded_words_sub.append("<pad>")
                        
                
                decoded_words_eval.append(decoded_words_sub)
                
                #swap dimensions of decoded_words to [batch_size * 377]
                
                #decoded_words_new = [[i for i in ele] for ele in list(zip(*decoded_words_eval))]

                #change the dimension
                decoder_input = topi.squeeze().detach()
                decoder_input = decoder_input.unsqueeze(0)
            
            

            listed_predictions = ''
            
            
            decoded_words_new = [[i for i in ele] for ele in list(zip(*decoded_words_eval))]
            for token_list in decoded_words_new:
                sent = ' '.join(str(token) for token in token_list if token!="<pad>")
                #print (sent)
                listed_predictions= list_predictions + ' '+ sent
                
            listed_reference = ' '
            for ele in data_s2:
                sent = index2token_sentence(ele)
                #print (tokens)
                #sent = ' '.join(tokens)
                #print (sent)
                listed_reference = listed_reference + ' ' + sent
            
        out_predictions = out_predictions + ' ' + listed_predictions
        out_references = out_references + ' ' + listed_reference
        bleu_score = corpus_bleu(out_predictions, out_references)
            
            #uncommon to print prediction and reference
            #print (listed_predictions)
            #print (listed_reference)
            
            
        print('BLEU Score is %s' % (str(bleu_score.score)))
           
        return bleu_score, decoded_words_new, decoder_attentions[:di + 1]
    
def index2token_batch(list_of_list):
    return ' '.join(idx2words_ft_en[r.item()] for v in list_of_list for r in v if r.item()!=PAD_IDX)
def index2token_sentence(sentence_batch):
    return ' '.join(idx2words_ft_en[sent.item()] for sent in sentence_batch if sent.item()!=PAD_IDX)

In [29]:
blue, output_words, attentions = evaluate(val_loader,
    encoder1, decoder1)

BLEU Score is 20.099632842263766
BLEU Score is 17.2331000405467
BLEU Score is 18.263529593949873
BLEU Score is 18.156729843585964
BLEU Score is 16.248083583710795
BLEU Score is 16.6067262093732
BLEU Score is 23.67949056871905
BLEU Score is 17.897392446235028
BLEU Score is 20.546576246012076
BLEU Score is 20.05563999110098
BLEU Score is 16.732889645990205
BLEU Score is 16.704218238382563
BLEU Score is 18.362462385489334
BLEU Score is 20.55813622830437
