In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re  
import random
import numpy as np
import os
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import collections
from itertools import dropwhile
import pickle as pkl

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
#!pip3 install sacrebleu
from sacrebleu import corpus_bleu

In [3]:
#read in chinese-english pairs
#read in chinese-english pairs
lines_zh = open('iwslt-zh-en/train.tok.zh',encoding = 'utf-8').read().strip().split('\n')
lines_en = open('iwslt-zh-en/train.tok.en',encoding = 'utf-8').read().strip().split('\n')
lines_zh_test = open('iwslt-zh-en/test.tok.zh',encoding = 'utf-8').read().strip().split('\n')
lines_en_test = open('iwslt-zh-en/test.tok.en',encoding = 'utf-8').read().strip().split('\n')
lines_zh_val = open('iwslt-zh-en/dev.tok.zh',encoding = 'utf-8').read().strip().split('\n')
lines_en_val = open('iwslt-zh-en/dev.tok.en',encoding = 'utf-8').read().strip().split('\n')

In [4]:
def delect_least_common_words(list_sent, threshold = 5):
    ret_list =[]
    for x in list_sent:
        ret_list += x.split()
    ret_dic = collections.Counter(ret_list)
    for key, count in dropwhile(lambda key_count: key_count[1] >= threshold, ret_dic.most_common()):
        del ret_dic[key]
    return list(ret_dic.keys())

In [5]:
zh_words = delect_least_common_words(lines_zh)
en_words = delect_least_common_words(lines_en)

In [6]:
words_to_load = 100000
PAD_IDX = 0
UNK_IDX = 1
SOS_IDX = 2
EOS_IDX = 3

with open('cc.zh.300.vec') as f:
    loaded_embeddings_ft = np.zeros((words_to_load+3, 300))
    words_ft = {}
    idx2words_ft = {}
    ordered_words_ft = []
    ordered_words_ft.extend(['<pad>', '<unk>', '<s>'])
    loaded_embeddings_ft[0,:] = np.zeros(300)
    loaded_embeddings_ft[1,:] = np.random.normal(size = 300)
    loaded_embeddings_ft[2,:] = np.random.normal(size = 300)
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft[i+3, :] = np.asarray(s[1:])
        words_ft[s[0]] = i+3
        idx2words_ft[i+3] = s[0]
        ordered_words_ft.append(s[0])
    length = len(np.setdiff1d(zh_words, ordered_words_ft))
    tmp_embeddings = np.zeros((length, 300))
    for idx, word in enumerate(np.setdiff1d(zh_words, ordered_words_ft)):
        words_ft[word] = idx+words_to_load+3
        idx2words_ft[idx+words_to_load+3] = word
        tmp_embeddings[idx, :] = np.random.normal(size = 300)
    loaded_embeddings_ft = np.concatenate((loaded_embeddings_ft, tmp_embeddings), axis = 0)
    words_ft['<pad>'] = PAD_IDX
    words_ft['<unk>'] = UNK_IDX
    words_ft['<s>'] = SOS_IDX
    idx2words_ft[PAD_IDX] = '<pad>'
    idx2words_ft[UNK_IDX] = '<unk>'
    idx2words_ft[SOS_IDX] = '<s>'
    ordered_words_ft = list(words_ft.keys())

In [7]:
#English embedding
with open('wiki-news-300d-1M.vec') as f:
    loaded_embeddings_ft_en = np.zeros((words_to_load+4, 300))
    words_ft_en = {}
    idx2words_ft_en = {}
    ordered_words_ft_en = []
    ordered_words_ft_en.extend(['<pad>', '<unk>', '<s>', '</s>'])
    loaded_embeddings_ft_en[0,:] = np.zeros(300)
    loaded_embeddings_ft_en[1,:] = np.random.normal(size = 300)
    loaded_embeddings_ft_en[2,:] = np.random.normal(size = 300)
    loaded_embeddings_ft_en[3,:] = np.random.normal(size = 300)
    for i, line in enumerate(f):
        if i >= words_to_load: 
            break
        s = line.split()
        loaded_embeddings_ft_en[i+4, :] = np.asarray(s[1:])
        words_ft_en[s[0]] = i+4
        idx2words_ft_en[i+4] = s[0]
        ordered_words_ft_en.append(s[0])
    length = len(np.setdiff1d(en_words, ordered_words_ft_en))
    tmp_embeddings = np.zeros((length, 300))
    for idx, word in enumerate(np.setdiff1d(en_words, ordered_words_ft_en)):
        words_ft_en[word] = idx+words_to_load+4
        idx2words_ft_en[idx+words_to_load+4] = word
        tmp_embeddings[idx, :] = np.random.normal(size = 300)
    loaded_embeddings_ft_en = np.concatenate((loaded_embeddings_ft_en, tmp_embeddings), axis = 0)
    words_ft_en['<pad>'] = PAD_IDX
    words_ft_en['<unk>'] = UNK_IDX
    words_ft_en['<s>'] = SOS_IDX
    words_ft_en['</s>'] = EOS_IDX
    idx2words_ft_en[PAD_IDX] = '<pad>'
    idx2words_ft_en[UNK_IDX] = '<unk>'
    idx2words_ft_en[SOS_IDX] = '<s>'
    idx2words_ft_en[EOS_IDX] = '</s>'
    ordered_words_ft_en = list(words_ft_en.keys())

In [8]:
#add sos and eos in each sentence
def add_sos_eos(lines):
    
    train = []
    for l in lines:
        l = '<s> ' + l + ' </s>'
        train.append(l)
    return train
zh_train = add_sos_eos(lines_zh)    
en_train = add_sos_eos(lines_en)
zh_test = add_sos_eos(lines_zh_test)
en_test = add_sos_eos(lines_en_test)
zh_val = add_sos_eos(lines_zh_val)
en_val = add_sos_eos(lines_en_val)

In [9]:
# convert token to id in the dataset
def token2index_dataset(tokens_data,eng = False):
    indices_data = []
    for tokens in tokens_data:
        index_list = []
        for token in tokens.split():
            if eng == False:
                try:
                    index_list.append(words_ft[token])
                except KeyError:
                    index_list.append(UNK_IDX)
            else:
                try:
                    index_list.append(words_ft_en[token])
                except KeyError:
                    index_list.append(UNK_IDX)
        indices_data.append(index_list)
    return indices_data

In [10]:
zh_train_indices = token2index_dataset(zh_train)
en_train_indices = token2index_dataset(en_train,eng = True)
zh_test_indices = token2index_dataset(zh_test)
en_test_indices = token2index_dataset(en_test,eng = True)

In [11]:
#max_sentence_length
length_of_en = [len(x.split()) for x in en_train]
max_sentence_length_en = sorted(length_of_en)[-int(len(length_of_en)*0.01)]
length_of_zh = [len(x.split()) for x in zh_train]
max_sentence_length_zh = sorted(length_of_zh)[-int(len(length_of_zh)*0.01)]

In [12]:
max_sentence_length_zh

69

In [13]:
#Create Data Loader
import torch
from torch.utils.data import Dataset

class load_dataset(Dataset):
    """
    Class that represents a train/validation/test dataset that's readable for PyTorch
    Note that this class inherits torch.utils.data.Dataset
    """
    
    def __init__(self, data_list_s1,data_list_s2):
        """
        @param data_list_zh: list of Chinese tokens 
        @param data_list_en: list of English tokens as TARGETS
        """
        self.data_list_s1 = data_list_s1
        self.data_list_s2 = data_list_s2
        
        assert (len(self.data_list_s1) == len(self.data_list_s2))

    def __len__(self):
        return len(self.data_list_s1)
        
    def __getitem__(self, key):
        """
        Triggered when you call dataset[i]
        """
        
        token_idx_s1 = self.data_list_s1[key][:max_sentence_length_zh]
        token_idx_s2 = self.data_list_s2[key][:max_sentence_length_en]
        return [token_idx_s1, token_idx_s2, len(token_idx_s1), len(token_idx_s2)]

def collate_func(batch):
    """
    Customized function for DataLoader that dynamically pads the batch so that all 
    data have the same length
    """
    data_list_s1 = []
    data_list_s2 = []
    length_list_s1 = []
    length_list_s2 = []
    for datum in batch:
        length_list_s1.append(datum[2])
        length_list_s2.append(datum[3])
        padded_vec_zh = np.pad(np.array(datum[0]), 
                                pad_width=((0,max_sentence_length_zh-datum[2])), 
                                mode="constant", constant_values=0)
        padded_vec_en = np.pad(np.array(datum[1]), 
                                pad_width=((0,max_sentence_length_en-datum[3])), 
                                mode="constant", constant_values=0)
        data_list_s1.append(padded_vec_zh[:max_sentence_length_zh])
        data_list_s2.append(padded_vec_en[:max_sentence_length_en])
    #print(type(data_list_s1[0]))
    if torch.cuda.is_available and torch.has_cudnn:
        return [torch.from_numpy(np.array(data_list_s1)).cuda(), torch.from_numpy(np.array(data_list_s2)).cuda(),
                torch.LongTensor(length_list_s1).cuda(), torch.LongTensor(length_list_s2).cuda()]
    else:    
        return [torch.from_numpy(np.array(data_list_s1)), torch.from_numpy(np.array(data_list_s2)),
                torch.LongTensor(length_list_s1), torch.LongTensor(length_list_s2)]
    


In [14]:
BATCH_SIZE = 50
EMBEDDING_SIZE = 300 # fixed as from the input embedding data

train_dataset = load_dataset(zh_train_indices, en_train_indices)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_func,
                                           shuffle=True)

val_dataset = load_dataset(zh_test_indices, en_test_indices)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                           batch_size=BATCH_SIZE,
                                           collate_fn=collate_func,
                                           shuffle=False)

### With Attention

In [15]:
class EncoderRNN(nn.Module):
    def __init__(self, emb_dim, hidden_size, embed= torch.from_numpy(loaded_embeddings_ft).float(),num_layers=1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.emb_dim = emb_dim
        self.num_layers = num_layers 
        
        # freeze needs to set to be false as we need the random embeddings to train with the pretrained embeddings
        self.embedding = nn.Embedding.from_pretrained(embed, freeze=False)
        self.gru = nn.GRU(emb_dim, hidden_size,num_layers=num_layers,batch_first=True,bidirectional = True)

    def forward(self, data, hidden):
        
        batch_size, seq_len = data.size()
        
        embed = self.embedding(data)
        output, hidden = self.gru(embed,hidden)
#         hidden = torch.cat((hidden[0:1,:,:], hidden[1:2,:,:]), 2)
        hidden = torch.sum(hidden, dim = 0).unsqueeze(0)
        output = (output[:, :, :self.hidden_size] +
                output[:, :, self.hidden_size:])
        #hidden = [n layers * n directions =1 , batch_size, hidden_size ]
        return output, hidden

    # initialize the hidden with random numbers
    def initHidden(self,batch_size):
        return torch.randn(2*self.num_layers, batch_size, self.hidden_size,device=device)

In [16]:
class AttnDecoderRNN(nn.Module):
    def __init__(self,emb_dim,hidden_size, output_size, embed= torch.from_numpy(loaded_embeddings_ft_en).float(),num_layers=1,
                 dropout_p=0.1, max_length=max_sentence_length_zh):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers 
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding.from_pretrained(embed, freeze=False)
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        self.attn_combine = nn.Linear(self.hidden_size *2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)

        self.gru = nn.GRU(emb_dim, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, data, hidden,encoder_outputs):
        
        ### embed: [1 * batch size * emb_dim = 300 ] ###
        ### hidden: [1 * batch size * hidden_size = 300 ] ###
        ### encoder_outputs: [batch size * max_sentence_length_zh * hidden_size = 300 ] ###
        ### 因为这里concat之后，attn layer 他给的是 hidden size *2 
        ### 所以我这儿的hidden size就只能写300了 
        
        embed = self.embedding(data)
        embed = self.dropout(embed)
 
        ### torch.cat((embed, hidden), 2)  
        ### [1 * batch size * (emb_dim + hidden_size) ]
        
        ### attn_weights: [1 * batch size * max_sentence_length_zh ]###
        ### attn_weights[0].unsqueeze(1): [batch size * 1 * max_sentence_length_zh ]###
        
        ### softmax dim=2 因为最后一个dimension是 词组什么的，不能是1，1的话就是
        ### 不同batch间这样比较了？
        #hidden = [1 * batch_size * emb_dim]
        gru_out, hidden = self.gru(embed, hidden)
#         attn_weights = F.softmax(
#             self.attn(torch.cat((embed[0], hidden[0]), 1)), dim=1).unsqueeze(1)
        attn_weights0 = self.attn(hidden).transpose(0,1)
        attn_prod = torch.bmm(attn_weights0, encoder_outputs.transpose(1,2))
        ### torch.bmm(attn_weights[0].unsqueeze(1),encoder_outputs).squeeze(1) :
        ### [batch size * 1 * hidden_size ]###
        ### attn_applied: [batch size * hidden_size (= 300) ] ###
#         attn_applied = torch.bmm(attn_weights,
#                                  encoder_outputs).squeeze(1)
        ### output: [batch size * hidden_size (= 300) ] ###
        ### embed[0]: [batch size * hidden_size (= 300) ] ###
        attn_weights = F.softmax(attn_prod, dim = 2)
#         print(attn_weights.shape)
        context = torch.bmm(attn_weights, encoder_outputs)
        hc = torch.cat([hidden, context.transpose(0,1)], dim =2)
        out_hc = torch.tanh(self.attn_combine(hc))
        output = self.softmax(self.out(out_hc)[0])
#         output = torch.cat((embed[0], attn_applied), 1)
        ### output: [1 * batch size * hidden_size (= 300) ] ###
#         output = self.attn_combine(output).unsqueeze(0)
        ### output: [1 * batch size * hidden_size (= 300) ] ###
#         output = F.relu(output)
        
        #print(hidden.size())
        #print(output.size())

#         output, hidden = self.gru(output, hidden)
        
#         output = self.softmax(self.out(output[0]))
        
        return output, hidden, attn_weights

    def initHidden(self,batch_size):
        return torch.randn(self.num_layers, batch_size, self.hidden_size,device=device)

In [17]:
teacher_forcing_ratio = 1
#input_tensor: list of sentence tensor
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer,
          criterion,eee):
    
    ### target_tensor [batch size, max_sentence_length_en = 73] ###
    ### target_tensor [batch size, max_sentence_length_zh = 62] ###
    batch_size_1, input_length = input_tensor.size()
    batch_size_2, target_length = target_tensor.size()
    
    
    encoder_hidden = encoder.initHidden(batch_size_1)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    loss = 0

    ### encoder_hidden: 1 * batch * hidden size ### 
    ### encoder_output: batch size * max_sentence_length_zh * hidden size ### 
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)

    decoder_input = torch.tensor(np.array([[SOS_IDX]]*batch_size_1).reshape(1,batch_size_1),device=device)
    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    #print(use_teacher_forcing)
    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            
            ### decoder_output: [batchsize,5000] ###
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden,encoder_output)
        
            
            loss += criterion(decoder_output, target_tensor[:,di])
            decoder_input = target_tensor[:,di].unsqueeze(0)  # Teacher forcing
            
    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden,encoder_output)
                        
            ### decoder_output [batch size, 50003]  ###
            
            ### topi is a [batch size, 1] tensor first we remove the size 1
            ### demension then we add it at the beginning using squeeze
            ### 有点脑残诶，做个转置不就好了？
            
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input
            
            ### decoder_input [1, batch size]  ###
            decoder_input = decoder_input.unsqueeze(0)
 
            loss += criterion(decoder_output, target_tensor[:,di])

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [22]:
from torch.optim.lr_scheduler import StepLR, LambdaLR
def trainIters(encoder, decoder, n_iters, folder,lr_decrease = False,print_every=1, plot_every=100, evaluate_every = 50,read_in_model = False,learning_rate=0.001,early_stop_tol = 10e-7):
    start = time.time()
    plot_losses = []
    plot_val = []
    
    loss_history = []
   
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    patience = 0
    
    early_stopped = False
    current_best_bleu = 0
    
    best_encoder = encoder.state_dict()
    best_decoder = decoder.state_dict()
    
    
    #--------------------------------------------	
    #	
    #    LOAD MODELS	
    #	
    #--------------------------------------------	
    	
        
    
    if not os.path.exists(folder):	
        os.makedirs(folder)	

    if read_in_model == True:
        if os.path.exists(folder+'/Encoder'):	
            print('---------------------------------------------------------------------')	
            print('----------------Readind trained model---------------------------------')	
            print('---------------------------------------------------------------------')	

            #read trained models	
            encoder.load_state_dict(torch.load(folder+"/Encoder"))
            decoder.load_state_dict(torch.load(folder+"/Decoder"))	

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    
    if lr_decrease == True:
        encoder_scheduler = StepLR(encoder_optimizer, step_size=1, gamma=0.8)
        decoder_scheduler = StepLR(decoder_optimizer, step_size=1, gamma=0.8)
    
    
    criterion = nn.CrossEntropyLoss()
    #criterion_val = nn.CrossEntropyLoss()

    last_val = 0
    for iter in range(1, n_iters + 1):
        if lr_decrease == True:
            encoder_scheduler.step()
            decoder_scheduler.step()
        for i, (data_s1, data_s2, lengths_s1, lengths_s2) in enumerate(train_loader):
            input_tensor = data_s1
            target_tensor = data_s2
            #print("train",target_tensor.size())
            loss = train(input_tensor, target_tensor, encoder,
                         decoder, encoder_optimizer, decoder_optimizer, criterion,i)
            print_loss_total += loss
            plot_loss_total += loss

            if i % print_every == 0:
                if i != 0:
                    print_loss_avg = print_loss_total / print_every
                    print_loss_total = 0
                    print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                                 iter, iter / n_iters * 100, print_loss_avg))
                    loss_history.append(print_loss_avg)
                else:
                    print_loss_total = 0
                
            if i % plot_every == 0:
                if i != 0:
                    plot_loss_avg = plot_loss_total / plot_every
                    plot_losses.append(plot_loss_avg)
                    plot_loss_total = 0
                    
                    
                else:
                    plot_loss_total = 0
                
            if i % evaluate_every == 0:
                if i != 0:
                    bleu_score,output_words,attentions = evaluate(val_loader, encoder, decoder)
                    if bleu_score > current_best_bleu:
                        current_best_bleu = bleu_score
                        
                        best_encoder = encoder.state_dict()
                        best_decoder = decoder.state_dict()
                        
                    plot_val.append(bleu_score)
                    #print ("BLEU: ",bleu_score)
                    
                    if bleu_score <= current_best_bleu:
                        patience += 1
                        
                    elif bleu_score > current_best_bleu and np.abs(bleu_score - current_best_bleu)/float(current_best_bleu) < early_stop_tol:
                        patience += 1
                    
                    else:
                        patience = 0
                        
                        
                    if patience == 10:
                       
                        torch.save(best_encoder,folder +"/Encoder")
                        torch.save(best_decoder,folder +"/Decoder")
                        early_stopped = True
                        patience = 0
            
                        
                    last_val = bleu_score
                 
        if early_stopped == False:
        
            # Save the model for every epoch
            print('---------------------------------------------------------------------')	
            print('----------------Saving trained model---------------------------------')	
            print('---------------------------------------------------------------------')	

            torch.save(encoder.state_dict(),folder +"/Encoder")
            torch.save(decoder.state_dict(),folder +"/Decoder")
            
    with open(folder+"/loss_hist", 'wb') as f:
         pkl.dump(loss_history, f)
    with open(folder+"/bleu_hist", 'wb') as f:
         pkl.dump(plot_val, f)
    showPlot(plot_losses,title = "Train Loss",name = folder+"/loss.jpeg")
    showPlot(plot_val, title = "BLEU Score on Validation Set",name = folder+"/bleu.jpeg")
    return plot_losses

In [23]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points,title,name):
    plt.figure()
    
    plt.plot(points)
    plt.title(title)
    plt.savefig(name)
    
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))


In [24]:
#loader can be test_loader or val_loader
def evaluate(loader, encoder, decoder, after_train_mode = False,beam = False, beam_k = 1):
    bleu_score_list = []
    big_pred_list = []
    big_ref_list = []
    with torch.no_grad():
        for i, (data_s1, data_s2, lengths_s1, lengths_s2) in enumerate(loader):
            input_tensor = data_s1
            input_length = input_tensor.size()[0]
            #sentence_length to the output length
            sentence_length = data_s2.size()[1]
            encoder_hidden = encoder.initHidden(input_length)

            encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
            
            #decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS
            decoder_input = torch.tensor(np.array([[SOS_IDX]]*input_length).reshape(1,input_length),device=device)

            decoder_hidden = encoder_hidden

            decoder_attentions = torch.zeros(sentence_length, sentence_length)
            decoded_words_eval = []
            for di in range(sentence_length):
                decoded_words_sub = []
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_output)
                # decoder_attentions[di] = decoder_attention.data
                # topk(1) - softmax probability maximum
                if beam == True:
                    pass
#                     topv, topi = decoder_output.data.topk(beam_k)
#                     #batch loop
#                     C = []
#                     for idx, ind in enumerate(topi):
#                         H, _ = sequences[idx]
#                         for ele in ind:
#                             if ele.item() == EOS_IDX:
#                                 H.append('<EOS>')
#                             else:
#                                 H.append(idx2words_ft_en[ele.item()])
                         
                else:
                    topv, topi = decoder_output.data.topk(1) 
                    
                #batch loop
                
                
                for ind in topi:
                    
                    if ind.item() == EOS_IDX:
                        
                        decoded_words_sub.append(idx2words_ft_en[EOS_IDX])
                        
                    else:
                        decoded_words_sub.append(idx2words_ft_en[ind.item()])
                    
                
                decoded_words_eval.append(decoded_words_sub)
                
                #swap dimensions of decoded_words to [batch_size * 377]
                
                #decoded_words_new = [[i for i in ele] for ele in list(zip(*decoded_words_eval))]

                #change the dimension
                decoder_input = topi.squeeze().detach()
                decoder_input = decoder_input.unsqueeze(0)
            
            
            pred_num = 0
            listed_predictions = []
            
            
            decoded_words_new = [[i for i in ele] for ele in list(zip(*decoded_words_eval))]
            
            for token_list in decoded_words_new:
                sent = ' '.join(str(token) for token in token_list if token!="<pad>" and token!="<s>" and token!="</s>")
                #print (sent)
                listed_predictions.append(sent)
                #print (sent)
                pred_num += 1
                
            ref_num = 0
            listed_reference = []
            for ele in data_s2:
                sent = index2token_sentence(ele)
                #print (tokens)
                #sent = ' '.join(tokens)
                #print (sent)
                listed_reference.append(sent)
                ref_num += 1
            
            big_pred_list += listed_predictions
            big_ref_list += listed_reference
            
            assert len(big_pred_list) == len(big_ref_list)
            
            
            #uncommon to print prediction and reference
            #print (listed_predictions)
            #print (listed_reference)
        bleu_score = corpus_bleu(big_pred_list,[big_ref_list]).score
        
        if after_train_mode == True:
            for idx,ele in enumerate(big_pred_list):
                print (ele)
                print (big_ref_list[idx])
                print ("\n")
                
                
    print('BLEU Score is %s' % (str(bleu_score)))
        

    return bleu_score, decoded_words_new, decoder_attentions[:di + 1]
    
def index2token_batch(list_of_list):
    return ' '.join(idx2words_ft_en[r.item()] for v in list_of_list for r in v if r.item()!=PAD_IDX)
def index2token_sentence(sentence_batch):
    return ' '.join(idx2words_ft_en[sent.item()] for sent in sentence_batch if sent.item()!=PAD_IDX and sent.item()!=SOS_IDX and sent.item()!=EOS_IDX)

In [None]:
hidden_size = 200
encoder1 = EncoderRNN(EMBEDDING_SIZE,hidden_size).to(device)
decoder1 = AttnDecoderRNN(EMBEDDING_SIZE,hidden_size, len(ordered_words_ft_en)).to(device)
epoch_size = 20
folder = 'GRU_LR005_decay_H200_ES20'
##UNCOMMENT TO TRAIN THE MODEL
trainIters(encoder1, decoder1, epoch_size, folder ,lr_decrease = True,print_every=50,plot_every = 100, evaluate_every = 250,learning_rate=0.005)

1m 20s (- 25m 29s) (1 5%) 2.3886
2m 39s (- 50m 28s) (1 5%) 1.8313
3m 58s (- 75m 27s) (1 5%) 1.7251
5m 17s (- 100m 26s) (1 5%) 1.6986
6m 36s (- 125m 25s) (1 5%) 1.7282




BLEU Score is 3.6583089830224207
8m 11s (- 155m 41s) (1 5%) 1.6468
9m 30s (- 180m 41s) (1 5%) 1.5996
10m 49s (- 205m 41s) (1 5%) 1.6232
12m 8s (- 230m 41s) (1 5%) 1.6190
13m 27s (- 255m 41s) (1 5%) 1.6218




BLEU Score is 3.603796338835871
15m 3s (- 286m 8s) (1 5%) 1.5904
16m 22s (- 311m 8s) (1 5%) 1.5467
17m 41s (- 336m 7s) (1 5%) 1.5542
19m 0s (- 361m 7s) (1 5%) 1.5194
20m 19s (- 386m 7s) (1 5%) 1.5294




BLEU Score is 1.9247031266697028
21m 55s (- 416m 38s) (1 5%) 1.5158
23m 14s (- 441m 38s) (1 5%) 1.5019
24m 33s (- 466m 38s) (1 5%) 1.4887
25m 52s (- 491m 38s) (1 5%) 1.4559
27m 11s (- 516m 38s) (1 5%) 1.4580




BLEU Score is 3.440815374037176
28m 47s (- 547m 6s) (1 5%) 1.4529
30m 6s (- 572m 5s) (1 5%) 1.4718
31m 25s (- 597m 5s) (1 5%) 1.4155
32m 44s (- 622m 5s) (1 5%) 1.4144
34m 3s (- 647m 5s) (1 5%) 1.4213




BLEU Score is 4.485316400564601
35m 39s (- 677m 26s) (1 5%) 1.4134
36m 58s (- 702m 26s) (1 5%) 1.4099
38m 17s (- 727m 26s) (1 5%) 1.3963
39m 36s (- 752m 26s) (1 5%) 1.4004
40m 55s (- 777m 27s) (1 5%) 1.3959




BLEU Score is 3.3100558327265346
42m 31s (- 807m 49s) (1 5%) 1.3931
43m 49s (- 832m 49s) (1 5%) 1.3746
45m 8s (- 857m 49s) (1 5%) 1.4131
46m 27s (- 882m 48s) (1 5%) 1.3809
47m 46s (- 907m 47s) (1 5%) 1.4008




BLEU Score is 4.601102499723088
49m 22s (- 938m 12s) (1 5%) 1.3923
50m 41s (- 963m 12s) (1 5%) 1.3367
52m 0s (- 988m 13s) (1 5%) 1.3931
53m 19s (- 1013m 13s) (1 5%) 1.3344
54m 38s (- 1038m 13s) (1 5%) 1.3555




BLEU Score is 4.966566277666917
56m 14s (- 1068m 38s) (1 5%) 1.3793
57m 33s (- 1093m 38s) (1 5%) 1.4092
58m 52s (- 1118m 39s) (1 5%) 1.3522
60m 11s (- 1143m 40s) (1 5%) 1.3631
61m 30s (- 1168m 41s) (1 5%) 1.3485




BLEU Score is 4.591230949212597
63m 6s (- 1199m 5s) (1 5%) 1.4000
64m 25s (- 1224m 5s) (1 5%) 1.3559
65m 44s (- 1249m 6s) (1 5%) 1.3584
67m 3s (- 1274m 6s) (1 5%) 1.3698
68m 22s (- 1299m 7s) (1 5%) 1.3923




BLEU Score is 5.38867492677869
69m 58s (- 1329m 36s) (1 5%) 1.3724
71m 17s (- 1354m 36s) (1 5%) 1.3298
72m 36s (- 1379m 36s) (1 5%) 1.3791
73m 55s (- 1404m 36s) (1 5%) 1.3786
75m 14s (- 1429m 36s) (1 5%) 1.3662




BLEU Score is 3.9101886264873937
76m 50s (- 1460m 1s) (1 5%) 1.3726
78m 9s (- 1485m 1s) (1 5%) 1.3332
79m 28s (- 1510m 2s) (1 5%) 1.3363
80m 47s (- 1535m 2s) (1 5%) 1.3928
82m 6s (- 1560m 2s) (1 5%) 1.3506




BLEU Score is 4.903363054785668
83m 42s (- 1590m 25s) (1 5%) 1.3544
85m 1s (- 1615m 26s) (1 5%) 1.3771
86m 20s (- 1640m 26s) (1 5%) 1.3586
87m 39s (- 1665m 26s) (1 5%) 1.3858
88m 58s (- 1690m 26s) (1 5%) 1.3390




BLEU Score is 4.903806339214065
90m 34s (- 1720m 52s) (1 5%) 1.3750
91m 53s (- 1745m 53s) (1 5%) 1.3555
93m 12s (- 1770m 53s) (1 5%) 1.3435
94m 31s (- 1795m 54s) (1 5%) 1.3527
95m 50s (- 1820m 55s) (1 5%) 1.3591




BLEU Score is 4.9597928735611605
97m 26s (- 1851m 20s) (1 5%) 1.3249
98m 45s (- 1876m 20s) (1 5%) 1.3251
100m 4s (- 1901m 21s) (1 5%) 1.3334
101m 23s (- 1926m 21s) (1 5%) 1.3405
102m 42s (- 1951m 21s) (1 5%) 1.3272




BLEU Score is 5.264438162655568
104m 18s (- 1981m 48s) (1 5%) 1.3149
105m 37s (- 2006m 48s) (1 5%) 1.3445
106m 56s (- 2031m 49s) (1 5%) 1.3437
108m 15s (- 2056m 49s) (1 5%) 1.3224
109m 34s (- 2081m 50s) (1 5%) 1.3451




BLEU Score is 5.2975284422504
111m 9s (- 2112m 7s) (1 5%) 1.3588
112m 28s (- 2137m 7s) (1 5%) 1.3463
113m 47s (- 2162m 8s) (1 5%) 1.3330
115m 6s (- 2187m 8s) (1 5%) 1.3435
116m 25s (- 2212m 8s) (1 5%) 1.3450




BLEU Score is 5.794581713506847
118m 29s (- 1066m 27s) (2 10%) 1.2645
119m 48s (- 1078m 18s) (2 10%) 1.2492
121m 7s (- 1090m 8s) (2 10%) 1.2684
122m 26s (- 1101m 59s) (2 10%) 1.2783


In [None]:
hidden_size = 300
encoder1 = EncoderRNN(EMBEDDING_SIZE,hidden_size).to(device)
decoder1 = AttnDecoderRNN(EMBEDDING_SIZE,hidden_size, len(ordered_words_ft_en)).to(device)
epoch_size = 20
folder = './attention_model/GRU_LR001_decay_H300_ES20'
##UNCOMMENT TO TRAIN THE MODEL
trainIters(encoder1, decoder1, epoch_size, folder ,lr_decrease = True,print_every=50,plot_every = 100, evaluate_every = 250,learning_rate=0.001)

In [29]:
score_list, output_words, attentions = evaluate(val_loader, encoder1, decoder1,after_train_mode =True)

NameError: name 'encoder1' is not defined