In [1]:
import re
import spacy
from torchtext.data import Field, BucketIterator
from torchtext.datasets import Multi30k
from torchtext import datasets, data

import math
import torch
import random
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F

import os
import math
import argparse
import torch
from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm
from torch.nn import functional as F
import numpy as np
import copy


In [2]:
SOS_TOKEN = '<sos>' # XXX : it must be in vocab!
EOS_TOKEN = '<eos>'
PAD_INDEX = 13 # TODO
UNK_INDEX = 1 # TODO

MAX_SENS_LENGTH = 20

def load_dataset_old(batch_size, filename):
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    EN = Field(tokenize=tokenize_en, include_lengths=False, init_token=SOS_TOKEN, eos_token=EOS_TOKEN)
    
    train = datasets.TranslationDataset('./', exts=(filename, filename), fields=(EN, EN))
    EN.build_vocab(train.trg, max_size=10000)
    
    train_iter, test_iter  = BucketIterator.splits((train, train), batch_size=batch_size, repeat=False)
    return train_iter

def load_dataset(batch_size, filename):
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    with open(filename, 'rt') as input_file:
        inputs = input_file.readlines()
    
    inputs = [[SOS_TOKEN] +  tokenize_en(_) + [EOS_TOKEN] for _ in inputs]
    inputs = [_ for _ in inputs if len(_) < MAX_SENS_LENGTH]
    return inputs


In [3]:
class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, embeddings = None, n_layers = 1, dropout = 0.5):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        
        self.embed_size = embed_size
        self.embed = nn.Embedding(input_size, embed_size)
        if embeddings is not None:
            self.embed.weight.data = torch.Tensor(embeddings)#.cuda() # TODO : need cuda here?
            
        # preparation for freeze
        self.embed.weight.requires_grad = False
            
        self.gru = nn.GRU(embed_size, hidden_size, n_layers, dropout = dropout, bidirectional = True)

    def forward(self, src, hidden=None):

        embedded = self.embed(src)
        outputs, hidden = self.gru(embedded, hidden)
        # sum bidirectional outputs
        outputs = (outputs[:, :, :self.hidden_size] +
                   outputs[:, :, self.hidden_size:])
        return outputs, hidden


class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # [B*T*2H]->[B*T*H]
        energy = self.attn(torch.cat([hidden, encoder_outputs], 2))
        energy = energy.transpose(1, 2)  # [B*H*T]
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
        energy = torch.bmm(v, energy)  # [B*1*T]
        return energy.squeeze(1)  # [B*T]


class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, output_size, embeddings = None, n_layers = 1, dropout = 0.2):
        super(Decoder, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embed = nn.Embedding(output_size, embed_size)
        
        if embeddings is not None:
            self.embed.weight.data = torch.Tensor(embeddings)#.cuda() # TODO : need cuda here?
            
        # preparation for freeze
        self.embed.weight.requires_grad = False
        
        self.dropout = nn.Dropout(dropout, inplace=True)
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(hidden_size + embed_size, hidden_size, n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input, last_hidden, encoder_outputs):
        # Get the embedding of the current input word (last output word)
        embedded = self.embed(input).unsqueeze(0)  # (1,B,N)
        embedded = self.dropout(embedded)
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attention(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,N)
        context = context.transpose(0, 1)  # (1,B,N)
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat([embedded, context], 2)
        output, hidden = self.gru(rnn_input, last_hidden)
        output = output.squeeze(0)  # (1,B,N) -> (B,N)
        context = context.squeeze(0)
        output = self.out(torch.cat([output, context], 1))
        output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.output_size
        outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()

        encoder_output, hidden = self.encoder(src)
        hidden = hidden[:self.decoder.n_layers]
        output = Variable(trg.data[0, :])  # sos
        for t in range(1, max_len):
            output, hidden, attn_weights = self.decoder(
                    output, hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < teacher_forcing_ratio
            top1 = output.data.max(1)[1] # TODO : beam search here
            output = Variable(trg.data[t] if is_teacher else top1).cuda()
        return outputs

    
class DoubleTranslator(nn.Module):
    def __init__(self, common_encoder, first_lang_decoder, second_lang_decoder):
        super(DoubleTranslator, self).__init__()
        self.common_encoder = common_encoder
        self.first_lang_decoder = first_lang_decoder
        self.second_lang_decoder = second_lang_decoder
        self.is_from_first_lang_to_second = True
        
    def set_is_from_first_lang_to_second(self, value):
        self.is_from_first_lang_to_second = value

    def get_is_from_first_lang_to_second(self):
        return self.is_from_first_lang_to_second
    
    def forward_one_lang(self, src, trg, teacher_forcing_ratio=0.5, is_first_lang = True):
        batch_size = src.size(1)
        max_len = trg.size(0)

        # if is_first_lang: en_word -> encoder -> ru_decoder -> encoder -> en_decoder -> en_word
        decoder = self.first_lang_decoder if not is_first_lang else self.second_lang_decoder
        
        vocab_size = decoder.output_size
        outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()
        
        encoder_output, hidden = self.common_encoder(src)
        hidden = hidden[:decoder.n_layers]
        output = Variable(trg.data[0, :])  # sos
        for t in range(1, max_len):
            output, hidden, attn_weights = decoder(output, hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < teacher_forcing_ratio
            top1 = output.data.max(1)[1] # TODO : beam search here
            output = Variable(trg.data[t] if is_teacher else top1).cuda()
            
        return outputs
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        output_first_lang = self.forward_one_lang(src, trg, teacher_forcing_ratio, self.get_is_from_first_lang_to_second())
        return self.forward_one_lang(src, trg, teacher_forcing_ratio, not self.get_is_from_first_lang_to_second())
    

In [4]:
def batch(iterable, batch_size = 1):
    all_length = len(iterable)
    for ndx in range(0, all_length, batch_size):
        yield iterable[ndx:min(ndx + batch_size, all_length)]

def get_vocab(emb_plain):
    result = {}
    for i, line in enumerate(emb_plain):
        word, vector = line.split(' ', 1)
        result[word] = len(result)  

    return result

def get_embeddings(emb_plain, emb_size):
    result = np.ndarray((len(emb_plain), emb_size), dtype='float32')
    for i, line in enumerate(emb_plain):
        word, vector = line.split(' ', 1)
        result[i] = vector.split()  
    return result
        
def evaluate(model, val_iter, vocab_size, vocab):
    model.eval()
    pad = PAD_INDEX
    total_loss = 0
    for b, batch in enumerate(val_iter):
        src = batch.src
        trg = batch.trg
        src = Variable(src.data.cuda(), volatile=True)
        trg = Variable(trg.data.cuda(), volatile=True)
        output = model(src, trg)
        loss = F.cross_entropy(output[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        total_loss += loss.data[0]
    return total_loss / len(val_iter)

class TrainData:
    def __init__(self, model, train_iter, vocab_size, vocab):
        self.model = model
        self.train_iter = train_iter
        self.vocab_size = vocab_size
        self.vocab = vocab

def train(e, train_data1, train_data2, grad_clip, batch_size, optimizer):
    train_data1.model.train()
    train_data2.model.train()
    
    total_loss = 0
    pad = PAD_INDEX
    
    common_length = min(len(train_data1.train_iter), len(train_data2.train_iter))
    common_data = list(zip(train_data1.train_iter[:common_length], train_data2.train_iter[:common_length]))
    
    def get_loss(sents, train_data1):
        max_length_in_batch1 = max([len(s) for s in sents1])
        
        t1 = np.array([sents2inds(sent, max_length_in_batch1, train_data1.vocab) for sent in sents1])
        src1 = Variable(torch.from_numpy(t1.T))
        trg1 = Variable(torch.from_numpy(t1.T))
        src1, trg1 = src1.cuda(), trg1.cuda()
        
        optimizer.zero_grad()
        output1 = train_data1.model(src1, trg1)

        loss1 = F.cross_entropy(output1[1:].view(-1, train_data1.vocab_size),
                               trg1[1:].contiguous().view(-1),
                               ignore_index=pad)
        return loss1
    
    for b, sents in enumerate(batch(common_data, batch_size=batch_size)):
        sents1 = [s[0] for s in sents]
        sents2 = [s[1] for s in sents]

        optimizer.zero_grad()

        loss = get_loss(sents1, train_data1) + get_loss(sents2, train_data2) 
        loss.backward()
        
        clip_grad_norm(train_data1.model.parameters(), grad_clip)
        clip_grad_norm(train_data2.model.parameters(), grad_clip)
        
        optimizer.step()
        total_loss += loss.data[0]

        if b % 100 == 0 and b != 0:
            total_loss = total_loss / 100
            print("[%d][loss:%5.2f][pp:%5.2f]" %
                  (b, total_loss, math.exp(total_loss)))
            total_loss = 0


def train_trans(e, model, optimizer, train_iter,
                    vocab_size, grad_clip, vocab, batch_size):
    model.train()
    total_loss = 0
    pad = PAD_INDEX
    
    for b, sents in enumerate(batch(train_iter, batch_size=batch_size)):
        max_length_in_batch = max([len(s) for s in sents])
        
        t = np.array([sents2inds(sent, max_length_in_batch, vocab) for sent in sents])
        src = Variable(torch.from_numpy(t.T))
        trg = Variable(torch.from_numpy(t.T))
        
        src, trg = src.cuda(), trg.cuda()
        optimizer.zero_grad()
        output = model(src, trg)
        loss = F.cross_entropy(output[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()
        total_loss += loss.data[0]

        if b % 100 == 0 and b != 0:
            total_loss = total_loss / 100
            print("[%d][loss:%5.2f][pp:%5.2f]" %
                  (b, total_loss, math.exp(total_loss)))
            total_loss = 0

            
def create_ind2word(vocab):
    return {item[1]:item[0] for item in vocab.items()}
            
def create_emb_and_dict_by_emb_and_dict(vocab, embeddings, ind2word, dict_plain):
    result_emb = []
    result_dict = {}
    
    result_emb2 = []
    result_dict2 = {}
    
    res_embedding_inds = []
    
    # XXX : first 3 items just copy : SOS EOS ...
#     count = 3
#     for i in range(count):
#         en_word = ind2word.get(i, -1)
#         if en_word == -1:
#             continue
            
#         result_emb.append(embeddings[i])
#         result_dict[en_word] = len(result_dict)
        
#         result_emb2.append(embeddings[i])
#         result_dict2[en_word] = len(result_dict)
    
    for word_pair in dict_plain:
        en_word, ru_word = word_pair.split()
        
        if en_word not in vocab or ru_word in result_emb:
            continue
        
        if en_word in result_dict2:
            result_dict[ru_word] = result_dict2[en_word]
            continue
        
        res_embedding_inds.append(vocab[en_word])
        result_dict[ru_word] = len(result_dict2)
        result_dict2[en_word] = len(result_dict2)
        
    result_emb = embeddings[res_embedding_inds]
    return result_dict, result_emb, result_dict2, result_emb

def sents2inds(sents, max_length, vocab):
    pad_ind = PAD_INDEX
    return np.pad(np.array([vocab.get(i, UNK_INDEX) for i in sents]),
                (0, max_length - len(sents)),  mode='constant', constant_values=(pad_ind))
    
# def main():
if True:    
    with open('glove/glove.6B.50d.txt', 'rt') as emb_file:
        en_emb_plain = emb_file.readlines()

    with open('en-ru.txt', 'rt') as dict_file:
        dict_en_ru_plain = dict_file.readlines()
        
    en_vocab = get_vocab(en_emb_plain)
    en_embeddings = get_embeddings(en_emb_plain, 50)
    en_ind2word = create_ind2word(en_vocab)
    
    ru_vocab, ru_embeddings, en_vocab, en_embeddings = create_emb_and_dict_by_emb_and_dict(en_vocab, en_embeddings, en_ind2word, dict_en_ru_plain)
#     ru_embeddings = en_embeddings = None

    en_size = len(en_vocab)
    ru_size = len(ru_vocab)
    
    common_size = min(en_size, ru_size)
    
    print("[ru_vocab]:%d [en_vocab]:%d [common_size]:%d" % (ru_size, en_size, common_size))

    en_size = common_size
    ru_size = common_size

    
    epochs = 10
    grad_clip = 10.0
    lr = 0.0001
    en_size = len(en_emb_plain)
    
    batch_size = 5
    hidden_size = 64
    embed_size = 50
    assert torch.cuda.is_available()

    print("[!] preparing dataset...")
    
    first_lang_train_iter = load_dataset(batch_size, 'corpus1_10k.txt')
    second_lang_train_iter = load_dataset(batch_size, 'corpus2_10k.txt')
            

    print("[!] Instantiating models...")

    common_encoder = Encoder(en_size, embed_size, hidden_size, copy.deepcopy(en_embeddings), n_layers=2, dropout=0.5)
    
    # TODO : with None instead of en_embeddings it works
    first_lang_decoder = Decoder(embed_size, hidden_size, en_size, en_embeddings, n_layers=1, dropout=0.5)
    second_lang_decoder = Decoder(embed_size, hidden_size, ru_size, en_embeddings, n_layers=1, dropout=0.5)

    first_lang_seq2seq = Seq2Seq(common_encoder, first_lang_decoder).cuda()
    second_lang_seq2seq = Seq2Seq(common_encoder, second_lang_decoder).cuda()
    double_translator = DoubleTranslator(common_encoder, first_lang_decoder, second_lang_decoder).cuda()

#     first_lang_optimizer = optim.Adam(first_lang_seq2seq.parameters(), lr=lr)
#     second_lang_optimizer = optim.Adam(second_lang_seq2seq.parameters(), lr=lr)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, double_translator.parameters()), lr=lr)

#     print(first_lang_seq2seq)
#     print(second_lang_seq2seq)
#     print(double_translator)


    first_train_data = TrainData(first_lang_seq2seq, first_lang_train_iter, en_size, en_vocab)
    second_train_data = TrainData(second_lang_seq2seq, second_lang_train_iter, ru_size, ru_vocab)

#     first_train_data = second_train_data,
    best_val_loss = None
    for e in range(1, epochs+1):
        train(e, first_train_data, second_train_data, grad_clip, batch_size, optimizer)

#         train(e, second_lang_seq2seq, second_lang_optimizer, second_lang_train_iter, ru_size, grad_clip, ru_vocab, batch_size)
#         train_trans(e, double_translator, decoder_optimizer, first_lang_train_iter,
#                     en_size, grad_clip, en_vocab, batch_size)
        
    # TODO: use val_iter here
#         first_lang_val_loss = evaluate(first_lang_seq2seq, shuffled_train_iter, en_size, en_vocab)
#         second_lang_val_loss = evaluate(second_lang_seq2seq, val_iter, en_size, DE, DE)
#         double_trans_val_loss = evaluate(double_translator, val_iter, en_size, DE, DE)

#         val_loss = first_lang_val_loss + second_lang_val_loss + double_trans_val_loss
        
        print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2fS" % (e, val_loss, math.exp(val_loss)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            print("[!] saving model...")
            if not os.path.isdir(".save"):
                os.makedirs(".save")
            torch.save(first_lang_seq2seq.state_dict(), './.save/seq2seq_%d.pt' % (e))
            best_val_loss = val_loss
    test_loss = evaluate(first_lang_seq2seq, test_iter, en_size)
    print("[TEST] loss:%5.2f" % test_loss)



# main()


1
2
[ru_vocab]:44323 [en_vocab]:41976 [common_size]:41976
[!] preparing dataset...
[!] Instantiating models...


RuntimeError: cuda runtime error (59) : device-side assert triggered at /pytorch/torch/lib/THC/THCTensorCopy.cu:204