In [1]:
import math
import torch
import random
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F


class Encoder(nn.Module):
    def __init__(self, input_size, embed_size, hidden_size, embeddings,
                 n_layers=1, dropout=0.5):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embed_size = embed_size
        self.embed = nn.Embedding(input_size, embed_size)
        
        if embeddings is not None:
            self.embed.weight.data = torch.Tensor(embeddings)#.cuda()
#         self.embedding.weight.requires_grad = False
            
        self.gru = nn.GRU(embed_size, hidden_size, n_layers,
                          dropout=dropout, bidirectional=True)

    def forward(self, src, hidden=None):
        embedded = self.embed(src)
        outputs, hidden = self.gru(embedded, hidden)
        # sum bidirectional outputs
        outputs = (outputs[:, :, :self.hidden_size] +
                   outputs[:, :, self.hidden_size:])
        return outputs, hidden


class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)  # [B*T*H]
        attn_energies = self.score(h, encoder_outputs)
        return F.softmax(attn_energies, dim=1).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        # [B*T*2H]->[B*T*H]
        energy = self.attn(torch.cat([hidden, encoder_outputs], 2))
        energy = energy.transpose(1, 2)  # [B*H*T]
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)  # [B*1*H]
        energy = torch.bmm(v, energy)  # [B*1*T]
        return energy.squeeze(1)  # [B*T]


class Decoder(nn.Module):
    def __init__(self, embed_size, hidden_size, output_size, embeddings,
                 n_layers=1, dropout=0.2):
        super(Decoder, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embed = nn.Embedding(output_size, embed_size)
        
        if embeddings is not None:
            self.embed.weight.data = torch.Tensor(embeddings)#.cuda()
#         self.embedding.weight.requires_grad = False
        
        
        self.dropout = nn.Dropout(dropout, inplace=True)
        self.attention = Attention(hidden_size)
        self.gru = nn.GRU(hidden_size + embed_size, hidden_size,
                          n_layers, dropout=dropout)
        self.out = nn.Linear(hidden_size * 2, output_size)

    def forward(self, input, last_hidden, encoder_outputs):
        # Get the embedding of the current input word (last output word)
        embedded = self.embed(input).unsqueeze(0)  # (1,B,N)
        embedded = self.dropout(embedded)
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attention(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,N)
        context = context.transpose(0, 1)  # (1,B,N)
        # Combine embedded input word and attended context, run through RNN
        rnn_input = torch.cat([embedded, context], 2)
        output, hidden = self.gru(rnn_input, last_hidden)
        output = output.squeeze(0)  # (1,B,N) -> (B,N)
        context = context.squeeze(0)
        output = self.out(torch.cat([output, context], 1))
        output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights


class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        print(src)
        batch_size = src.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.output_size
        outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()

        encoder_output, hidden = self.encoder(src)
        hidden = hidden[:self.decoder.n_layers]
        output = Variable(trg.data[0, :])  # sos
        for t in range(1, max_len):
            output, hidden, attn_weights = self.decoder(
                    output, hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < teacher_forcing_ratio
            top1 = output.data.max(1)[1]
            output = Variable(trg.data[t] if is_teacher else top1).cuda()
        return outputs

    
class DoubleTranslator(nn.Module):
    def __init__(self, common_encoder, first_lang_decoder, second_lang_decoder):
        super(DoubleTranslator, self).__init__()
        self.common_encoder = common_encoder
        self.first_lang_decoder = first_lang_decoder
        self.second_lang_decoder = second_lang_decoder
        
        self.is_from_first_lang_to_second = True
        
    def set_is_from_first_lang_to_second(self, value):
        self.is_from_first_lang_to_second = value

    def forward_one_lang(self, src, trg, teacher_forcing_ratio=0.5, is_first_lang = True):
        batch_size = src.size(1)
        max_len = trg.size(0)
        vocab_size = self.decoder.output_size
        outputs = Variable(torch.zeros(max_len, batch_size, vocab_size)).cuda()

        decoder = self.first_lang_decoder if is_first_lang else self.second_lang_decoder
        
        encoder_output, hidden = self.common_encoder(src)
        hidden = hidden[:self.decoder.n_layers]
        output = Variable(trg.data[0, :])  # sos
        for t in range(1, max_len):
            output, hidden, attn_weights = self.decoder(output, hidden, encoder_output)
            outputs[t] = output
            is_teacher = random.random() < teacher_forcing_ratio
            top1 = output.data.max(1)[1]
            output = Variable(trg.data[t] if is_teacher else top1).cuda()
            
        return outputs
    
    
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        output_first_lang = self.forward_one_lang(src, trg, teacher_forcing_ratio, is_from_first_lang_to_second)
        return self.forward_one_lang(src, trg, teacher_forcing_ratio, not is_from_first_lang_to_second)
    
    
    

In [2]:
import os
import math
import argparse
import torch
from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm
from torch.nn import functional as F
# import model #import *#Encoder, Decoder, Seq2Seq, DoubleTranslator
from torchtext import datasets, data

from torchtext.data import Field, BucketIterator

import numpy as np

with open('glove/glove.6B.50d.txt', 'rt') as emb_file:
    en_emb_plain = emb_file.readlines()

    
def get_vocab(emb_plain):
    result = {}
    for i, line in enumerate(en_emb_plain):
#         if i > 4:
#             break
        word, vector = line.split(' ', 1)
        result[word] = len(result)  
    
    return result

en_vocab = get_vocab(en_emb_plain)

def get_embeddings(emb_plain, emb_size):
    result = np.ndarray((len(emb_plain), emb_size), dtype='float32')
    for i, line in enumerate(en_emb_plain):
#         if i > 4:
#             break
        word, vector = line.split(' ', 1)
        result[i] = vector.split()  
    return result

    
#     max_rank = max(lex.rank for lex in vocab if lex.has_vector)
#     vectors = np.ndarray((max_rank+1, vocab.vectors_length), dtype='float32')
#     for lex in vocab:
#         if lex.has_vector:
#             vectors[lex.rank] = lex.vector
#     return vectors

get_embeddings(en_emb_plain, 50)
    

def ugly_swap(array, vocab, percent_of_swaps = 0.5):
    assert percent_of_swaps < 1
    result = array.copy()
    count_of_swaps = (int)(percent_of_swaps * len(array))
    
    indeces = np.random.randint(array.shape[0], size=(count_of_swaps, 2))
    
    result = np.array([vocab.get(word, 1) for word in array])
    
    for index_pair in indeces:
        result[index_pair[0]], result[index_pair[1]] = result[index_pair[1]], result[index_pair[0]]
    return result

# ugly_swap(np.array([1,2,3,4, 5,6]), 0)

def load_mono_dataset(filename):
    EN = Field(include_lengths=True, init_token='<sos>', eos_token='<eos>',
               use_vocab = False, preprocessing=lambda sent : ugly_swap(np.array(sent), en_vocab, percent_of_swaps=0))

    train = datasets.TranslationDataset('./', exts=(filename, filename), fields=(EN, EN))

    # DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.scr, max_size=10000)

    return train.examples, EN

def parse_arguments():
    p = argparse.ArgumentParser(description='Hyperparams')
    p.add_argument('-epochs', type=int, default=100,
                   help='number of epochs for train')
    p.add_argument('-batch_size', type=int, default=32,
                   help='number of epochs for train')
    p.add_argument('-lr', type=float, default=0.0001,
                   help='initial learning rate')
    p.add_argument('-grad_clip', type=float, default=10.0,
                   help='initial learning rate')
    return p.parse_args()


def evaluate(model, val_iter, vocab_size, first_lang_field, second_lang_field):
    model.eval()
    pad = second_lang_field.vocab.stoi['<pad>']
    total_loss = 0
    for b, batch in enumerate(val_iter):
        src = batch.src
        trg = batch.trg
        src = Variable(src.data.cuda(), volatile=True)
        trg = Variable(trg.data.cuda(), volatile=True)
        output = model(src, trg)
        loss = F.cross_entropy(output[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        total_loss += loss.data[0]
    return total_loss / len(val_iter)


def train(e, model, optimizer, train_iter, vocab_size, grad_clip, first_lang_field, second_lang_field):
    model.train()
    total_loss = 0
    pad = second_lang_field.vocab.stoi['<pad>']
    for b, batch in enumerate(train_iter):
        src = batch.src

        print('batch.src')
        print(batch.src)

        src = torch.from_numpy(src)

        trg = batch.trg

        trg = torch.from_numpy(batch.trg)
        
#         src = Variable(src.cuda(), volatile=True)
#         trg = Variable(trg.cuda(), volatile=True)
        
        src, trg = src.view(1,-1).cuda(), trg.view(1,-1).cuda()
        optimizer.zero_grad()
        output = model(src, trg)
        loss = F.cross_entropy(output[1:].view(-1, vocab_size),
                               trg[1:].contiguous().view(-1),
                               ignore_index=pad)
        loss.backward()
        clip_grad_norm(model.parameters(), grad_clip)
        optimizer.step()
        total_loss += loss.data[0]

        if b % 100 == 0 and b != 0:
            total_loss = total_loss / 100
            print("[%d][loss:%5.2f][pp:%5.2f]" %
                  (b, total_loss, math.exp(total_loss)))
            total_loss = 0


def main():
#     args = parse_arguments()

    epochs = 100
    batch_size = 1
    lr = 0.0001
    grad_clip = 10.0
    
    hidden_size = 512
    embed_size = 256
    assert torch.cuda.is_available()

    print("[!] preparing dataset...")
    
    shuffled_train_iter, EN = load_mono_dataset('corpus1_cutted.txt')
#     train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size)
#     shuffled_train_iter, shuffled_val_iter, shuffled_test_iter, DE, EN = load_dataset(batch_size)
    
#     de_size, en_size = len(DE.vocab), len(EN.vocab)
    
    print("[TRAIN]:%d" % len(shuffled_train_iter))
    
#     print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)"
#           % (len(train_iter), len(train_iter.dataset),
#              len(test_iter), len(test_iter.dataset)))
#     print("[DE_vocab]:%d [en_vocab]:%d" % (de_size, en_size))

    print("[!] Instantiating models...")
    
    en_size = len(en_emb_plain)
    
    embeddings = get_embeddings(en_emb_plain, 50)
    common_encoder = Encoder(en_size, embed_size, hidden_size, embeddings, n_layers=2, dropout=0.5)#, embeddings = embeddings)
    first_lang_decoder = Decoder(embed_size, hidden_size, en_size, embeddings, n_layers=1, dropout=0.5)#, embeddings = embeddings)
#     second_lang_decoder = Decoder(embed_size, hidden_size, en_size, n_layers=1, dropout=0.5)

    first_lang_seq2seq = Seq2Seq(common_encoder, first_lang_decoder).cuda()
#     second_lang_seq2seq = Seq2Seq(common_encoder, second_lang_decoder).cuda()
#     double_translator = DoubleTranslator(common_encoder, first_lang_decoder, second_lang_decoder).cuda()

    first_lang_optimizer = optim.Adam(first_lang_seq2seq.parameters(), lr=lr)
#     second_lang_optimizer = optim.Adam(second_lang_seq2seq.parameters(), lr=lr)
#     decoder_optimizer = optim.Adam(double_translator.parameters(), lr=lr)

#     print(first_lang_seq2seq)
#     print(second_lang_seq2seq)
#     print(double_translator)

    best_val_loss = None
    for e in range(1, epochs+1):
        train(e, first_lang_seq2seq, first_lang_optimizer, shuffled_train_iter, en_size, grad_clip, EN, EN)
#         train(e, second_lang_seq2seq, second_lang_optimizer, shuffled_train_iter, en_size, grad_clip, DE, DE)
#         train(e, double_translator, decoder_optimizer, shuffled_train_iter, en_size, grad_clip, DE, DE)
        
    # TODO: use val_iter here
        first_lang_val_loss = evaluate(first_lang_seq2seq, shuffled_train_iter, en_size, EN, EN)
#         second_lang_val_loss = evaluate(second_lang_seq2seq, val_iter, en_size, DE, DE)
#         double_trans_val_loss = evaluate(double_translator, val_iter, en_size, DE, DE)

        val_loss = first_lang_val_loss # + second_lang_val_loss + double_trans_val_loss
        
        print("[Epoch:%d] val_loss:%5.3f | val_pp:%5.2fS" % (e, val_loss, math.exp(val_loss)))

        # Save the model if the validation loss is the best we've seen so far.
        if not best_val_loss or val_loss < best_val_loss:
            print("[!] saving model...")
            if not os.path.isdir(".save"):
                os.makedirs(".save")
            torch.save(first_lang_seq2seq.state_dict(), './.save/seq2seq_%d.pt' % (e))
            best_val_loss = val_loss
    test_loss = evaluate(first_lang_seq2seq, test_iter, en_size, DE, EN)
    print("[TEST] loss:%5.2f" % test_loss)


main()


[!] preparing dataset...
[TRAIN]:1000
[!] Instantiating models...
batch.src
[    1     1  2582 74426     1]

     1      1   2582  74426      1
[torch.cuda.LongTensor of size 1x5 (GPU 0)]



RuntimeError: save_for_backward can only save input or output tensors, but argument 0 doesn't satisfy this condition