In [15]:
import os
import io
import json
import torch
import numpy as np
from collections import defaultdict
from torch.utils.data import Dataset
from nltk.tokenize import TweetTokenizer
import collections

In [9]:
data_file = '../data/clean_paraphrases.tsv'

# Collect the first tab-separated column of every line after the first one
# (the first line is skipped; presumably it is a header row -- TODO confirm).
with open(data_file, 'r') as tsv:
    next(tsv)
    q1_data = [row.strip().split('\t')[0] for row in tsv]

In [10]:
len(q1_data)

149263

In [11]:
q1_data[:10]

['astrology : i am a capricorn sun cap moon and cap rising ... what does that say about me ?',
 'how can i be a good geologist ?',
 'how do i read and find my youtube comments ?',
 'what can make physics easy to learn ?',
 'what was your first sexual experience like ?',
 'what would a trump presidency mean for current international master ’ s students on an f1 visa ?',
 'what does manipulation mean ?',
 'why are so many quora users posting questions that are readily answered on google ?',
 'why do rockets look white ?',
 'how should i prepare for ca final law ?']

In [14]:
# Hold out the last N_VALID questions for validation; everything before them
# becomes the training split.
N_VALID = 4000

# open(..., 'w') does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs('data', exist_ok=True)

with open('data/qqp.train.txt', 'w') as f:
    f.writelines(line + '\n' for line in q1_data[:-N_VALID])

with open('data/qqp.valid.txt', 'w') as f:
    f.writelines(line + '\n' for line in q1_data[-N_VALID:])

In [27]:
class QQP(Dataset):
    """Sentence dataset that serves fixed-length, index-encoded sequences.

    For a given ``split`` the raw text is read from ``<data_dir>/qqp.<split>.txt``
    (one sentence per line). The preprocessed sequences are cached in
    ``<data_dir>/qqp.<split>.json`` and the vocabulary in
    ``<data_dir>/qqp.vocab.json``.

    Args:
        data_dir: Directory holding the raw text files and the JSON caches.
        split: Split name used to build the file names. The vocabulary is only
            ever built from the ``'train'`` split; other splits load it.
        create_data: If true, always rebuild the preprocessed file. Otherwise it
            is rebuilt only when it does not exist yet.
        **kwargs: Optional ``max_sequence_length`` (default 20) and ``min_occ``
            (default 3).

    Each item is a dict with ``'input'`` (``<sos>`` + tokens), ``'target'``
    (tokens + ``<eos>``), both padded with ``<pad>`` to ``max_sequence_length``,
    and ``'length'`` (the unpadded length).
    """

    def __init__(self, data_dir, split, create_data, **kwargs):

        super().__init__()
        self.data_dir = data_dir
        self.split = split
        self.max_sequence_length = kwargs.get('max_sequence_length', 20)
        self.min_occ = kwargs.get('min_occ', 3)

        self.raw_data_path = os.path.join(data_dir, 'qqp.'+ split +'.txt')
        self.data_file = 'qqp.'+ split +'.json'
        self.vocab_file = 'qqp.vocab.json'

        preprocessed_path = os.path.join(self.data_dir, self.data_file)

        if create_data:
            print("Creating new %s qqp data."%split.upper())
            self._create_data()

        elif not os.path.exists(preprocessed_path):
            print("%s preprocessed file not found at %s. Creating new."%(split.upper(), preprocessed_path))
            self._create_data()

        else:
            self._load_data()


    def __len__(self):
        """Return the number of preprocessed sequences."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as numpy arrays plus its unpadded length."""
        # self.data always comes from json.load, so its keys are strings.
        idx = str(idx)

        return {
            'input': np.asarray(self.data[idx]['input']),
            'target': np.asarray(self.data[idx]['target']),
            'length': self.data[idx]['length']
        }

    @property
    def vocab_size(self):
        """Number of entries in the word-to-index mapping."""
        return len(self.w2i)

    @property
    def pad_idx(self):
        """Index of the ``<pad>`` token."""
        return self.w2i['<pad>']

    @property
    def sos_idx(self):
        """Index of the ``<sos>`` token."""
        return self.w2i['<sos>']

    @property
    def eos_idx(self):
        """Index of the ``<eos>`` token."""
        return self.w2i['<eos>']

    @property
    def unk_idx(self):
        """Index of the ``<unk>`` token."""
        return self.w2i['<unk>']

    def get_w2i(self):
        """Return the word-to-index mapping."""
        return self.w2i

    def get_i2w(self):
        """Return the index-to-word mapping (keys are strings, as loaded from JSON)."""
        return self.i2w


    def _load_data(self, vocab=True):
        """Load the preprocessed sequences and, if ``vocab`` is true, the vocabulary."""
        # The cache is written as UTF-8 bytes, so read it back as UTF-8 rather
        # than with the locale-dependent default encoding.
        with open(os.path.join(self.data_dir, self.data_file), 'r', encoding='utf-8') as file:
            self.data = json.load(file)
        if vocab:
            self._load_vocab()

    def _load_vocab(self):
        """Load ``w2i`` / ``i2w`` from the vocabulary JSON file."""
        # Written as UTF-8 by _create_vocab; decode it the same way.
        with open(os.path.join(self.data_dir, self.vocab_file), 'r', encoding='utf-8') as vocab_file:
            vocab = json.load(vocab_file)

        self.w2i, self.i2w = vocab['w2i'], vocab['i2w']

    def _encode(self, words):
        """Turn a list of tokens into one padded, index-encoded example."""
        max_len = self.max_sequence_length

        # The decoder input is shifted right by <sos>; the target ends in <eos>.
        inputs = (['<sos>'] + words)[:max_len]
        targets = words[:max_len - 1] + ['<eos>']

        assert len(inputs) == len(targets), "%d, %d"%(len(inputs), len(targets))
        length = len(inputs)

        padding = ['<pad>'] * (max_len - length)
        unk = self.w2i['<unk>']

        return {
            'input': [self.w2i.get(w, unk) for w in inputs + padding],
            'target': [self.w2i.get(w, unk) for w in targets + padding],
            'length': length,
        }

    def _create_data(self):
        """Tokenize the raw split, encode every line and cache the result as JSON."""

        if self.split == 'train':
            self._create_vocab()
        else:
            self._load_vocab()

        tokenizer = TweetTokenizer(preserve_case=False)

        # NOTE(review): the raw text is read with the platform default encoding,
        # as before; confirm it matches how the split files were written.
        with open(self.raw_data_path, 'r') as f:
            data = {index: self._encode(tokenizer.tokenize(line)) for index, line in enumerate(f)}

        with io.open(os.path.join(self.data_dir, self.data_file), 'wb') as data_file:
            serialized = json.dumps(data, ensure_ascii=False)
            data_file.write(serialized.encode('utf8', 'replace'))

        # Reload from disk so that self.data has the same (string-keyed) layout
        # regardless of whether it was just created or loaded from the cache.
        self._load_data(vocab=False)

    def _create_vocab(self):
        """Build the vocabulary from the training text and save it as JSON.

        Words are added in order of decreasing frequency and kept only if they
        occur strictly more than ``min_occ`` times.
        """

        assert self.split == 'train', "Vocablurary can only be created for training file."

        tokenizer = TweetTokenizer(preserve_case=False)

        w2i = dict()
        i2w = dict()

        # Special tokens take the first indices, in this order.
        special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']
        for st in special_tokens:
            i2w[len(w2i)] = st
            w2i[st] = len(w2i)

        wordcounts = collections.Counter()
        # NOTE(review): platform default encoding, same as in _create_data.
        with open(self.raw_data_path, 'r') as f:
            for line in f:
                wordcounts.update(tokenizer.tokenize(line))

        for w, c in wordcounts.most_common():
            if c > self.min_occ and w not in special_tokens:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)

        assert len(w2i) == len(i2w)

        print("Vocablurary of %i keys created." %len(w2i))

        vocab = dict(w2i=w2i, i2w=i2w)
        with io.open(os.path.join(self.data_dir, self.vocab_file), 'wb') as vocab_file:
            serialized = json.dumps(vocab, ensure_ascii=False)
            vocab_file.write(serialized.encode('utf8', 'replace'))

        # Reload so the in-memory mappings match what later runs load from disk.
        self._load_vocab()

In [29]:
# Rebuild the preprocessed training split (create_data=True forces a rebuild).
train_data = QQP(data_dir='data', split='train', create_data=True,
                 max_sequence_length=20, min_occ=3)


Creating new TRAIN qqp data.
Vocablurary of 11042 keys created.


In [30]:
train_data.vocab_size

11042

In [16]:
# Scratch check: the special tokens receive indices 0..3 in list order.
special_tokens = ['<pad>', '<unk>', '<sos>', '<eos>']

w2i = {}
i2w = {}
for position, token in enumerate(special_tokens):
    w2i[token] = position
    i2w[position] = token

In [17]:
i2w

{0: '<pad>', 1: '<unk>', 2: '<sos>', 3: '<eos>'}

In [18]:
w2i

{'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}

In [31]:
import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils

In [32]:
class VanillaVAE(nn.Module):
    """Recurrent sentence VAE: RNN encoder -> Gaussian latent -> RNN decoder.

    Args:
        vocab_size: Number of rows in the embedding table / size of the output
            distribution.
        embedding_size: Dimensionality of the token embeddings.
        rnn_type: ``'rnn'`` or ``'gru'``. Anything else raises ``ValueError``.
        hidden_size: Hidden size of both encoder and decoder RNNs.
        word_dropout: Probability of replacing a decoder input token by
            ``unk_idx`` (``<sos>`` and ``<pad>`` positions are never replaced).
        embedding_dropout: Dropout probability applied to decoder embeddings.
        latent_size: Dimensionality of the latent code ``z``.
        sos_idx, eos_idx, pad_idx, unk_idx: Indices of the special tokens.
        max_sequence_length: Stored on the module; not used by ``forward``.
        num_layers: Number of stacked RNN layers.
        bidirectional: Whether both RNNs are bidirectional.

    Raises:
        ValueError: If ``rnn_type`` is not supported.
    """

    def __init__(self, vocab_size, embedding_size, rnn_type, hidden_size, word_dropout, embedding_dropout, latent_size,
                sos_idx, eos_idx, pad_idx, unk_idx, max_sequence_length, num_layers=1, bidirectional=False):
        super().__init__()
        # Kept as public attributes for existing callers. forward() does not use
        # them: it creates tensors on the device of its inputs/activations.
        self.tensor = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.Tensor
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.max_sequence_length = max_sequence_length
        self.sos_idx = sos_idx
        self.eos_idx = eos_idx
        self.pad_idx = pad_idx
        self.unk_idx = unk_idx

        self.latent_size = latent_size

        self.rnn_type = rnn_type
        self.bidirectional = bidirectional
        self.num_layers = num_layers
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.word_dropout_rate = word_dropout
        self.embedding_dropout = nn.Dropout(p=embedding_dropout)

        # LSTM is deliberately not offered: forward() treats the recurrent
        # state as a single tensor, not an (h, c) tuple.
        if rnn_type == 'rnn':
            rnn = nn.RNN
        elif rnn_type == 'gru':
            rnn = nn.GRU
        else:
            raise ValueError("Unsupported rnn_type %r; expected 'rnn' or 'gru'." % (rnn_type,))

        self.encoder_rnn = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional,
                               batch_first=True)
        self.decoder_rnn = rnn(embedding_size, hidden_size, num_layers=num_layers, bidirectional=self.bidirectional,
                               batch_first=True)

        # Number of (layer, direction) hidden-state slices produced by the RNN.
        self.hidden_factor = (2 if bidirectional else 1) * num_layers

        self.hidden2mean = nn.Linear(hidden_size * self.hidden_factor, latent_size)
        self.hidden2logv = nn.Linear(hidden_size * self.hidden_factor, latent_size)
        self.latent2hidden = nn.Linear(latent_size, hidden_size * self.hidden_factor)
        self.outputs2vocab = nn.Linear(hidden_size * (2 if bidirectional else 1), vocab_size)

    def forward(self, input_sequence, length):
        """Encode, sample a latent code and decode with teacher forcing.

        Args:
            input_sequence: Integer tensor of token indices, batch first
                (``(batch, seq)``).
            length: Integer tensor with the unpadded length of each row.

        Returns:
            Tuple ``(logp, mean, logv, z)``, all in the same row order as
            ``input_sequence``. ``logp`` has shape
            ``(batch, longest length in the batch, vocab_size)`` and holds
            log-probabilities; ``mean``, ``logv`` and ``z`` have shape
            ``(batch, latent_size)``.
        """
        batch_size = input_sequence.shape[0]

        # pack_padded_sequence (with its default enforce_sorted=True) needs the
        # batch sorted by length in descending order.
        sorted_lengths, sorted_idx = torch.sort(length, descending=True)
        # Permutation that restores the caller's original row order.
        _, reversed_idx = torch.sort(sorted_idx)
        input_sequence = input_sequence[sorted_idx]
        packed_lengths = sorted_lengths.tolist()

        # Encoder
        input_embedding = self.embedding(input_sequence)
        packed_input = rnn_utils.pack_padded_sequence(input_embedding, packed_lengths, batch_first=True)
        _, hidden = self.encoder_rnn(packed_input)

        # hidden is (hidden_factor, batch, hidden_size). Move batch to the front
        # before flattening so each row only contains that example's states;
        # a plain .view() would interleave different examples.
        hidden = hidden.transpose(0, 1).contiguous().view(batch_size, self.hidden_size * self.hidden_factor)

        # Reparameterization
        mean = self.hidden2mean(hidden)
        logv = self.hidden2logv(hidden)
        std = torch.exp(0.5 * logv)

        # randn_like puts the noise on the same device/dtype as the activations.
        z = torch.randn_like(std) * std + mean

        # Decoder: map z back to an initial state of shape
        # (hidden_factor, batch, hidden_size), inverting the flattening above.
        hidden = self.latent2hidden(z)
        hidden = hidden.view(batch_size, self.hidden_factor, self.hidden_size).transpose(0, 1).contiguous()

        ## decoder input
        if self.word_dropout_rate > 0:
            # Randomly replace decoder inputs with <unk>. This is not gated on
            # self.training, so it also applies in eval mode.
            prob = torch.rand(input_sequence.shape, device=input_sequence.device)
            # <sos> and <pad> positions get probability 1 so they are never dropped.
            protected = (input_sequence == self.sos_idx) | (input_sequence == self.pad_idx)
            prob[protected] = 1
            decoder_input_sequence = input_sequence.clone()
            decoder_input_sequence[prob < self.word_dropout_rate] = self.unk_idx
            input_embedding = self.embedding(decoder_input_sequence)
        input_embedding = self.embedding_dropout(input_embedding)
        packed_input = rnn_utils.pack_padded_sequence(input_embedding, packed_lengths, batch_first=True)

        ## decoder forward pass
        outputs, _ = self.decoder_rnn(packed_input, hidden)

        ## process outputs
        padded_outputs = rnn_utils.pad_packed_sequence(outputs, batch_first=True)[0]
        padded_outputs = padded_outputs.contiguous()
        padded_outputs = padded_outputs[reversed_idx]
        b, s, _ = padded_outputs.size()

        ## project outputs to vocab
        logp = nn.functional.log_softmax(self.outputs2vocab(padded_outputs.view(-1, padded_outputs.shape[2])), dim=-1)
        logp = logp.view(b, s, self.embedding.num_embeddings)

        # Undo the length sort for the latent statistics too, so every returned
        # tensor lines up with the rows of the caller's batch.
        return logp, mean[reversed_idx], logv[reversed_idx], z[reversed_idx]

In [108]:
# Shape check for a batch-first RNN: 32 sequences of 15 steps with 300 features,
# one layer with hidden size 128.
rnn = nn.RNN(300, 128, 1, batch_first=True)
# Named rnn_input rather than `input` so the builtin input() is not shadowed
# in the notebook's global namespace.
rnn_input = torch.randn(32, 15, 300)
h0 = torch.randn(1, 32, 128)
output, hn = rnn(rnn_input, h0)

In [109]:
output.shape

torch.Size([32, 15, 128])

In [98]:
a = torch.tensor([[1, 4, 56, 32, 43, 2]])
p = torch.rand(a.shape)
p.shape

torch.Size([1, 6])

In [99]:
p

tensor([[0.6438, 0.4026, 0.4299, 0.7945, 0.3941, 0.5233]])

In [100]:
p[(a - 1) * (a - 2) == 0] = 1

In [102]:
p

tensor([[1.0000, 0.4026, 0.4299, 0.7945, 0.3941, 1.0000]])

In [89]:
a[True, False, True] = 1

In [80]:
a

tensor([ 0.0200, -0.0976, -0.3032])

In [65]:
inp = torch.randn(32, 15)

In [69]:
inp.size()

torch.Size([32, 15])

In [33]:
a = torch.randn(2, 3)

In [38]:
a.shape[0]

2

In [125]:
padded_outputs = rnn_utils.pad_packed_sequence(outputs, batch_first=True)[0]

The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.


AttributeError: 'Tensor' object has no attribute 'batch_sizes'

In [115]:
b = torch.LongTensor([[8, 10, 9, 5, 15], [2, 4, 0, 0, 0], [2, 5, 6, 0, 0]])

In [119]:
sorted_lengths, sorted_idx = torch.sort(torch.Tensor([5, 2, 3]), descending=True)

In [120]:
sorted_lengths

tensor([5., 3., 2.])

In [121]:
sorted_idx

tensor([0, 2, 1])

In [123]:
_, id = torch.sort(sorted_idx)

In [124]:
b[id]

tensor([[ 8, 10,  9,  5, 15],
        [ 2,  5,  6,  0,  0],
        [ 2,  4,  0,  0,  0]])

In [48]:
b.shape

torch.Size([5])

In [50]:
sorted_b = b[sorted_idx]

In [51]:
sorted_b

tensor([15, 10,  9,  8,  5])

In [55]:
sorted_lengths.data.tolist()

[15, 10, 9, 8, 5]

In [62]:
c = rnn_utils.pack_padded_sequence(torch.randn(32, 64), list(range(32, 0, -1)), batch_first=True)

In [64]:
c

PackedSequence(data=tensor([ 2.3198e-01,  2.5610e-01, -1.0233e-01, -2.2863e-01,  2.4880e-01,
         3.8480e-01,  1.0268e+00,  1.9357e-02,  1.4502e+00,  1.1072e+00,
        -1.6335e+00, -3.6078e-01, -2.2025e+00,  3.0946e-01, -7.7019e-01,
        -8.3409e-01, -8.7534e-01,  9.0640e-01,  5.1791e-01,  1.5382e+00,
         2.0041e+00,  7.1600e-01,  3.0528e-01, -1.7164e+00, -5.3990e-01,
         7.3630e-01, -1.4546e+00, -2.2681e-01, -1.4949e-01, -7.1822e-01,
        -2.9656e-01, -1.5430e+00, -8.4069e-01,  5.8350e-01, -5.9392e-02,
         9.1901e-01,  1.4210e+00,  2.6298e-01, -1.4229e-01,  3.5958e-01,
         2.9929e-01, -1.5335e+00,  7.2855e-01,  3.2172e-01, -3.6195e-01,
         1.1350e+00,  9.9018e-01,  1.2208e+00,  8.7524e-01,  9.8079e-01,
        -3.6620e-01,  1.2725e+00,  3.5991e-01,  5.9394e-01, -6.1896e-01,
        -3.8697e-01, -9.2111e-01,  5.2107e-01, -2.1829e-01,  1.6189e-01,
         6.5652e-01, -4.9673e-02, -1.9637e+00, -5.2909e-01,  4.2492e-01,
         7.4619e-01, -7.1681e-0