In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
def loads():
    """Placeholder hook: takes no arguments, performs no work, returns None."""
    return None
import numpy as np
import os
import json
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import torchtext

# True when torch reports a usable CUDA device.
# NOTE(review): use_cuda is not read again in the visible cells.
use_cuda = torch.cuda.is_available()
from process_data import save_pickle, load_pickle, load_task, load_glove_weights
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway
from config import Config
# loads() is defined at the top of this cell with a body of `pass`, so this call has no effect.
loads()

In [7]:
# --- Load the samples ---------------------------------------------------------
# Each sample unpacks (see the loop below) as (ctx_w, ctx_c, q_id, q_w, q_c, answer).
train_data = load_task('./dataset/train-v1.1.json')
# dev_data = load_task('./dataset/dev-v1.1.json')
data = train_data  # + dev_data
# save_pickle(train_data, 'pickle/train_data.pickle')
# save_pickle(dev_data, 'pickle/dev_data.pickle')

# --- Collect the word-level and character-level vocabularies --------------------
vocab_w, vocab_c = set(), set()
for ctx_w, ctx_c, q_id, q_w, q_c, answer in data:
    # Words come from the context, the question and the answer.
    vocab_w.update(ctx_w, q_w, answer)
    # Characters come from the context and the question only.
    flatten_c = [c for chars in ctx_c for c in chars]
    flatten_q = [c for chars in q_c for c in chars]
    vocab_c.update(flatten_c, flatten_q)  # TODO

# Sorting makes the token -> index assignment deterministic across runs.
vocab_w = sorted(vocab_w)
vocab_c = sorted(vocab_c)

# Forward (token -> index) and reverse (index -> token) tables, both 0-based.
# NOTE(review): index 0 therefore belongs to a real token, while
# make_word_vector / make_char_vector also pad with 0 - confirm this overlap
# is intended.
w2i_w = {w: i for i, w in enumerate(vocab_w)}
i2w_w = dict(enumerate(vocab_w))
w2i_c = {c: i for i, c in enumerate(vocab_c)}
i2w_c = dict(enumerate(vocab_c))
# Pickle round-trip kept for reference; `vocab`, `w2i` and `i2w` are not
# defined in this cell, so these lines need renaming before being re-enabled.
# save_pickle(vocab, 'pickle/vocab.pickle')
# save_pickle(w2i, 'pickle/w2i.pickle')
# save_pickle(i2w, 'pickle/i2w.pickle')
# train_data = load_pickle('pickle/train_data.pickle')
# vocab = load_pickle('pickle/vocab.pickle')
# w2i = load_pickle('pickle/w2i.pickle')

vocab_size_w, vocab_size_c = len(vocab_w), len(vocab_c)

# --- Longest sequences (words per text, characters per word) --------------------
ctx_sent_maxlen = max(len(sample[0]) for sample in data)
query_sent_maxlen = max(len(sample[3]) for sample in data)
ctx_word_maxlen = max(len(w) for sample in data for w in sample[1])
query_word_maxlen = max(len(w) for sample in data for w in sample[4])

# --- Summary ----------------------------------------------------------------------
print('----')
print('n_train', len(train_data))
# print('n_dev', len(dev_data))
print('vocab_size_w:', vocab_size_w)
print('vocab_size_c:', vocab_size_c)
print('ctx_sent_maxlen:', ctx_sent_maxlen)
print('query_sent_maxlen:', query_sent_maxlen)
print('ctx_word_maxlen:', ctx_word_maxlen)
print('query_word_maxlen:', query_word_maxlen)

dataset version: 1.1
load_task: 0 / 442
----
n_train 269
vocab_size_w: 2783
vocab_size_c: 89
ctx_sent_maxlen: 333
query_sent_maxlen: 25
ctx_word_maxlen: 22
query_word_maxlen: 14


In [10]:
# Embedding dimensionality: passed to load_glove_weights and stored under 'embd_size' in the model args.
embd_size = 100

In [11]:
# Wrap the numpy array returned by the project helper load_glove_weights as a torch tensor;
# it is later handed to the model config under the key 'pre_embd_w'.
glove_embd_w = torch.from_numpy(load_glove_weights('./dataset', embd_size, vocab_size_w, w2i_w))

Found 400000 word vectors.
embed_matrix.shape (2783, 100)


In [93]:
from layers.char_embedding import CharEmbedding
from layers.word_embedding import WordEmbedding
from layers.highway import Highway

# Settings handed to the embedding layers; Config receives them as keyword arguments.
args = dict(
    embd_size=embd_size,
    vocab_size_c=vocab_size_c,
    vocab_size_w=vocab_size_w,
    pre_embd_w=glove_embd_w,
    filters=[[1, 5]],  # char embedding
    out_chs=100,  # char embedding
)
conf = Config(**args)
def make_word_vector(data, w2i_w, query_len):
    """Turn tokenised sentences into a fixed-width matrix of word indices.

    Args:
        data: iterable of sentences, each an iterable of tokens.
        w2i_w: mapping from token to integer index. Every token of every
            sentence is looked up, so a missing token raises KeyError.
        query_len: number of columns per row. Shorter sentences are
            right-padded with 0; longer ones are cut off.

    Returns:
        A Variable wrapping a LongTensor with one row per sentence.
    """
    rows = []
    for tokens in data:
        # Look up the whole sentence first, then clip to the target width.
        ids = [w2i_w[tok] for tok in tokens][:query_len]
        # A non-positive multiplier yields [], so full-width rows get no padding.
        ids.extend([0] * (query_len - len(ids)))
        rows.append(ids)
    return Variable(torch.LongTensor(rows))

def make_char_vector(data, w2i_c, query_len, word_len):
    """Turn character-split sentences into a fixed-shape tensor of char indices.

    Args:
        data: sequence of sentences; each sentence is a sequence of words and
            each word a sequence of characters.
        w2i_c: mapping from character to integer index. A character that is
            written into the tensor but missing from the mapping raises
            KeyError.
        query_len: size of the word axis. Sentences with more words are
            clipped to this many.
        word_len: size of the character axis. Words with more characters are
            clipped to this many.

    Returns:
        A Variable wrapping a LongTensor of shape
        (len(data), query_len, word_len); positions that are not filled keep
        the value 0.
    """
    tmp = torch.zeros(len(data), query_len, word_len).type(torch.LongTensor)
    for i, words in enumerate(data):
        # Clip over-long input instead of letting the assignment below raise
        # IndexError; this mirrors the truncation done by make_word_vector.
        for j, word in enumerate(words[:query_len]):
            for k, ch in enumerate(word[:word_len]):
                tmp[i][j][k] = w2i_c[ch]
    return Variable(tmp)
    
class AttentionNet(nn.Module):
    """Embedding front-end that joins char-level and word-level embeddings.

    Args:
        args: configuration object forwarded unchanged to both CharEmbedding
            and WordEmbedding.
    """

    def __init__(self, args):
        super(AttentionNet, self).__init__()

        self.char_embd_net = CharEmbedding(args)
        self.word_embd_net = WordEmbedding(args)

    def forward(self, x_c, x_w):
        """Embed both inputs and concatenate the results along dim 2.

        Args:
            x_c: input passed to the char embedding sub-module.
            x_w: input passed to the word embedding sub-module.

        Returns:
            The concatenation of the two embeddings along dim 2. This value
            used to be computed and then dropped, so calling the module
            always produced None.
        """
        # Shape notes and size prints below are kept from the original code.
        char_embd = self.char_embd_net(x_c) # (N, seq_len, embd_size)
        print('char_embd', char_embd.size())
        word_embd = self.word_embd_net(x_w) # (N, seq_len, embd_size)
        print('word_embd', word_embd.size())
        embd = torch.cat((char_embd, word_embd), 2) # (N, seq_len, embd_size*2)
        print('embd', embd.size())
        return embd
        
        
def train(model, n_epoch=1, batch_size=16, max_batches=1):
    """Push question batches from the module-level `data` through `model`.

    Only the forward call is made; no loss is computed and no parameters are
    updated here.

    Args:
        model: callable invoked as model(char_var, word_var).
        n_epoch: number of passes over `data`.
        batch_size: number of samples per batch; the final batch of a pass
            may be shorter.
        max_batches: upper bound on batches processed per epoch. The default
            of 1 reproduces the previous hard-coded stop after the first
            batch; pass None to process every batch.
    """
    for epoch in range(n_epoch):
        # Step over the whole dataset so the trailing (possibly shorter) batch
        # is included. The previous bound `len(data)-batch_size` skipped it and
        # produced no batch at all when len(data) <= batch_size.
        for n_done, i in enumerate(range(0, len(data), batch_size)): # TODO shuffle
            if max_batches is not None and n_done >= max_batches:
                break
            batch_data = data[i:i+batch_size]
            q = [d[3] for d in batch_data]   # question words
            qc = [d[4] for d in batch_data]  # question characters, per word
            word_var = make_word_vector(q, w2i_w, query_sent_maxlen)
            char_var = make_char_vector(qc, w2i_c, query_sent_maxlen, query_word_maxlen)
            model(char_var, word_var)
            
# Build the network from `conf` and run it via train(); as called here train() processes at most one batch.
attn = AttentionNet(conf)
train(attn)

word_var torch.Size([16, 25])
char_var torch.Size([16, 25, 14])
embd torch.Size([16, 25, 200])


In [14]:
# Inspect the first sample: field 3 prints as the question's word tokens, field 4 as the per-word character lists.
print('q', data[0][3])
print('qc', data[0][4])

q ['To', 'whom', 'did', 'the', 'Virgin', 'Mary', 'allegedly', 'appear', 'in', '1858', 'in', 'Lourdes', 'France', '?']
qc [['T', 'o'], ['w', 'h', 'o', 'm'], ['d', 'i', 'd'], ['t', 'h', 'e'], ['V', 'i', 'r', 'g', 'i', 'n'], ['M', 'a', 'r', 'y'], ['a', 'l', 'l', 'e', 'g', 'e', 'd', 'l', 'y'], ['a', 'p', 'p', 'e', 'a', 'r'], ['i', 'n'], ['1', '8', '5', '8'], ['i', 'n'], ['L', 'o', 'u', 'r', 'd', 'e', 's'], ['F', 'r', 'a', 'n', 'c', 'e'], ['?']]


In [15]:
# Scratch check: run CharEmbedding on the characters of the first question.
# NOTE(review): this re-assigns the global embd_size (same value as the earlier cell).
embd_size = 100
n_out_ch = 100
filters = [[1, 5]]
tmp_data = data[0][4]
# NOTE(review): max_len is not read again in the visible cells.
max_len = max([len(chars) for chars in tmp_data])
# Batch of one, zero-initialised to (1, query_sent_maxlen, query_word_maxlen);
# positions not overwritten by the loop below stay 0.
tmp_var = torch.zeros(1, query_sent_maxlen, query_word_maxlen).type(torch.LongTensor)
print('tmp_var.size()=', tmp_var.size())
for i, chars in enumerate(tmp_data):
    for j, ch in enumerate(chars):
        tmp_var[0][i][j] = w2i_c[ch]
# NOTE(review): CharEmbedding is built from four positional arguments here, whereas
# AttentionNet.__init__ passes it a single config object - confirm which signature
# the layer currently has before relying on this cell.
char_embd_net = CharEmbedding(vocab_size_c, embd_size, n_out_ch, filters)
print(char_embd_net)
out = char_embd_net(Variable(tmp_var))
print(out)
print('out', out.size())

tmp_var.size()= torch.Size([1, 25, 14])
CharEmbedding (
  (embedding): Embedding(89, 100)
  (conv): ModuleList (
    (0): Conv2d(1, 100, kernel_size=(1, 5), stride=(1, 1))
  )
  (dropout): Dropout (p = 0.5)
  (fc1): Linear (100 -> 1)
)
x torch.Size([1, 25, 14])
Variable containing:
( 0 ,.,.) = 
  247.3019  319.3441  262.0004  ...   330.8669  262.8564  279.7697
  290.2071  286.9221  323.5995  ...   282.1275  306.3464  255.2149
  208.0627  219.9793  210.1831  ...   224.5885  210.1546  189.7299
              ...                ⋱                ...             
  171.9374  220.1119  190.7723  ...   224.9468  207.3729  211.4682
  229.2606  220.2159  259.0305  ...   202.1327  247.5456  205.9599
  234.0376  280.7905  357.0085  ...   291.7557  364.4121  227.2117
[torch.FloatTensor of size 1x25x100]

out torch.Size([1, 25, 100])


In [17]:
# Scratch check: run WordEmbedding on the first question as an unpadded batch of one.
# NOTE(review): WordEmbedding is built from four positional arguments here, whereas
# AttentionNet.__init__ passes it a single config object - confirm which signature
# the layer currently has before relying on this cell.
word_embd_net = WordEmbedding(vocab_size_w, embd_size, False, glove_embd_w)
word_var = Variable(torch.LongTensor([[w2i_w[w] for w in data[0][3]]]))
out = word_embd_net(word_var)
print(out.size())


out torch.Size([1, 14, 100])
torch.Size([1, 14, 100])


In [30]:
class Config(object):
    """Attribute bag: every keyword argument becomes an instance attribute.

    NOTE(review): defining this class in the notebook rebinds the name
    `Config` that an earlier cell imports from the `config` module.
    """

    def __init__(self, **entries):
        # Write straight into the instance dictionary, one entry at a time.
        attrs = vars(self)
        for key, value in entries.items():
            attrs[key] = value
# Quick check of Config: `a` starts as a plain dict and is then rebound to a Config built from it.
a={'test':2}
a = Config(**a)

In [42]:
# Toy fixture: two equal pairs of string lists; the bare name on the last line displays it.
test = [(['aaaa', 'bbb', 'ccc'], ['a', 'b', 'c']) for _ in range(2)]
test

[(['aaaa', 'bbb', 'ccc'], ['a', 'b', 'c']),
 (['aaaa', 'bbb', 'ccc'], ['a', 'b', 'c'])]

In [73]:
# Two random 2x3 tensors for the torch.cat demo in the next cell.
# NOTE(review): no seed is set in the visible cells, so the printed values differ between runs.
x = torch.randn(2, 3)
y = torch.randn(2, 3)
print(x, y)


-0.7367  1.6024  1.0078
 0.0680  0.0471  2.3368
[torch.FloatTensor of size 2x3]
 
 0.4483 -1.5221 -0.9414
-0.2357 -0.0590  0.1027
[torch.FloatTensor of size 2x3]



In [75]:
# Concatenate along dim 1: the two 2x3 inputs give a 2x6 result (see the output below).
torch.cat((x, y), 1)


-0.7367  1.6024  1.0078  0.4483 -1.5221 -0.9414
 0.0680  0.0471  2.3368 -0.2357 -0.0590  0.1027
[torch.FloatTensor of size 2x6]