In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import pickle
import random
import time
import math
import numpy as np
import copy
import os

torch.manual_seed(1)

<torch._C.Generator at 0x7f48dc119798>

In [2]:
USE_CUDA=torch.cuda.is_available()
USE_CUDA

False

# Prepare Data

In [44]:
DIR_PATH='../../dataset/corpus/bAbI/en-10k/'
flist = os.listdir(DIR_PATH)

In [45]:
flatten = lambda l: [item for sublist in l for item in sublist]

In [46]:
data=[]

In [47]:
for f in flist:
    if f.endswith('train.txt'):
        fname = DIR_PATH+f
        print(fname)
        temp = open(fname,'r',encoding='utf-8').readlines()
        temp = [t[:-1] for t in temp]
        data.extend(temp)

../../dataset/corpus/bAbI/en-10k/qa3_three-supporting-facts_train.txt
../../dataset/corpus/bAbI/en-10k/qa17_positional-reasoning_train.txt
../../dataset/corpus/bAbI/en-10k/qa18_size-reasoning_train.txt
../../dataset/corpus/bAbI/en-10k/qa11_basic-coreference_train.txt
../../dataset/corpus/bAbI/en-10k/qa1_single-supporting-fact_train.txt
../../dataset/corpus/bAbI/en-10k/qa12_conjunction_train.txt
../../dataset/corpus/bAbI/en-10k/qa14_time-reasoning_train.txt
../../dataset/corpus/bAbI/en-10k/qa13_compound-coreference_train.txt
../../dataset/corpus/bAbI/en-10k/qa10_indefinite-knowledge_train.txt
../../dataset/corpus/bAbI/en-10k/qa9_simple-negation_train.txt
../../dataset/corpus/bAbI/en-10k/qa5_three-arg-relations_train.txt
../../dataset/corpus/bAbI/en-10k/qa16_basic-induction_train.txt
../../dataset/corpus/bAbI/en-10k/qa4_two-arg-relations_train.txt
../../dataset/corpus/bAbI/en-10k/qa15_basic-deduction_train.txt
../../dataset/corpus/bAbI/en-10k/qa7_counting_train.txt
../../dataset/corpus/b

In [3]:
data = open('../../dataset/corpus/bAbI/en/qa16_basic-induction_train.txt','r').readlines()
data = [d[:-1] for d in data]

In [48]:
train=[]
support=[]
qa=[]

In [49]:
for d in data:

    index = d.split(' ')[0]
    if index=='1':
        support=[]
        qa=[]
        
    if '?' in d:
        temp = d.split('\t')
        q = temp[0].strip().replace('?','').split(' ')[1:]
        a = temp[1]
        stemp = copy.deepcopy(support)
        train.append([stemp,(q,a)])
    else:
        tokens = d.replace('.','').split(' ')[1:]
        support.append(tokens)

In [50]:
random.shuffle(train)
len(train)

200000

In [51]:
train[0]

[[['Daniel', 'is', 'no', 'longer', 'in', 'the', 'bathroom'],
  ['Sandra', 'is', 'no', 'longer', 'in', 'the', 'bathroom'],
  ['John', 'is', 'in', 'the', 'hallway'],
  ['Mary', 'travelled', 'to', 'the', 'garden']],
 (['Is', 'John', 'in', 'the', 'bedroom'], 'no')]

In [52]:
word_to_ix={}
target_to_ix={}


for t in train:
    supports = t[0]
    q = t[1][0]
    a = t[1][1]
    
    s = flatten(supports)
    for word in s+q:
        if word.lower() not in list(word_to_ix.keys()):
            word_to_ix[word.lower()]=len(word_to_ix)
     
    if a.lower() not in list(target_to_ix.keys()):
        target_to_ix[a.lower()] = len(target_to_ix)
    
ix_to_word = {v:k for k,v in word_to_ix.items()}

In [57]:
len(word_to_ix)

145

In [11]:
def prepare_sequence(seq):
    idxs = list(map(lambda w: word_to_ix[w.lower()], seq))
    tensor = torch.LongTensor(idxs)
    if USE_CUDA: tensor = tensor.cuda()
    
    return Variable(tensor)

In [54]:
class RelationalNetwork(nn.Module):
    def __init__(self, input_vocab_size,target_vocab_size,hidden_size, n_layers=1):
        super(RelationalNetwork, self).__init__()
        
        self.input_vocab_size = input_vocab_size
        self.target_vocab_size = target_vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        
        self.embedding = nn.Embedding(input_vocab_size, hidden_size)
        self.lstm_o = nn.LSTM(hidden_size, hidden_size,num_layers=n_layers) # for objects
        self.lstm_q = nn.LSTM(hidden_size, hidden_size,num_layers=n_layers) # for question (separate LSTM)
        
        self.relu = nn.ReLU()
        self.reaky = nn.LeakyReLU()
        
        
        
        # g
        self.g_1 = nn.Linear(hidden_size*3,256) # obj+obj+question 
        self.g_2 = nn.Linear(256,256)
        self.g_3 = nn.Linear(256,256) # data is much smaller compared with paper.
        self.g_4 = nn.Linear(256,256)
        # f
        self.f_1 = nn.Linear(256,512)
        self.f_2 = nn.Linear(512,159)
        self.f_3 = nn.Linear(159,target_vocab_size)
        
    def init_hidden(self):
        
        hidden_h = Variable(torch.randn(self.n_layers, 1, self.hidden_size))
        hidden_c = Variable(torch.randn(self.n_layers, 1, self.hidden_size))
        
        if USE_CUDA:
            hidden_h = hidden_h.cuda()
            hidden_c = hidden_c.cuda()
       
        return (hidden_h,hidden_c)
    
    
    def forward(self, supports, question):
        
        # question embedding
        hidden_q = self.init_hidden()
        embedded = self.embedding(question)
        output, hidden_q = self.lstm_q(embedded.view(len(question),1,-1),hidden_q)
        q_embed = hidden_q[0]
        
        # object embedding
        obj=[]
        for support in supports:
            hidden_o = self.init_hidden()
            embedded = self.embedding(support)
            output, hidden_o = self.lstm_o(embedded.view(len(support),1,-1), hidden_o)
            obj.append(hidden_o[0])
        
        # g
        x_g_sum=0
        for o_i in range(len(obj)):
            for o_j in range(len(obj)):
                
                #if o_i==o_j: continue
                
                x_g = torch.cat((obj[o_i],obj[o_j],q_embed),2) # object1,object2, question
                x_g = x_g.squeeze(1)
                x_g = self.relu(self.g_1(x_g))
                x_g = self.relu(self.g_2(x_g))
                x_g = self.relu(self.g_3(x_g)) 
                x_g = self.relu(self.g_4(x_g))
                x_g_sum+=x_g # element-wise sum
        
        # f
        x_f = self.relu(self.f_1(x_g_sum))
        x_f = self.relu(self.f_2(x_f))
        x_f = self.f_3(x_f)
        
        out = F.log_softmax(x_f)
        
        return out

### Train

In [55]:
model = RelationalNetwork(len(word_to_ix),len(target_to_ix),32)
if USE_CUDA: model = model.cuda()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=0.036) # 논문에선 2*e^-4 .==. 0.036
losses=[]
STEP=30 # 100 

In [56]:
for step in range(STEP):
    step_losses=[]
    for index, (supports,qa) in enumerate(train):


        model.zero_grad()
        supports = list(map(lambda s:prepare_sequence(s),supports))
        question = prepare_sequence(qa[0])
        answer = target_to_ix[qa[1].lower()]
        answer = Variable(torch.LongTensor([answer]))
        if USE_CUDA: answer = answer.cuda()
        
        pred = model(supports,question)
        #print(pred)
        loss = loss_function(pred,answer)
        loss.backward()
        
        if USE_CUDA:
            loss = loss.data.cpu().numpy()[0]
        else:
            loss = loss.data.numpy()[0]
        
        losses.append(loss)
        step_losses.append(loss)
        torch.nn.utils.clip_grad_norm(model.parameters(), 5.0)

        optimizer.step()

        if index%100==0:
            #print(pred)
            print('STEP [%d/%d] EPOCH [%d/%d] LOSS %.4f' % (step+1,STEP,index,len(train),np.mean(losses)))
            losses=[]
    print('%d STEP LOSS %.4f' % (step+1,np.mean(step_losses)))      
    random.shuffle(train)

STEP [1/30] EPOCH [0/200000] LOSS 4.1563
STEP [1/30] EPOCH [100/200000] LOSS 63.0231
STEP [1/30] EPOCH [200/200000] LOSS 3.3209
STEP [1/30] EPOCH [300/200000] LOSS 3.3977
STEP [1/30] EPOCH [400/200000] LOSS 3.3747
STEP [1/30] EPOCH [500/200000] LOSS 3.3683
STEP [1/30] EPOCH [600/200000] LOSS 3.2013
STEP [1/30] EPOCH [700/200000] LOSS 3.4472
STEP [1/30] EPOCH [800/200000] LOSS 3.2229
STEP [1/30] EPOCH [900/200000] LOSS 3.1785
STEP [1/30] EPOCH [1000/200000] LOSS 3.3106
STEP [1/30] EPOCH [1100/200000] LOSS 3.2408
STEP [1/30] EPOCH [1200/200000] LOSS 3.2601
STEP [1/30] EPOCH [1300/200000] LOSS 3.3858
STEP [1/30] EPOCH [1400/200000] LOSS 3.2195
STEP [1/30] EPOCH [1500/200000] LOSS 3.3562
STEP [1/30] EPOCH [1600/200000] LOSS 3.3948
STEP [1/30] EPOCH [1700/200000] LOSS 3.2765
STEP [1/30] EPOCH [1800/200000] LOSS 3.3800
STEP [1/30] EPOCH [1900/200000] LOSS 3.4528
STEP [1/30] EPOCH [2000/200000] LOSS 3.2561
STEP [1/30] EPOCH [2100/200000] LOSS 3.4210
STEP [1/30] EPOCH [2200/200000] LOSS 3.3948

KeyboardInterrupt: 

### Test 

In [26]:
data = open('../../dataset/corpus/bAbI/en/qa16_basic-induction_test.txt','r',encoding='utf-8').readlines()
data = [d[:-1] for d in data]

In [27]:
test=[]
support=[]
qa=[]

In [28]:
for d in data:

    index = d.split(' ')[0]
    if index=='1':
        support=[]
        qa=[]
        
    if '?' in d:
        temp = d.split('\t')
        q = temp[0].strip().replace('?','').split(' ')[1:]
        a = temp[1]
        stemp = copy.deepcopy(support)
        test.append([stemp,(q,a)])
    else:
        tokens = d.replace('.','').split(' ')[1:]
        support.append(tokens)

In [29]:
len(test)

1000

In [34]:
accuracy=0

In [33]:
 for index, (supports,qa) in enumerate(test):

        supports = list(map(lambda s:prepare_sequence(s),supports))
        question = prepare_sequence(qa[0])
        answer = target_to_ix[qa[1]]
        pred = model(supports,question)
        v,i = torch.max(pred,1)
        if np.equal(i.view(-1).data.tolist()[0],answer):
            accuracy+=1

print(accuracy/len(test))

0.466


In [35]:
 for index, (supports,qa) in enumerate(train):

        supports = list(map(lambda s:prepare_sequence(s),supports))
        question = prepare_sequence(qa[0])
        answer = target_to_ix[qa[1]]
        pred = model(supports,question)
        v,i = torch.max(pred,1)
        if np.equal(i.view(-1).data.tolist()[0],answer):
            accuracy+=1

print(accuracy/len(test))

0.513
