In [1]:
%matplotlib inline

In [2]:
import importlib, utils2; importlib.reload(utils2)
from utils2 import *

Using TensorFlow backend.


In [3]:
# Print NumPy arrays with 4 digits of precision.
np.set_printoptions(4)
# Build a TensorFlow session config with the GPU `allow_growth` option enabled
# (GPU memory is then claimed as needed instead of all up front) and register
# that session as the one Keras uses.
# NOTE(review): `np` and `K` are presumably provided by `from utils2 import *` — confirm.
cfg = K.tf.ConfigProto(gpu_options={'allow_growth':True})
K.set_session(K.tf.Session(config=cfg))

In [12]:
def tokenize(sent):
    """Split a sentence into word and punctuation tokens.

    Runs of non-word characters are kept as their own tokens (after
    stripping surrounding whitespace); pure-whitespace pieces are dropped.

    Example:
        tokenize('Where is John?') -> ['Where', 'is', 'John', '?']

    Args:
        sent: The sentence to tokenize (str).

    Returns:
        A list of non-empty token strings in their original order.
    """
    # The separator group must not be optional: a pattern such as '(\W+)?' can
    # match the empty string, which raises a FutureWarning on older Pythons and,
    # from Python 3.7 on, splits between every character and yields None groups.
    pieces = re.split(r'(\W+)', sent)
    return [piece.strip() for piece in pieces if piece.strip()]

In [13]:
def parse_stories(lines):
    """Parse bAbI-format lines into (story, question, answer) examples.

    Each line starts with a numeric id followed by a space. An id of 1 starts a
    new story. Lines containing tabs are questions of the form
    ``question<TAB>answer<TAB>supporting-fact-ids``; every other line is a
    statement that is appended to the current story.

    Args:
        lines: Iterable of lines, either UTF-8 encoded ``bytes`` (as read from
            the tar archive) or already-decoded ``str``.

    Returns:
        A list of ``(substory, question_tokens, answer)`` tuples, where
        ``substory`` is the list of statements seen so far in the story, each
        a token list prefixed with its position marker (e.g. ``'0:'``).
    """
    data = []
    story = []
    for line in lines:
        # Archive members yield bytes; also accept text that is already decoded.
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        line = line.strip()
        if not line:
            # A blank line carries no id to split off; skip it instead of crashing.
            continue
        nid, line = line.split(' ', 1)
        if int(nid) == 1: story = []
        if '\t' in line:
            # The supporting-fact ids are parsed but not used.
            q, a, _supporting = line.split('\t')
            q = tokenize(q)
            # Prefix each statement with its position in the story. Question
            # slots are stored as '' (see below) and filtered out here, so the
            # position markers skip over them.
            substory = [[str(i)+":"]+x for i,x in enumerate(story) if x]
            data.append((substory, q, a))
            # Placeholder keeps later statement positions aligned with line order.
            story.append('')
        else: story.append(tokenize(line))
    return data

In [14]:
# Fetch the bAbI tasks archive from the given origin URL via `get_file`
# and open the resulting local path as a tar archive.
# NOTE(review): `tar` is never closed in the visible cells.
path = get_file('babi-tasks-v1-2.tar.gz', 
               origin = 'https://s3.amazonaws.com/text-datasets/babi_tasks_1-20_v1-2.tar.gz')
tar = tarfile.open(path)

In [15]:
# Archive member path templates for the tasks this notebook can run; the `{}`
# placeholder is filled with 'train' or 'test' when the files are extracted.
challenges = {
    'single_supporting_fact_10k': 'tasks_1-20_v1-2/en-10k/qa1_single-supporting-fact_{}.txt',
    'two_supporting_facts_10k': 'tasks_1-20_v1-2/en-10k/qa2_two-supporting-facts_{}.txt',
    'two_supporting_facts_1k': 'tasks_1-20_v1-2/en/qa2_two-supporting-facts_{}.txt',
}
# Select the task used by the cells that follow.
challenge_type = 'single_supporting_fact_10k'
challenge = challenges[challenge_type]

In [16]:
def get_stories(f):
    """Read every line from the file object `f` and parse it into examples.

    Returns a list of (story, question, answer) tuples as produced by
    `parse_stories`.
    """
    parsed = parse_stories(f.readlines())
    examples = []
    for story, q, answer in parsed:
        examples.append((story, q, answer))
    return examples

In [17]:
# Parse the train and test files of the selected task directly from the archive.
train_stories = get_stories(tar.extractfile(challenge.format('train')))
test_stories = get_stories(tar.extractfile(challenge.format('test')))

  return _compile(pattern, flags).split(string, maxsplit)


In [18]:
# Inspect the first parsed test example: (numbered story sentences, question tokens, answer).
test_stories[0]

([['0:', 'John', 'travelled', 'to', 'the', 'hallway', '.'],
  ['1:', 'Mary', 'journeyed', 'to', 'the', 'bathroom', '.']],
 ['Where', 'is', 'John', '?'],
 'hallway')

In [19]:
# Combine both splits so the length statistics and vocabulary below cover every example.
stories = train_stories + test_stories

In [20]:
# Longest sentence (in tokens) across all stories.
story_maxlen = max(len(sent) for story, _, _ in stories for sent in story)
# Largest number of sentences in any single story.
story_maxsents = max(len(story) for story, _, _ in stories)
# Longest question (in tokens).
query_maxlen = max(len(query) for _, query, _ in stories)

In [23]:
def do_flatten(el):
    """Return True if `el` is a container that `flatten` should descend into.

    Strings and bytes are iterable but are treated as leaf values.
    """
    # `collections.Iterable` was removed in Python 3.10; the ABC lives in
    # `collections.abc`. Imported locally so this cell does not depend on the
    # submodule having been loaded elsewhere.
    from collections.abc import Iterable
    return isinstance(el, Iterable) and not isinstance(el, (str, bytes))
def flatten(l):
    """Lazily yield the leaf values of an arbitrarily nested iterable `l`.

    Nested containers are expanded depth-first in order; `str` and `bytes`
    items are yielded whole rather than character by character.
    """
    for el in l:
        if do_flatten(el): yield from flatten(el)
        else: yield el

In [24]:
# Sorted list of every distinct token found in the stories, questions, and answers.
vocab = sorted(set(flatten(stories)))

['.',
 '0:',
 '10:',
 '12:',
 '13:',
 '1:',
 '3:',
 '4:',
 '6:',
 '7:',
 '9:',
 '?',
 'Daniel',
 'John',
 'Mary',
 'Sandra',
 'Where',
 'back',
 'bathroom',
 'bedroom',
 'garden',
 'hallway',
 'is',
 'journeyed',
 'kitchen',
 'moved',
 'office',
 'the',
 'to',
 'travelled',
 'went']

In [26]:
# Put the padding token at index 0 of the vocabulary.
# Guarded so that re-running this cell does not insert a second '<PAD>' and
# shift every other token's index.
if not vocab or vocab[0] != '<PAD>':
    vocab.insert(0, '<PAD>')

In [27]:
# vocab

['<PAD>',
 '.',
 '0:',
 '10:',
 '12:',
 '13:',
 '1:',
 '3:',
 '4:',
 '6:',
 '7:',
 '9:',
 '?',
 'Daniel',
 'John',
 'Mary',
 'Sandra',
 'Where',
 'back',
 'bathroom',
 'bedroom',
 'garden',
 'hallway',
 'is',
 'journeyed',
 'kitchen',
 'moved',
 'office',
 'the',
 'to',
 'travelled',
 'went']

In [29]:
# Number of vocabulary entries, including the '<PAD>' token; displayed as the cell output.
vocab_size = len(vocab); vocab_size

32

In [32]:
# Spot-check another parsed test example.
test_stories[530]

([['0:', 'Mary', 'moved', 'to', 'the', 'office', '.'],
  ['1:', 'John', 'moved', 'to', 'the', 'garden', '.']],
 ['Where', 'is', 'John', '?'],
 'garden')

In [33]:
# Map each vocabulary token to its position in `vocab`.
word_idx = {token: index for index, token in enumerate(vocab)}

In [36]:
# Display the token -> integer index mapping.
word_idx

{'.': 1,
 '0:': 2,
 '10:': 3,
 '12:': 4,
 '13:': 5,
 '1:': 6,
 '3:': 7,
 '4:': 8,
 '6:': 9,
 '7:': 10,
 '9:': 11,
 '<PAD>': 0,
 '?': 12,
 'Daniel': 13,
 'John': 14,
 'Mary': 15,
 'Sandra': 16,
 'Where': 17,
 'back': 18,
 'bathroom': 19,
 'bedroom': 20,
 'garden': 21,
 'hallway': 22,
 'is': 23,
 'journeyed': 24,
 'kitchen': 25,
 'moved': 26,
 'office': 27,
 'the': 28,
 'to': 29,
 'travelled': 30,
 'went': 31}

In [37]:
def vectorize_stories(data, word_idx, story_maxlen, query_maxlen):
    """Convert tokenized examples into integer-index arrays.

    Args:
        data: Iterable of (story, query, answer) examples, where `story` is a
            list of token lists, `query` a token list and `answer` one token.
        word_idx: Mapping from token to integer index.
        story_maxlen: Length every story sentence is padded to.
        query_maxlen: Length every query is padded to.

    Returns:
        A 3-tuple of: a list with one `pad_sequences` result per story, the
        `pad_sequences` result for all queries, and an array holding the
        answer index of each example (one single-element row per example).
    """
    story_ids, query_ids, answer_ids = [], [], []
    for story, query, answer in data:
        story_ids.append([[word_idx[tok] for tok in sent] for sent in story])
        query_ids.append([word_idx[tok] for tok in query])
        answer_ids.append([word_idx[answer]])
    # Stories are padded one at a time because they differ in sentence count.
    padded_stories = [pad_sequences(sents, maxlen=story_maxlen) for sents in story_ids]
    padded_queries = pad_sequences(query_ids, maxlen=query_maxlen)
    return (padded_stories, padded_queries, np.array(answer_ids))

In [38]:
# Vectorize the train and test examples with the shared vocabulary and padding lengths.
# NOTE(review): the test-side name is `input_test` (singular), unlike `inputs_train`.
inputs_train, queries_train, answers_train = vectorize_stories(train_stories, 
                                                              word_idx, story_maxlen, query_maxlen)
input_test, queries_test, answers_test = vectorize_stories(test_stories, 
                                                          word_idx, story_maxlen, query_maxlen)

In [39]:
def stack_inputs(inputs, maxsents=None, maxlen=None):
    """Zero-pad each story to the same number of sentences and stack them.

    Args:
        inputs: Sequence of 2-D integer arrays, one per story, each with
            `maxlen` columns and at most `maxsents` rows.
        maxsents: Number of rows every story is padded to. Defaults to the
            notebook-level `story_maxsents`.
        maxlen: Number of columns of the all-zero padding rows. Defaults to
            the notebook-level `story_maxlen`.

    Returns:
        A single array of shape (len(inputs), maxsents, maxlen); padding rows
        are appended after each story's own rows.

    Raises:
        ValueError: If a story has more than `maxsents` rows, or `inputs` is
            empty (raised by NumPy).
    """
    if maxsents is None: maxsents = story_maxsents
    if maxlen is None: maxlen = story_maxlen
    # Build a new list rather than overwriting the caller's entries in place,
    # so the argument is left untouched.
    padded = [np.concatenate([it,
                              np.zeros((maxsents - it.shape[0], maxlen), 'int')])
              for it in inputs]
    return np.stack(padded)

[array([[ 0,  2, 15, 26, 29, 28, 19,  1],
        [ 0,  6, 14, 31, 29, 28, 22,  1]], dtype=int32),
 array([[ 0,  2, 15, 26, 29, 28, 19,  1],
        [ 0,  6, 14, 31, 29, 28, 22,  1],
        [ 7, 13, 31, 18, 29, 28, 22,  1],
        [ 0,  8, 16, 26, 29, 28, 21,  1]], dtype=int32),
 array([[ 0,  2, 15, 26, 29, 28, 19,  1],
        [ 0,  6, 14, 31, 29, 28, 22,  1],
        [ 7, 13, 31, 18, 29, 28, 22,  1],
        [ 0,  8, 16, 26, 29, 28, 21,  1],
        [ 0,  9, 14, 26, 29, 28, 27,  1],
        [ 0, 10, 16, 24, 29, 28, 19,  1]], dtype=int32),
 array([[ 0,  2, 15, 26, 29, 28, 19,  1],
        [ 0,  6, 14, 31, 29, 28, 22,  1],
        [ 7, 13, 31, 18, 29, 28, 22,  1],
        [ 0,  8, 16, 26, 29, 28, 21,  1],
        [ 0,  9, 14, 26, 29, 28, 27,  1],
        [ 0, 10, 16, 24, 29, 28, 19,  1],
        [ 0, 11, 15, 26, 29, 28, 22,  1],
        [ 0,  3, 13, 30, 29, 28, 27,  1]], dtype=int32),
 array([[ 0,  2, 15, 26, 29, 28, 19,  1],
        [ 0,  6, 14, 31, 29, 28, 22,  1],
        [ 7, 13,

In [43]:
# Pad every story to `story_maxsents` sentences and stack each split into one array.
# NOTE: both names are rebound here from a list of per-story arrays to a single stacked array.
inputs_train = stack_inputs(inputs_train)
input_test = stack_inputs(input_test)

In [46]:
# Pair the stacked stories with their queries for each split.
# NOTE(review): presumably these are the model's training and validation inputs — confirm against later cells.
inps = [inputs_train, queries_train]
val_inps = [input_test, queries_test]