In [55]:
import numpy as np
import theano
import theano.tensor as T
import lasagne

import sys
import time
import utils
import config
import logging
import main
import nn_layers

In [3]:
# Input locations, relative to the notebook's working directory:
# the CNN train/dev splits and the 100-d GloVe vectors.
train_file, dev_file = "../data/cnn/train.txt", "../data/cnn/dev.txt"
embedding_file = "../data/glove/glove.6B.100d.txt"

In [71]:
# Load the training and dev splits from disk.
# relabeling=True: per the original note, the entity ids in each passage are renumbered to start from 0.
# NOTE(review): no logging configuration is visible in this notebook; at Python's default
# WARNING level these INFO banners would not be shown -- confirm whether utils/main configures logging.
logging.info('*' * 10 + ' Train')
train_examples = utils.load_data(train_file, relabeling=True)
logging.info('*' * 10 + ' Dev')
dev_examples = utils.load_data(dev_file, relabeling=True)

<b>train_examples is a 3-tuple of parallel lists: (passages, questions, answers)</b>

In [72]:
# The first passage: train_examples[0] holds the passages, so [0][0] is passage number 0.
train_examples[0][0]

'days after two @entity0 journalists were killed in northern @entity1 , authorities rounded up dozens of suspects and a group linked to @entity2 claimed responsibility for the deaths . at least 30 suspects were seized in desert camps near the town of @entity3 and taken to the local @entity0 army base for questioning , three officials in @entity1 said . the officials did not want to be named because they are not authorized to talk to the media . @entity4 ( @entity4 ) has allegedly claimed responsibility for the killings , according to @entity5 news agency in @entity6 . @entity4 operates in northern @entity7 and the group \'s statements have shown up before on the @entity8 outlet . @entity9 journalists @entity10 and @entity11 were abducted in front of the home of a member of the @entity12 rebels \' @entity13 of a @entity14 on saturday , @entity9 reported . they were found dead the same day . their bodies arrived in @entity15 on tuesday . @entity3 was one of the strongholds of the @entity

In [15]:
# The first question: train_examples[1] holds the questions, aligned by position with the passages.
train_examples[1][0]

'officials : the suspects were taken to the local @placeholder army base for questioning'

In [16]:
# The first answer: train_examples[2] holds the answers, one entity marker per question.
train_examples[2][0]

'@entity0'

<b>Build the dictionary of training-set words; by default the 50,000 most frequent words are kept.</b>

Only the max_words most frequent words are kept; all remaining words are mapped to <UNK>.

In [73]:
# Build the word -> index vocabulary from all training passages and questions
# (the two lists are concatenated before being handed to build_dict).
word_dict = utils.build_dict(train_examples[0] + train_examples[1])

In [76]:
# word_dict maps each word to its index (its frequency rank, per the original note).
# Index 0 is left for UNK and index 1 for the "|||" delimiter.
# The lookup below is not the cell's last expression, so its value is not displayed;
# it only confirms that '@entity2' is in the vocabulary (a KeyError is raised otherwise).
word_dict['@entity2']
print(len(word_dict))

50000


<b>Build the dictionary of training-set entities</b>

In [74]:
# Gather every distinct entity marker: vocabulary words starting with '@entity'
# plus all training answers, de-duplicated through a set.
# '<unk_entity>' is prepended so that it always receives index 0.
# NOTE(review): the remaining indices follow set iteration order, which is not a
# documented stable ordering -- confirm nothing persists these indices across runs.
entity_markers = ['<unk_entity>'] + list(set(
    [w for w in word_dict if w.startswith('@entity')] + train_examples[2]
))
entity_dict = dict((marker, idx) for idx, marker in enumerate(entity_markers))

In [75]:
# Show the full entity-marker -> index mapping ('<unk_entity>' was placed first, so it maps to 0).
entity_dict

{'<unk_entity>': 0,
 '@entity0': 147,
 '@entity1': 291,
 '@entity10': 203,
 '@entity100': 208,
 '@entity101': 209,
 '@entity102': 210,
 '@entity103': 211,
 '@entity104': 212,
 '@entity105': 213,
 '@entity106': 3,
 '@entity107': 102,
 '@entity108': 9,
 '@entity109': 326,
 '@entity11': 202,
 '@entity110': 246,
 '@entity111': 253,
 '@entity112': 252,
 '@entity113': 251,
 '@entity114': 240,
 '@entity115': 249,
 '@entity116': 248,
 '@entity117': 247,
 '@entity118': 259,
 '@entity119': 258,
 '@entity12': 201,
 '@entity120': 141,
 '@entity121': 142,
 '@entity122': 139,
 '@entity123': 140,
 '@entity124': 137,
 '@entity125': 138,
 '@entity126': 135,
 '@entity127': 136,
 '@entity128': 133,
 '@entity129': 134,
 '@entity13': 200,
 '@entity130': 178,
 '@entity131': 177,
 '@entity132': 180,
 '@entity133': 179,
 '@entity134': 174,
 '@entity135': 173,
 '@entity136': 176,
 '@entity137': 175,
 '@entity138': 170,
 '@entity139': 169,
 '@entity14': 199,
 '@entity140': 62,
 '@entity141': 63,
 '@entity142': 

<b>Get the embedding data</b>

In [77]:
# Path to the GloVe vectors (repeats the value already assigned in the paths cell near the top).
embedding_file = "../data/glove/glove.6B.100d.txt"
# Build the embedding matrix for the vocabulary. The literal 100 is presumably the embedding
# dimension and would need to match the width of the vectors in embedding_file
# -- TODO confirm against utils.gen_embeddings.
embeddings = utils.gen_embeddings(word_dict, 100, embedding_file)

In [78]:
# Unpack the two dimensions of the embedding matrix and print each on its own line.
vocab_size, embedding_size = embeddings.shape
print(vocab_size)
print(embedding_size)

50002
100


<b>Use word_dict and entity_dict to vectorize the training data</b>

In [79]:
# train_x1, train_x2: sequences for document and question respectively.
# train_y: label
# Each of the four results has one entry per training example (their lengths are printed in the next cell).
# train_l: whether each entity label occurs in the document; per the original note its shape is
#          (len(examples[0]), len(entity_dict)) -- TODO confirm against utils.vectorize.
train_x1, train_x2, train_l, train_y = utils.vectorize(train_examples, word_dict, entity_dict)

In [80]:
# Print the number of entries in each vectorized output, one per line,
# in the order x1, x2, l, y; all four are expected to match.
for vectorized in (train_x1, train_x2, train_l, train_y):
    print(len(vectorized))

380298
380298
380298
380298


<b>Generate the actual mini-batches of training examples</b>

In [81]:
# Split the len(train_x1) training examples into mini-batches of size 32.
# NOTE(review): the return structure of get_minibatches is not visible here, and
# `minibatches` is not used by any later cell shown -- confirm whether this cell is still needed.
minibatches = utils.get_minibatches(len(train_x1), 32)

In [82]:
# Build the batched training data with main.gen_examples.
# Per the original note, each element is a tuple (mb_x1, mb_mask1, mb_x2, mb_mask2, mb_l, mb_y),
# and every mb_* is a list with batch_size entries.
# mask: 1 where there is a real value, 0 where there is none (presumably padding -- TODO confirm).
batch_size = 32
all_train = main.gen_examples(train_x1, train_x2, train_l, train_y, batch_size)

(32, 52, 100)