In [2]:
import tensorflow as tf
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [1]:
import json
import random
# Load the four dataset splits from the working directory.
# The files are decoded explicitly as UTF-8: JSON text is UTF-8, while the
# default encoding of open() depends on the platform locale, so relying on it
# can corrupt or fail on non-ASCII text on some systems.
with open('training.json', encoding='utf-8') as json_file:
    train_json = json.load(json_file)
with open('development.json', encoding='utf-8') as json_file:
    dev_json = json.load(json_file)
with open('testing.json', encoding='utf-8') as json_file:
    test_json = json.load(json_file)
with open('sample.json', encoding='utf-8') as json_file:
    sample_json = json.load(json_file)
    
def get_dic_from_two_lists(keys, values):
    """Pair two parallel sequences into a dictionary.

    Each element of ``keys`` is mapped to the element of ``values`` at the
    same position. Extra trailing elements in ``values`` are ignored; if
    ``values`` is shorter than ``keys`` an ``IndexError`` is raised. When a
    key occurs more than once, the value of its last occurrence wins.
    """
    paired = {}
    for position, key in enumerate(keys):
        paired[key] = values[position]
    return paired

In [18]:
# Flatten the development set into one tuple per question:
#   (context, question, answer text, answer start, answer end)
data = []
for article in dev_json['data']:
    for paragraph in article['paragraphs']:
        for qa in paragraph['qas']:
            # Both two-character quote markers are swapped for a two-character
            # replacement, so the length of the context does not change
            # (presumably so the annotated character offsets stay valid).
            context = paragraph['context'].replace("''", '" ').replace("``", '" ')
            question = qa['question']
            # Only the first listed answer is used.
            answer = qa['answers'][0]
            ans_text = answer['text']
            ans_start = answer['answer_start']
            # Exclusive end offset of the answer.
            ans_end = ans_start + len(ans_text)
            data.append((context, question, ans_text, ans_start, ans_end))

# Parallel lists holding the first three tuple fields.
contexts = [example[0] for example in data]
questions = [example[1] for example in data]
answers = [example[2] for example in data]

In [177]:

def embedding(sequence, embeddings_index, embedding_dim=100):
    """Tokenize texts, pad them, and build a matrix of pretrained vectors.

    Parameters
    ----------
    sequence : list of str
        Raw texts to fit the tokenizer on and convert to integer sequences.
    embeddings_index : mapping
        Word -> vector lookup; only its ``.get(word)`` method is used.
    embedding_dim : int, optional
        Width of every vector in ``embeddings_index`` (default 100, the value
        that used to be hard-coded).

    Returns
    -------
    tuple
        ``(embedding_matrix, max_len, data, word_index)`` where
        ``embedding_matrix`` has shape ``(len(word_index) + 1, embedding_dim)``,
        ``max_len`` is the padded length, ``data`` is the output of
        ``pad_sequences`` and ``word_index`` is the tokenizer vocabulary.
    """
    # NOTE(review): this is the longest text measured in *characters* of the
    # raw strings, not in tokens, so the padded length is larger than the
    # longest token sequence. Kept as-is because callers size their inputs
    # from the returned value -- confirm whether token length was intended.
    max_len = np.array([len(text) for text in sequence]).max()

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sequence)
    sequences = tokenizer.texts_to_sequences(sequence)
    word_index = tokenizer.word_index
    data = pad_sequences(sequences, maxlen=max_len)

    # One row per vocabulary index plus one extra row (row 0), all starting
    # at zero. Words not found in the embedding index keep their all-zero row.
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix, max_len, data, word_index

In [178]:
# Build a word -> vector lookup from the GloVe text file. Each line is split
# on whitespace: the first field is the word, the rest are its coefficients.
embeddings_index = {}
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, which varies between systems and can fail on non-ASCII tokens.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# Each result is the tuple (embedding_matrix, max_len, data, word_index).
q_embed = embedding(questions,embeddings_index)
c_embed = embedding(contexts,embeddings_index)

Found 400000 word vectors.


In [180]:
from keras.layers import Embedding

def _frozen_embedding_layer(embed):
    """Create a non-trainable Embedding layer from an ``embedding()`` result.

    ``embed`` is the tuple ``(embedding_matrix, max_len, data, word_index)``.
    """
    matrix, max_len, _padded, vocab = embed
    return Embedding(len(vocab) + 1,
                     100,
                     weights=[matrix],
                     input_length=max_len,
                     trainable=False)


# Layer 1 embeds the questions, layer 2 the contexts.
embedding_layer_1 = _frozen_embedding_layer(q_embed)
embedding_layer_2 = _frozen_embedding_layer(c_embed)

In [181]:
# Width of the label matrix: the longest context, measured in characters.
contexts_max_length = np.array([len(x) for x in contexts]).max()
# One row per example; a 1 marks the first and the last character of the
# answer span inside the context, every other position stays 0.
answer_one_hot = np.zeros((len(answers),contexts_max_length))
# Use the annotated offsets already stored in `data` rather than searching the
# context. The previous search compared only the first and last character of
# the answer and never stopped, so it marked every coincidental match, and it
# raised IndexError when the answer text was empty.
for i, (_context, _question, span_text, span_start, span_end) in enumerate(data):
    if not span_text:
        # Empty answer text: there is no span to mark, leave the row all-zero.
        continue
    # `span_end` is exclusive (start + length), so the last character of the
    # answer sits at span_end - 1.
    answer_one_hot[i, [span_start, span_end - 1]] = 1

In [3]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers import concatenate

# Question branch: integer ids -> frozen embeddings -> LSTM encoder.
questions_input = Input(shape=(q_embed[1],))
embedded_questions = embedding_layer_1(questions_input)
question_encoder = LSTM(80, return_state=True)
# Only the two returned states are used; the sequence output is discarded.
_, state_h_q, state_c_q = question_encoder(embedded_questions)

# Context branch: same structure with its own embedding layer and LSTM.
contexts_input = Input(shape=(c_embed[1],))
embedded_contexts = embedding_layer_2(contexts_input)
context_encoder = LSTM(80, return_state=True)
_, state_h_c, state_c_c = context_encoder(embedded_contexts)

# Join the four state tensors and predict one score per label column.
encoder_states = [state_h_c, state_h_q, state_c_c, state_c_q]
H = concatenate(encoder_states)
n_positions = answer_one_hot.shape[1]
preds = Dense(n_positions, activation='softmax')(H)

In [199]:
# Assemble the two-input model; contexts come first, then questions.
model_inputs = [contexts_input, questions_input]
model = Model(model_inputs, preds)
model.compile(
    optimizer='RMSprop',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Training arrays are listed in the same order as the model inputs.
x_train = [c_embed[2], q_embed[2]]
y_train = answer_one_hot
model.fit(x_train, y_train, batch_size=128, epochs=2)