In [2]:
import keras
from keras import Sequential
from keras.layers import Embedding, LSTM, Input, Dense, dot
from keras.models import Model

import numpy as np
import pandas as pd
import pickle

In [14]:
EXTERNAL_DATA_DIRECTORY = '../data/external'
PROCESSED_DATA_DIRECTORY = '../data/processed'

## Embedding words as GloVe vectors
Global Vectors for Word Representation (GloVe) are pre-trained vectors which puts semantically similar words in similar vector space. This set of pre-trained vectors were trained on Common Crawl data. We will embed our words as the GloVe vector if available and a random vector otherwise.

In [4]:
MAX_SEQUENCE_LENGTH, vocab_index = pickle.load(open(os.path.join(PROCESSED_DATA_DIRECTORY,'params.pkl'), 'rb'))
vocab_set = set(vocab_index.keys())
num_words = len(vocab_index) + 1
embedding_dim = 300 # Length of the Glove vectors

In [7]:
# Building a dictionary of words from our dataset embedded with GloVe vectors
embedded_words = {}

with open(os.path.join(EXTERNAL_DATA_DIRECTORY, 'glove.840B.300d.txt')) as f:
    for line in f:
        vals = line.split(' ')
        word = vals[0]
        if word in vocab_set:
            vector = np.asarray(vals[1:], 'float32')
            embedded_words[word] = vector

In [11]:
(len(vocab_index) - len(embedded_words))/len(vocab_index) # proportion of words not found in glove file

0.5723387076426211

In [13]:
# Building matrix of embedded word vectors. If there is no GloVe representation, the row is random.
embeddings_matrix = np.random.uniform(-0.25, 0.25, (num_words, embedding_dim))

for word, idx in vocab_index.items():
    embedded_word = embedded_words.get(word)
    if embedded_word is not None:
        embeddings_matrix[idx] = embedded_word

## Building the model
The dual encoder architecture below consists of the context branch and the utterance or response branch. Each branch is a separate recurrent neural network that encodes the embedded text sequences of the context and response.

<img src="../images/dual_encoder_architecture.png" alt="The dual encoder architecture" width="500"/>

The two encoded vectors are combined as $\sigma(c^TMr)$ where $c$ and $r$ are the encoded context and response and matrix $M$ is a learned model parameter. <br>
Then: $c^TM = r'$ can be thought of as a generated response. $r' \cdot r$ will yield a similarity vector.
And then applying the sigmoid function will return a probability. This is the probability that the response is the correct one to the context.

In [19]:
context_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
response_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

encoder = Sequential()
encoder.add(Embedding(input_dim=num_words,
                             output_dim=embedding_dim,
                             weights=[embeddings_matrix],
                             mask_zero=True))
encoder.add(LSTM(200))

context_encoded = encoder(context_input)
response_encoded = encoder(response_input)

generated_response = Dense(200, use_bias=False)(context_encoded) # c*M

logits = dot([generated_response, response_encoded], axes=1)

probs = Dense(1, activation='sigmoid')(logits)

model = Model(inputs=[context_input, response_input], outputs=[probs])

In [20]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

## Loading the processed data

Data was filtered and tokenized and saved in pickle files by running: `$ python3 utilities/prepare_data.py`

In [5]:
train_contexts, train_responses, train_labels = pickle.load(
                                            open(os.path.join(PROCESSED_DATA_DIRECTORY,'train.pkl'), 'rb'))
test_contexts, test_responses, test_labels = pickle.load(
                                            open(os.path.join(PROCESSED_DATA_DIRECTORY,'test.pkl'), 'rb'))
validation_contexts, validation_responses, validation_labels = pickle.load(
                                            open(os.path.join(PROCESSED_DATA_DIRECTORY,'valid.pkl'), 'rb'))

## Evaluation Metric

The test and validation sets contain the correct response to a given context as well as 9 false responses. The metric the paper uses is recall at k which is the proportion of test examples that contain the true response in the top $k$ predicted probabilities.

In [36]:
def recall_at_k(predictions, k=1, n=10):
    num_examples = float(len(predictions))/n
    num_correct = 0
    for i in range(0, len(predictions), n):
        test_case = predictions[i:i+n]
        # 0th example is always the ground truth utterance
        if 0 in test_case.argsort(axis=0,)[::-1][:k]:
            num_correct += 1
    return num_correct/num_examples

## Training
We check the recall at one in 10 each epoch to determine when to stop training.

In [21]:
# Used early stopping if Recall @ 1 decreased
old_recall = 0
for epoch in range(50):
    model.fit([train_contexts, train_responses], train_labels, 
              validation_data=([validation_contexts, validation_responses], [validation_labels]), 
              batch_size=256, epochs=1)
    
    y_pred = model.predict([validation_contexts, validation_responses])
    recall_at_one = recall_at_k(y_pred, k=1)
    print(recall_at_one, recall_at_k(y_pred, k=2), recall_at_k(y_pred, k=5), sep='\n')

    if recall_at_one <= old_recall:
        break
    
    old_recall = recall_at_one

Train on 718540 samples, validate on 130530 samples
Epoch 1/1
0.5225899975839575
0.7093500845614883
0.9257469598131594
Train on 718540 samples, validate on 130530 samples
Epoch 1/1
0.5614882821937666
0.7407586373520174
0.9331561568816944
Train on 718540 samples, validate on 130530 samples
Epoch 1/1
0.5515019731013933
0.732463558025288
0.9354111299025529


## Results

In [3]:
y_pred = model.predict([test_contexts, test_responses], verbose=1)

In [37]:
for k in [1, 2, 5]:
    print("recall @ %d in 10:" % k, recall_at_k(y_pred, k))

recall @ 1: 0.5515019731013933
recall @ 2: 0.732463558025288
recall @ 5: 0.9354111299025529


In [51]:
# Recall @ 1 in 2
y_pred2 = []
for i in range(0, len(y_pred), 10):
    y_pred2.append(y_pred[i])
    y_pred2.append(y_pred[i+1])
y_pred2 = np.array(y_pred2)

print("recall @ %d in 2:" % k, recall_at_k(y_pred2, 1, 2))

In [22]:
model.save('dual_encoder_model.h5')