In [None]:
import keras
import pandas as pd
import numpy as np
import os.path

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import optimizers
from keras import backend as K

# ----- configuration: everything a reader might want to tune lives here

DATA_PATH = "data"     # directory holding the generated train.csv / test.csv
NUMBER_SAMPLES = 1000  # ----- to test on a small dataset; set to None to use the full dataset
SEED = 42              # fixed seed so the random subsampling below is reproducible across runs

np.random.seed(SEED)

In [None]:
# ----- Note: the train.csv and test.csv files read by this cell must first be generated with
# ----- https://github.com/rkadlec/ubuntu-ranking-dataset-creator.git

MAX_SEQUENCE_LENGTH = 160  # every context / response is padded or truncated to this many tokens
NUM_DISTRACTORS = 9        # the test set carries columns Distractor_0 .. Distractor_8

# ----- load datasets

train_df = pd.read_csv(os.path.join(DATA_PATH, "train.csv"))
test_df = pd.read_csv(os.path.join(DATA_PATH, "test.csv"))

# ----- optionally subsample (rows are drawn with replacement) to test on a small dataset

if NUMBER_SAMPLES is not None:
    # integer division: np.random.choice needs an int size; always keep at least one test row
    num_test_samples = max(1, NUMBER_SAMPLES // 100)
    train_df = train_df.iloc[np.random.choice(train_df.shape[0], NUMBER_SAMPLES)].reset_index(drop=True)
    # draw test rows from the test frame itself, not from the (already subsampled) train frame
    test_df = test_df.iloc[np.random.choice(test_df.shape[0], num_test_samples)].reset_index(drop=True)

# ----- placeholder kept from the original cell; nothing in this cell populates it

valid_data_response = list()

# ----- fit tokenizer (cast to str so non-string cells such as NaN do not break tokenization)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df.Context.astype(str))


def encode_texts(texts):
    """
    Encode a pandas Series of texts as a 2-D int32 array of token ids.

    Each text is converted to a sequence of ints with the fitted `tokenizer`, then padded with zeros
    or truncated (both at the end) to MAX_SEQUENCE_LENGTH.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts.astype(str)),
        maxlen=MAX_SEQUENCE_LENGTH,
        dtype='int32',
        padding='post',
        truncating='post',
        value=0.0
    )


# ----- preprocess train data context and response

train_data_context = encode_texts(train_df.Context)
train_data_response = encode_texts(train_df.Utterance)

# ---- needed for model parameters: index 0 is the padding value, word indices start at 1.
# ---- Taken from the tokenizer rather than train_data_context.max(), because truncation can drop
# ---- the highest word index from the context matrix while it still appears in a response.

vocab_size = len(tokenizer.word_index) + 1

# ----- train data labels

train_data_labels = train_df.Label.copy()

# ----- preprocess test dataset

# ----- we tile the context since each context is paired with 1 true response and 9 false responses;
# ----- rows are laid out block-wise: all true pairs first, then one block per distractor column

test_data_context = np.tile(encode_texts(test_df.Context), (NUM_DISTRACTORS + 1, 1))

# ----- true responses first, then the distractors (false responses, sampled randomly from dataset)

test_response_blocks = [encode_texts(test_df["Ground Truth Utterance"])]
for r in range(NUM_DISTRACTORS):
    test_response_blocks.append(encode_texts(test_df["Distractor_{}".format(r)]))
test_data_response = np.concatenate(test_response_blocks)

# ----- test labels: must follow the same block-wise layout as the rows above, i.e. one block of
# ----- ones (true responses) followed by NUM_DISTRACTORS blocks of zeros

test_data_labels = np.repeat(np.array([1] + [0] * NUM_DISTRACTORS), test_df.shape[0])

assert test_data_context.shape[0] == test_data_response.shape[0] == test_data_labels.shape[0]

In [None]:
# Example 1: Siamese Dual LSTM for Slot-filling Chatbot

# ----- declare custom keras layer

class SimilarityLayer(keras.layers.Layer):
    """
    Custom "similarity" layer that scores each (inputs[0], inputs[1]) pair of encodings in a batch.

    For every example i it computes sigmoid(a_i^T M b_i), where a = inputs[0], b = inputs[1] and M is
    a trainable (kernel_dim, kernel_dim) matrix. See Lowe et al., Proceedings of the SIGDIAL 2015
    Conference, p. 290 for details

    # Arguments
        kernel_dim: size of the last axis of both input encodings (M is kernel_dim x kernel_dim).

    # Input
        A list of two tensors; the code multiplies each by M, so both need last dimension kernel_dim.

    # Output
        One score in (0, 1) per example, shape (batch, 1).
    """

    def __init__(self, kernel_dim, **kwargs):
        self.kernel_dim = kernel_dim
        # last output tensor; kept as an attribute for backward compatibility only
        self.result = None
        super(SimilarityLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable bilinear matrix M."""
        self.kernel = self.add_weight(name='kernel',
                                      shape=(self.kernel_dim, self.kernel_dim),
                                      initializer='truncated_normal',
                                      trainable=True)
        super(SimilarityLayer, self).build(input_shape)

    def call(self, inputs):
        """Return sigmoid(a_i^T M b_i) for every example i in the batch."""
        # (batch, dim) . (dim, dim) -> (batch, dim)
        projected = K.dot(inputs[0], self.kernel)
        # row-wise inner product with the second input -> (batch, 1).
        # A plain K.dot against the transposed second input would instead score every example
        # against every other example in the batch, giving a (batch, batch) matrix.
        self.result = K.sigmoid(K.sum(projected * inputs[1], axis=-1, keepdims=True))
        return self.result

    def compute_output_shape(self, input_shape):
        """Output is one score per example; derived from input_shape so it works before call()."""
        return (input_shape[0][0], 1)

    def get_config(self):
        """Include kernel_dim so the layer can be re-created when a model is saved and loaded."""
        config = super(SimilarityLayer, self).get_config()
        config['kernel_dim'] = self.kernel_dim
        return config

MODEL_DIM = 20


def _build_encoder(sequence_length=160):
    """
    Build one text encoder: int token input -> Embedding -> LSTM, following Lowe et. al. 2015.

    Every call creates its own Embedding and LSTM layers, so encoders built by separate calls do
    not share weights. Returns the (input tensor, encoding tensor) pair.
    """
    tokens = keras.layers.Input(shape=(sequence_length,))
    embedded = keras.layers.Embedding(vocab_size, MODEL_DIM, input_length=sequence_length)(tokens)
    return tokens, keras.layers.LSTM(MODEL_DIM)(embedded)


# ----- one encoder for the context, one for the response

context_input, context_encoding = _build_encoder()
response_input, response_encoding = _build_encoder()

# ----- our custom similarity layer, applied to the response and context encodings (in that order)

similarity_layer = SimilarityLayer(kernel_dim=MODEL_DIM)
predicted_similarity = similarity_layer([response_encoding, context_encoding])

# ----- declare inputs and outputs of the dual LSTM model, then compile it

dual_lstm_model = keras.models.Model(
    inputs=[context_input, response_input],
    outputs=[predicted_similarity],
)
dual_lstm_model.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(lr=1e-2),
)

In [None]:
# ----- train the dual LSTM; the test pairs are passed as validation data for each epoch

dual_lstm_model_history = dual_lstm_model.fit(
    x=[train_data_context, train_data_response],
    y=train_data_labels,
    validation_data=(
        [test_data_context, test_data_response],
        test_data_labels,
    ),
    batch_size=128,
    epochs=100,
)

In [None]:
# Example 2: Bidirectional LSTM + Attention Decoder

# ----- custom attention layer from https://github.com/datalogue/keras-attention.git

import sys
sys.path.append('keras-attention')
from models.custom_recurrents import AttentionDecoder
from keras.layers.wrappers import Bidirectional


In [None]:
# ---- layers: embedding, bidirectional LSTM encoder (forward/backward outputs concatenated),
# ---- and the attention decoder imported from keras-attention

MODEL_DIM = 20

embedding_layer = keras.layers.Embedding(vocab_size, MODEL_DIM, input_length=160)
bidirectional_encoder = Bidirectional(
    keras.layers.LSTM(MODEL_DIM, return_sequences=True),
    merge_mode='concat',
)
attention_decoder = AttentionDecoder(MODEL_DIM, vocab_size)

# ----- wire the layers together: context tokens -> encoding -> decoding

context_input = keras.layers.Input(shape=(160,))
context_encoding = bidirectional_encoder(embedding_layer(context_input))
context_decoding = attention_decoder(context_encoding)

# ----- declare and compile model

bidirectional_attention_model = keras.models.Model(inputs=context_input, outputs=context_decoding)
bidirectional_attention_model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(lr=1e-3),
)

In [None]:
# ----- fit model: contexts are the inputs, one-hot encoded responses are the targets

from keras.utils import np_utils


def _one_hot_tokens(token_ids):
    """One-hot encode an array of token ids over the vocab_size classes."""
    return np_utils.to_categorical(token_ids, num_classes=vocab_size)


bidirectional_attention_model_history = bidirectional_attention_model.fit(
    x=train_data_context,
    y=_one_hot_tokens(train_data_response),
    validation_data=(test_data_context, _one_hot_tokens(test_data_response)),
    batch_size=128,
    epochs=100,
)