In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=False)

In [None]:
! wget https://raw.githubusercontent.com/hse-aml/natural-language-processing/master/setup_google_colab.py -O setup_google_colab.py
import setup_google_colab
setup_google_colab.setup_honor()

In [None]:
! sh download_cornell.sh

In [None]:
import os

import numpy as np

from math import ceil
from random import shuffle

import re
import string

In [None]:
from sklearn.model_selection import train_test_split

In [8]:
import keras
from keras.models import Model, Sequential
from keras.layers import *
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.constraints import max_norm

import keras.backend as K

Using TensorFlow backend.


# Downoad and prepare data

In [None]:
MAX_LEN = 32

In [10]:
from datasets import *

def extractText(line, fast_preprocessing=True):
    tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')
    
    line = tag_re.sub('', line)
    line = line.replace('\n', '')
    line = re.sub('\s+', ' ', line)
    line = line.strip()
    
    return line

def splitConversations(conversations, max_len=20, fast_preprocessing=True):
    data = []
    for i, conversation in enumerate(tqdm(conversations)):
        lines = conversation['lines']
        for i in range(len(lines) - 1):
            request = extractText(lines[i]['text'])
            reply = extractText(lines[i + 1]['text'])
            if 0 < len(request) <= max_len and 0 < len(reply) <= max_len:
                data += [(request, reply)]
    return data


def readCornellData(path, max_len=20, fast_preprocessing=True):
    dataset = CornellData(path)
    conversations = dataset.getConversations()
    return splitConversations(conversations, max_len=max_len, fast_preprocessing=fast_preprocessing)

data = readCornellData('data/cornell', max_len=MAX_LEN)

100%|██████████| 83097/83097 [00:02<00:00, 28369.76it/s]


In [None]:
ALPHABET = set()
for c, r in data:
    ALPHABET.update(c)
    ALPHABET.update(r)
  
ALPHABET = sorted(ALPHABET)

START_SYMBOL = 'START'
END_SYMBOL = 'END'
PAD_SYMBOL = 'PAD'
SPECIAL_CHARACHTERS = [PAD_SYMBOL, START_SYMBOL, END_SYMBOL]

char2id = {c:i for i, c in enumerate(SPECIAL_CHARACHTERS + ALPHABET)}
id2char = {i:c for i, c in enumerate(SPECIAL_CHARACHTERS + ALPHABET)}

In [12]:
print('Volume of data:', len(data))

Volume of data: 51539


In [None]:
# sentences

# Utils

In [None]:
def text2seq(text, char2id, max_len):
    start = [char2id[START_SYMBOL]]
    chars_ids = [char2id[text[i]] for i in range(min(max_len - 2, len(text)))]
    end = [char2id[END_SYMBOL]]
    padding = [char2id[PAD_SYMBOL]] * max(0, max_len - len(text) - 2) 

    return start + chars_ids + end + padding

def seq2text(seq, id2char, remove_special=True):
    text = ''.join(map(id2char.get, seq))

    if remove_special:
        for spc in SPECIAL_CHARACHTERS:
            text = text.replace(spc, ' ')

    text = re.sub(r'\s+', ' ', text).strip()
  
    return text

In [None]:
text = 'Wubba lubba dub-dub!'
assert(seq2text(text2seq(text, char2id, MAX_LEN), id2char) == text)
assert(seq2text(text2seq(text, char2id, len(text)), id2char) == text[:-2])

In [None]:
def baseline_generator(data, batch_size):
    n_steps = ceil(len(data) / batch_size)
    while True:
        shuffle(data)

        for i in range(n_steps):
            contexts, repsponses = zip(*data[i*batch_size:(i+1)*batch_size])

            c_batch = np.array([text2seq(c, char2id, MAX_LEN) for c in contexts])
            c_batch_shifted = np.expand_dims(np.hstack([c_batch[:, 1:], np.full((c_batch.shape[0], 1), char2id[PAD_SYMBOL])]), axis=-1) 

            r_batch = np.array([text2seq(r, char2id, MAX_LEN) for r in repsponses])
            r_batch_shifted = np.expand_dims(np.hstack([r_batch[:, 1:], np.full((r_batch.shape[0], 1), char2id[PAD_SYMBOL])]), axis=-1)

            yield ([c_batch, r_batch], [c_batch_shifted, r_batch_shifted])

In [None]:
# batch_size_ = 128
# (context, response), (context_shifted, response_shifted) = next(baseline_generator(data, batch_size_))

In [None]:
def get_masked_loss(mask_value):
    mask_value = K.variable(mask_value)
    def masked_categorical_crossentropy(y_true, y_pred):
        mask = K.all(K.equal(y_true, mask_value), axis=-1)
        mask = 1 - K.cast(mask, K.floatx())
        loss = K.sparse_categorical_crossentropy(y_true, y_pred) * mask
        return K.sum(loss) / K.sum(mask)
    return masked_categorical_crossentropy

In [None]:
def GCA_response(encoder, decoder, context, max_steps=MAX_LEN):
  
    rnn_state = [np.zeros((1, LATENT_DIM))] * len(encoder.outputs)

    context = np.array(text2seq(context, char2id, MAX_LEN)).reshape(1, -1)
    rnn_state = encoder.predict([context] + rnn_state)
    if not isinstance(rnn_state, list):
        rnn_state = [rnn_state]
    
    utterance_partial = np.full((1, MAX_LEN), char2id[PAD_SYMBOL])
    utterance_partial[0, 0] = char2id[START_SYMBOL]
    
    utterance = []
    
    for i in range(1, min(max_steps, MAX_LEN)):
        output_tokens, *rnn_state = decoder_model.predict([utterance_partial] + rnn_state)
        
        sampled_token_index = np.argmax(output_tokens[0, 0])
        if sampled_token_index == char2id[END_SYMBOL]: break
          
        utterance.append(sampled_token_index)
        utterance_partial[0, 0] = sampled_token_index
            
    text = seq2text(utterance, id2char, remove_special=False)

    return text

# Baseline

In [None]:
BASELINE_WEIGHTS_PATH = '/content/gdrive/My Drive/Colab Notebooks/coursera/advanced_machine_learning/nlp/baseline_GCA_char_cornell.h5py'

LATENT_DIM = 256 + 128
BATCH_SIZE = 1024
VOCAB_SIZE = len(char2id)
EMBEDDINGS_DIM = 16

Shared layers

In [None]:
inp_context = Input(shape=(MAX_LEN,), dtype='int32', name='input_context')
inp_reply = Input(shape=(MAX_LEN,), dtype='int32', name='input_utterance')

embeddings = Embedding(output_dim=EMBEDDINGS_DIM, 
                       input_dim=VOCAB_SIZE, 
                       mask_zero=True, 
                       name='char_embeddings')

encoder_input = embeddings(inp_context)
decoder_input = embeddings(inp_reply)

Bot layers

In [None]:
rnn_layer = GRU(LATENT_DIM, 
                  return_sequences=True,
                  return_state=True,
                  name='sequence_modeller')

output_dense = Dense(VOCAB_SIZE, activation='softmax', name='output')

encoder_outputs, *encoder_states = rnn_layer(encoder_input)
encoder_outputs = output_dense(encoder_outputs)
encoder_outputs = Lambda(lambda x: x, name='context_modelling')(encoder_outputs)

decoder_outputs, *decoder_states = rnn_layer(decoder_input, initial_state=encoder_states)
decoder_outputs = output_dense(decoder_outputs)
decoder_outputs = Lambda(lambda x: x, name='reply_modelling')(decoder_outputs)

In [23]:
bot_model = Model([inp_context, 
                   inp_reply], 
                  [encoder_outputs, 
                   decoder_outputs], name='generative_conversational_agent')
bot_model.compile(optimizer=Adam(decay=1e-6, clipnorm=1.), 
                  loss=get_masked_loss(char2id[PAD_SYMBOL]))
bot_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_context (InputLayer)      (None, 32)           0                                            
__________________________________________________________________________________________________
char_embeddings (Embedding)     (None, 32, 16)       1776        input_context[0][0]              
                                                                 input_utterance[0][0]            
__________________________________________________________________________________________________
input_utterance (InputLayer)    (None, 32)           0                                            
__________________________________________________________________________________________________
sequence_modeller (GRU)         [(None, 32, 384), (N 461952      char_embeddings[0][0]            
          

In [None]:
if isinstance(rnn_layer, LSTM):
    encoder_state_input_h = Input(shape=(LATENT_DIM,))
    encoder_state_input_c = Input(shape=(LATENT_DIM,))
    encoder_states_inputs = [encoder_state_input_h, encoder_state_input_c]

    _, *encoder_states = rnn_layer(encoder_input, initial_state=encoder_states_inputs)
    # encoder_states = [state_h, state_c]

    encoder_model = Model([inp_context] + encoder_states_inputs, encoder_states, name='bot_encoder')
  
elif isinstance(rnn_layer, GRU):
    encoder_state_input = Input(shape=(LATENT_DIM,))
    _, encoder_state = rnn_layer(encoder_input, initial_state=[encoder_state_input])

    encoder_model = Model([inp_context, encoder_state_input], encoder_state, name='bot_encoder')
  
else:
    raise NotImplementedError("Use LSTM or GRU.")

In [None]:
if isinstance(rnn_layer, LSTM):
    decoder_state_input_h = Input(shape=(LATENT_DIM,))
    decoder_state_input_c = Input(shape=(LATENT_DIM,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

    decoder_outputs, *decoder_states = rnn_layer(decoder_input, initial_state=decoder_states_inputs)
    decoder_outputs = output_dense(decoder_outputs)

    decoder_model = Model([inp_reply] + decoder_states_inputs, [decoder_outputs] + decoder_states, name='bot_decoder')
  
elif isinstance(rnn_layer, GRU):
    decoder_state_input = Input(shape=(LATENT_DIM,))

    decoder_outputs, decoder_state = rnn_layer(decoder_input, initial_state=[decoder_state_input])
    decoder_outputs = output_dense(decoder_outputs)

    decoder_model = Model([inp_reply, decoder_state_input], [decoder_outputs, decoder_state], name='bot_decoder')
  
else:
    raise NotImplementedError("Use LSTM or GRU.")

In [None]:
if os.path.isfile(BASELINE_WEIGHTS_PATH):
    bot_model.load_weights(BASELINE_WEIGHTS_PATH)

Training

In [None]:
class SamplingCallback(keras.callbacks.Callback):
    def __init__(self, data, n_samples, name=''):
        self.data = data
        self.n_samples = n_samples
        self.name = name
      
    @staticmethod
    def print_sample(encoder, decoder, data, max_steps=MAX_LEN, decode_fn=GCA_response):
        context, response = data[np.random.randint(len(data))]
        pred_text = decode_fn(encoder_model, 
                              decoder_model, 
                              context, 
                              max_steps=max_steps)

        print('CONTEXT:', context)
        print('GT:', response)
        print('PRED:', pred_text)

    def on_epoch_end(self, epoch, logs={}):
        print(self.name)
        for i in range(self.n_samples):
            self.__class__.print_sample(encoder_model, decoder_model, self.data)
            print()

In [28]:
SamplingCallback.print_sample(encoder_model, decoder_model, data)

CONTEXT: Who's that?
GT: It's me - Lothar. Are you okay?
PRED: What do you mean?


In [None]:
data_generator = baseline_generator(data, BATCH_SIZE)

In [None]:
bot_model.fit_generator(data_generator, steps_per_epoch=10 * len(data) // BATCH_SIZE,
                           epochs=128, verbose=1, 
                           callbacks=[SamplingCallback(data, 3, 'Sample check'), 
                                      ModelCheckpoint(BASELINE_WEIGHTS_PATH, monitor='loss', save_best_only=True, save_weights_only=True)])

# Inference test

In [None]:
if os.path.isfile(BASELINE_WEIGHTS_PATH):
    bot_model.load_weights(BASELINE_WEIGHTS_PATH)

In [39]:
context = ''
while context != 'exit':
    context = input('You:')
    response = GCA_response(encoder_model, decoder_model, context)
    print('Bot:', response)

You:Hello!
Bot: Hello, sweetheart.
You:How are you?
Bot: Fine. You like it.
You:What is your name?
Bot: My name is Miles.
You:How old are you?
Bot: Fine.
You:How tall are you?
Bot: Fine.
You:Do you like me?
Bot: Yes, you do recall.
You:Did you enjoy your training schedule?
Bot: Want to take a dollar wearon?
You:I will consider it as positive answer
Bot: Accident.
You:exit
Bot: Please.
