In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
import time
import os
import sys
import re

## What we need
* Data
    * Read the raw data
    * Prepare it for processing (put it in couples (enc-dec) and clean the symbols)
* Model
    * Create the tokenizer
    * Create the embedding layer
    * Add the BiLSTM of the encoder
    * Create the attention layer
    * Create the Decoder (attention and LSTM?)
* Create the inference notebook cell

## Read the data

In [2]:
data_path_human = "../data/rDany/human_text.txt"
data_path_robot = "../data/rDany/robot_text.txt"

with open(data_path_human, "r") as f:
    human_lines = f.read().split("\n")
    
with open(data_path_robot, "r") as f:
    robot_lines = f.read().split("\n")
print(human_lines[1])
print(robot_lines[1])

oh, thanks !  i'm fine. this is an evening in my timezone
😄 here is afternoon ! 


In [3]:
human_lines = [re.sub(r"\[\w+\]",'hi',line) for line in human_lines]
human_lines = [" ".join(re.findall(r"\w+",line)) for line in human_lines]
robot_lines = [re.sub(r"\[\w+\]",'',line) for line in robot_lines]
robot_lines = [" ".join(re.findall(r"\w+",line)) for line in robot_lines]
# grouping lines by response pair
pairs = list(zip(human_lines,robot_lines))
#random.shuffle(pairs)
len(pairs)

2363

In [4]:
input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

for line in pairs:
    input_doc, target_doc = line[0], line[1]
    # Appending each input sentence to input_docs
    input_docs.append(input_doc)
    # Splitting words from punctuation  
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # Redefine target_doc below and append it to target_docs
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
  
    # Now we split up each sentence into words and add each unique word to our vocabulary set
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
num_tokens = len(set(input_tokens + target_tokens)) + 2 # [UNK]
pairs = list(zip(input_docs, target_docs))

In [5]:
tokenizer = Tokenizer(filters='', oov_token="<unk>")
tokenizer.fit_on_texts(input_docs + target_docs)

X = tokenizer.texts_to_sequences(input_docs)
Y = tokenizer.texts_to_sequences(target_docs)

X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='pre')
Y = tf.keras.preprocessing.sequence.pad_sequences(Y, padding='pre')


In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

In [7]:
with open("../data/glove.twitter.27B.50d.txt", "r") as f:
    dict_w2v = {}
    problems = []
    
    for line in tqdm(f):
        
        tokens = line.split()
        
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            problems.append({word: vector})

1193514it [00:26, 44773.31it/s]


In [8]:
type(tokenizer.word_index)

dict

In [9]:
num_tokens = len(tokenizer.word_index)
embedding_dim = 50
hits = 0
misses = 0
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, _ in tqdm(tokenizer.word_index.items()):
    
    embedding = dict_w2v.get(token)
    
    if embedding is not None:
        embedding_matrix[tokenizer.word_index[token]] = embedding
        hits += 1
    else:
        misses += 1
embedding_matrix[tokenizer.word_index["<unk>"]] = np.random.rand(embedding_dim)
print(hits, misses)

100%|██████████| 5056/5056 [00:00<00:00, 201691.04it/s]

4519 537





In [10]:
print(f"Hits: {hits}")
print(f"Missed: {misses}")

Hits: 4519
Missed: 537


In [11]:
BUFFER_SIZE = len(X_train)
BATCH_SIZE = 64
steps_per_epoch = len(X_train)//BATCH_SIZE
embedding_dim = 50
units = 1024
vocab_size = len(tokenizer.word_index)

In [12]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, 
                                                                                            drop_remainder=True)

In [13]:
def max_len(sentence):
    return max(len(s) for s in sentence)

max_length_input = max_len(X_train)
max_length_output = max_len(Y_train)
vocab_size = len(tokenizer.word_index)

In [14]:
for example in dataset.take(1):
    example_x, example_y = example
    
print(example_x.shape) 
print(example_y.shape) 

(64, 258)
(64, 149)


In [15]:
class EncoderAttention(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dims, hidden_units):
        super().__init__()
        self.hidden_units = hidden_units
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dims, tf.keras.initializers.Constant(embedding_matrix),
                trainable=True)
        self.lstm_layer = tf.keras.layers.LSTM(hidden_units, return_sequences=True, 
                                                     return_state=True ) # We need the lstm outputs 
                                                                         # to calculate attention!
    
    def initialize_hidden_state(self): 
        return [tf.zeros((BATCH_SIZE, self.hidden_units)), 
                tf.zeros((BATCH_SIZE, self.hidden_units))] 
                                                               
    def call(self, inp, hidden_state):
        embedding = self.embedding_layer(inp)
        output, h_state, c_state = self.lstm_layer(embedding, initial_state = hidden_state)
        return output, h_state, c_state


encoder = EncoderAttention(len(tokenizer.word_index), embedding_dim, units)

In [16]:
# Test  the encoder
sample_initial_state = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_x, sample_initial_state)
print(sample_output.shape)
print(sample_h.shape)

(64, 258, 1024)
(64, 1024)


In [17]:

class DecoderAttention(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, hidden_units):
        super().__init__()
        
        
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim, tf.keras.initializers.Constant(embedding_matrix),
                trainable=True)

        self.lstm_cell = tf.keras.layers.LSTMCell(hidden_units)

        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        self.attention_mechanism = tfa.seq2seq.LuongAttention(hidden_units, memory_sequence_length=BATCH_SIZE*[len(X_train[0])]) #N

        self.attention_cell = tfa.seq2seq.AttentionWrapper(cell=self.lstm_cell, # N
                                      attention_mechanism=self.attention_mechanism, 
                                      attention_layer_size=hidden_units)

        self.output_layer = tf.keras.layers.Dense(vocab_size)
        self.decoder = tfa.seq2seq.BasicDecoder(self.attention_cell, # N
                                                sampler=self.sampler, 
                                                output_layer=self.output_layer)

    def build_initial_state(self, batch_size, encoder_state): #N
        decoder_initial_state = self.attention_cell.get_initial_state(batch_size=batch_size, dtype=tf.float32)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state


    def call(self, inputs, initial_state):
        embedding = self.embedding(inputs)
        outputs, _, _ = self.decoder(embedding, initial_state=initial_state, sequence_length=BATCH_SIZE*[len(Y_train[0])-1])
        return outputs

decoder = DecoderAttention(len(tokenizer.word_index), embedding_dim, units)

In [18]:
# Test the decoder
sample_y = tf.random.uniform((BATCH_SIZE, len(X_train)))
decoder.attention_mechanism.setup_memory(sample_output) # Attention needs the last output of the Encoder
                                                        # as starting point
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c]) # N


sample_decoder_output = decoder(example_y, initial_state)

print(sample_decoder_output.rnn_output.shape)

(64, 148, 5056)


In [19]:
sample_output.shape

TensorShape([64, 258, 1024])

In [20]:
optimizer = tf.keras.optimizers.Adam()

def loss_function(real, pred):
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  # mask and loss have to have the same Tensor type
    loss = mask * loss
    loss = tf.reduce_mean(loss) # you need one loss scalar number for the mini batch
    return loss 

In [27]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    encoder_hidden = encoder.initialize_hidden_state() # Every epoch we use a zero Tensor matrix
    epoch_loss = 0

    for (batch, (input, target)) in enumerate(dataset.take(steps_per_epoch)):
        with tf.GradientTape() as tape:
            # Pass the input through the encoder 
            encoder_output, encoder_h, encoder_c = encoder(input, encoder_hidden)
            decoder_input = target[ : , :-1 ] # Ignore <end> token
            real = target[ : , 1: ]         # ignore <start> token
            # The encoder output, encoder hidden state and the decoder input
            # is passed to the decoder
            decoder.attention_mechanism.setup_memory(encoder_output) # N
            decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [encoder_h, encoder_c]) # N
            decoder_output = decoder(decoder_input, decoder_initial_state) 
            logits = decoder_output.rnn_output
            batch_loss = loss_function(real, logits)

        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        epoch_loss += batch_loss

        if batch % 10 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      epoch_loss / steps_per_epoch))
    print('Time {:.4f} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.5324
Epoch 1 Batch 10 Loss 0.3682
Epoch 1 Batch 20 Loss 0.3273


InvalidArgumentError: indices[53,141] = 5056 is not in [0, 5056) [Op:ResourceGather]

In [22]:
import unicodedata
def preprocess_sentence(w):
    w = w.lower().strip()
    # This next line is confusing!
    # We normalize unicode data, umlauts will be converted to normal letters
    #w = w.replace("ß", "ss")
    #w = ''.join(c for c in unicodedata.normalize('NFD', w) if unicodedata.category(c) != 'Mn')

    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"\[\w+\]",'', w)
    w = " ".join(re.findall(r"\w+",w))
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)

    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!]+", " ", w)
    w = w.strip()

    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '<start> ' + w + ' <end>'
    return w

In [24]:
def reply(sentence, preprocess=True):
    if preprocess:
        sentence = preprocess_sentence(sentence)
        sentence_tokens = tokenizer.texts_to_sequences([sentence])
        input = tf.keras.preprocessing.sequence.pad_sequences(sentence_tokens, maxlen=max_length_input, padding='post')
    else:
        input = sentence
    input = tf.convert_to_tensor(input)

    encoder_hidden = [tf.zeros((1, units)), tf.zeros((1, units))]
    encoder_output, encoder_h, encoder_c = encoder(input, encoder_hidden)
    start_token = tf.convert_to_tensor([tokenizer.word_index['<start>']])
    end_token = tokenizer.word_index['<end>']

    # This time we use the greedy sampler because we want the word with the highest probability!
    # We are not generating new text, where a probability sampling would be better
    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate a BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.attention_cell, # N
                                                sampler=greedy_sampler, output_layer=decoder.output_layer)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(encoder_output) # N

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(batch_size=1, encoder_state=[encoder_h, encoder_c]) # N

    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_token, end_token= end_token, initial_state=decoder_initial_state)

    result_sequence  = outputs.sample_id.numpy()
    return tokenizer.sequences_to_texts(result_sequence)[0]

reply("Hi")

'<end>'

In [26]:
reply("Whats up!")

'<end>'

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()

In [None]:
print(text_vectorization_layer.get_vocabulary()[:12])
text_vectorization_layer(["<START>"])

In [None]:
decoder.embedding.variables[0].shape