In [7]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
import sys
import re

## What we need
* Data
    * Read the raw data
    * Prepare it for processing (put it in couples (enc-dec) and clean the symbols)
* Model
    * Create the tokenizer
    * Create the embedding layer
    * Add the BiLSTM of the encoder
    * Create the attention layer
    * Create the Decoder (attention and LSTM?)
* Create the inference notebook cell

## Read the data

In [8]:
data_path_human = "../data/rDany/human_text.txt"
data_path_robot = "../data/rDany/robot_text.txt"

with open(data_path_human, "r") as f:
    human_lines = f.read().split("\n")
    
with open(data_path_robot, "r") as f:
    robot_lines = f.read().split("\n")
print(human_lines[1])
print(robot_lines[1])

oh, thanks !  i'm fine. this is an evening in my timezone
😄 here is afternoon ! 


In [9]:
human_lines = [re.sub(r"\[\w+\]",'hi',line) for line in human_lines]
human_lines = [" ".join(re.findall(r"\w+",line)) for line in human_lines]
robot_lines = [re.sub(r"\[\w+\]",'',line) for line in robot_lines]
robot_lines = [" ".join(re.findall(r"\w+",line)) for line in robot_lines]
# grouping lines by response pair
pairs = list(zip(human_lines,robot_lines))
#random.shuffle(pairs)
len(pairs)

2363

# Ignore

In [10]:
import numpy as np

input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

for line in pairs:
    input_doc, target_doc = line[0], line[1]
    # Appending each input sentence to input_docs
    input_docs.append(input_doc)
    # Splitting words from punctuation  
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # Redefine target_doc below and append it to target_docs
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
  
    # Now we split up each sentence into words and add each unique word to our vocabulary set
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)


# Ignore

In [11]:
input_features_dict = dict(
    [(token, i) for i, token in enumerate(input_tokens)])
target_features_dict = dict(
    [(token, i) for i, token in enumerate(target_tokens)])
reverse_input_features_dict = dict(
    (i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict(
    (i, token) for token, i in target_features_dict.items())

# Ignore

In [12]:
#Maximum length of sentences in input and target documents
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])

In [45]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
tokenizer.fit_on_texts(input_docs + target_docs)

In [53]:
input_docs_tokens = tokenizer.texts_to_sequences(input_docs)
target_docs_tokens = tokenizer.texts_to_sequences(target_docs)

In [54]:
input_docs_tokens = pad_sequences(input_docs_tokens, maxlen=max_encoder_seq_length, dtype='int32', padding='pre',
    truncating='pre', value=0.0)

target_docs_tokens = pad_sequences(target_docs_tokens, maxlen=max_decoder_seq_length, dtype='int32', padding='pre', 
    truncating='pre', value=0.0)

In [14]:
vocabulary_size = 10000
sentence_size = max(max_encoder_seq_length, max_decoder_seq_length)

text_vectorization_layer = TextVectorization(max_tokens=vocabulary_size, output_sequence_length=sentence_size)
text_vectorization_layer.adapt(human_lines + robot_lines)

In [15]:
with open("../data/glove.twitter.27B.50d.txt", "r") as f:
    dict_w2v = {}
    problems = []
    
    for line in tqdm(f):
        
        tokens = line.split()
        
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            problems.append({word: vector})

1193514it [00:26, 45567.12it/s]


In [32]:
num_tokens = len(text_vectorization_layer.get_vocabulary())
embedding_dim = 50
hits = 0
misses = 0
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token in tqdm(text_vectorization_layer.get_vocabulary()):
    
    embedding = dict_w2v.get(token)
    
    if embedding is not None:
        embedding_matrix[text_vectorization_layer([token]).numpy()[0, 0]] = embedding
        hits += 1
    else:
        misses += 1
embedding_matrix[text_vectorization_layer(["[UNK]"])[0, 0]] = np.random.rand(embedding_dim)

100%|██████████| 5055/5055 [00:12<00:00, 399.86it/s]


In [33]:
print(f"Hits: {hits}")
print(f"Missed: {misses}")

Hits: 4519
Missed: 536


In [76]:
class Encoder(tf.keras.Model):
    
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz, text_vectorization_layer, embedding_matrix=None):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.text_vectorization_layer = text_vectorization_layer
        
        
        if embedding_matrix is None:
            self.embedding = tf.keras.layers.Embedding(
                vocab_size,
                embedding_dim,
                tf.keras.initializers.Constant(embedding_matrix),
                trainable=True)
        else:
            self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
            
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.text_vectorization_layer(x)
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))


In [77]:
embedding_matrix[text_vectorization_layer(["[UNK]"])[0, 0]].shape

(50,)

In [78]:
BUFFER_SIZE = len(pairs)
BATCH_SIZE = 64
steps_per_epoch = len(pairs)//BATCH_SIZE
embedding_dim = 50
units = 1024
vocab_size = text_vectorization_layer.get_vocabulary()


In [79]:
dataset = tf.data.Dataset.from_tensor_slices(pairs)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [80]:
example_input_batch, example_target_batch = tf.split(next(iter(dataset)), num_or_size_splits=2, axis=1)
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 1]), TensorShape([64, 1]))

In [81]:
encoder = Encoder(len(vocab_size), embedding_dim, units, BATCH_SIZE, embedding_matrix=embedding_matrix, text_vectorization_layer=text_vectorization_layer)

# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 258, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [82]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        # query hidden state shape == (batch_size, hidden size)
        # query_with_time_axis shape == (batch_size, 1, hidden size)
        # values shape == (batch_size, max_len, hidden size)
        # we are doing this to broadcast addition along the time axis to calculate the score
        query_with_time_axis = tf.expand_dims(query, 1)

        # score shape == (batch_size, max_length, 1)
        # we get 1 at the last axis because we are applying score to self.V
        # the shape of the tensor before applying self.V is (batch_size, max_length, units)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))

        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [83]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print("Attention result shape: (batch size, units) {}".format(attention_result.shape))
print("Attention weights shape: (batch_size, sequence_length, 1) {}".format(attention_weights.shape))

Attention result shape: (batch size, units) (64, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (64, 258, 1)


In [86]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, text_vectorization_layer, embedding_matrix=None):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.text_vectorization_layer = text_vectorization_layer
        
        
        if embedding_matrix is None:
            self.embedding = tf.keras.layers.Embedding(
                vocab_size,
                embedding_dim,
                tf.keras.initializers.Constant(embedding_matrix),
                trainable=True)
        else:
            self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
            
        
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        # enc_output shape == (batch_size, max_length, hidden_size)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # passing the concatenated vector to the GRU
        output, state = self.gru(x)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights

In [87]:
decoder = Decoder(len(vocab_size), embedding_dim, units, BATCH_SIZE, text_vectorization_layer, embedding_matrix)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))

Decoder output shape: (batch_size, vocab size) (64, 5055)


In [88]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [89]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [123]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)

        dec_hidden = enc_hidden

        #dec_input = tf.expand_dims(text_vectorization_layer(["<start>"])[0, 0] * BATCH_SIZE, 1)

        # Teacher forcing - feeding the target as the next input
        for t in range(1, targ.shape[1]):
            # passing enc_output to the decoder
            predictions, dec_hidden, _ = decoder(targ, dec_hidden, enc_output)

            loss += loss_function(targ[:, t], predictions)

            # using teacher forcing
            dec_input = tf.expand_dims(targ[:, t], 1)

    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables

    gradients = tape.gradient(loss, variables)

    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [124]:
import time

In [125]:
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, tensor) in enumerate(dataset.take(steps_per_epoch)):
        inp, targ = tf.split(tensor, num_or_size_splits=2, axis=1)
        
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

TypeError: in user code:

    <ipython-input-123-bebf71d53524>:26 train_step  *
        gradients = tape.gradient(loss, variables)
    /usr/local/lib/python3.8/site-packages/tensorflow/python/eager/backprop.py:1034 gradient  **
        if not backprop_util.IsTrainable(t):
    /usr/local/lib/python3.8/site-packages/tensorflow/python/eager/backprop_util.py:30 IsTrainable
        dtype = dtypes.as_dtype(dtype)
    /usr/local/lib/python3.8/site-packages/tensorflow/python/framework/dtypes.py:649 as_dtype
        raise TypeError("Cannot convert value %r to a TensorFlow DType." %

    TypeError: Cannot convert value 0 to a TensorFlow DType.
