In [37]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from tqdm import tqdm
#import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
import time
import os
import sys
import re

In [38]:
data_path_human = "../data/human_text.txt"
data_path_robot = "../data/robot_text.txt"

with open(data_path_human, "r") as f:
    human_lines = f.read().split("\n")
    
with open(data_path_robot, "r") as f:
    robot_lines = f.read().split("\n")

In [39]:
human_lines = [re.sub(r"\[\w+\]",'hi',line) for line in human_lines]
human_lines = [" ".join(re.findall(r"\w+",line)) for line in human_lines]
robot_lines = [re.sub(r"\[\w+\]",'',line) for line in robot_lines]
robot_lines = [" ".join(re.findall(r"\w+",line)) for line in robot_lines]
# grouping lines by response pair
pairs = list(zip(human_lines,robot_lines))
#random.shuffle(pairs)
len(pairs)

2184

In [40]:
input_docs = []
target_docs = []
input_tokens = set()
target_tokens = set()

for line in pairs:
    input_doc, target_doc = line[0], line[1]
    # Appending each input sentence to input_docs
    input_docs.append(input_doc)
    # Splitting words from punctuation  
    target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
    # Redefine target_doc below and append it to target_docs
    target_doc = '<START> ' + target_doc + ' <END>'
    target_docs.append(target_doc)
  
    # Now we split up each sentence into words and add each unique word to our vocabulary set
    for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
        if token not in input_tokens:
            input_tokens.add(token)
    for token in target_doc.split():
        if token not in target_tokens:
            target_tokens.add(token)
            
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))

num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)

num_tokens = len(set(input_tokens + target_tokens)) + 2 # [UNK]
pairs = list(zip(input_docs, target_docs))

In [41]:
# replace out-of-vocabulary words with "<unk>" token
input_tokenizer = Tokenizer(filters='', oov_token="<unk>")
target_tokenizer = Tokenizer(filters='', oov_token="<unk>")

# create internal vocabulary
input_tokenizer.fit_on_texts(input_docs)
target_tokenizer.fit_on_texts(target_docs)

# creates sequences of integers out of the given input texts
X = input_tokenizer.texts_to_sequences(input_docs)
Y = target_tokenizer.texts_to_sequences(target_docs)

X = tf.keras.preprocessing.sequence.pad_sequences(X, padding='post')
Y = tf.keras.preprocessing.sequence.pad_sequences(Y, padding='post')

In [42]:
print(X.shape)
print(Y.shape)

(2184, 258)
(2184, 149)


In [43]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)

BATCH_SIZE = 64
BUFFER_SIZE = len(X_train)
steps_per_epoch = BUFFER_SIZE // BATCH_SIZE

In [44]:
def max_len(sentence):
    return max(len(s) for s in sentence)

max_length_input = max_len(X)
max_length_output = max_len(Y)

input_vocab_size = len(input_tokenizer.word_index) + 1  
target_vocab_size = len(target_tokenizer.word_index) + 1

In [45]:
dataset = tf.data.Dataset.from_tensor_slices((X_train, Y_train)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE, 
                                                                                            drop_remainder=True) 

## Transformer Model

Pointwise FFN

In [46]:
def pointwise_ffn(embedding_dims, expanded_dims):
    return tf.keras.Sequential([
           tf.keras.layers.Dense(expanded_dims, activation='relu'),  
           tf.keras.layers.Dense(embedding_dims)  
    ])

Masking

In [47]:
# padding tokens are explicitly ignored by masking 
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)

    return seq[:, tf.newaxis, tf.newaxis, :] 

In [48]:
x = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
x.shape[-1]

5

In [49]:
# in the decoder we want to mask out all of the future tokens
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask 

In [50]:
def create_masks(input_data, target):
    # Encoder padding mask
    encoder_padding_mask = create_padding_mask(input_data)
  
    # Used in the 2nd attention block in the decoder.
    # This padding mask is used to mask the encoder outputs.
    decoder_padding_mask = create_padding_mask(input_data)
  
    # Used in the 1st attention block in the decoder.
    # It is used to pad and mask future tokens in the input received by 
    # the decoder.
    look_ahead_mask = create_look_ahead_mask(tf.shape(target)[1])
    decoder_target_padding_mask = create_padding_mask(target)
    combined_mask = tf.maximum(decoder_target_padding_mask, look_ahead_mask)
  
    return encoder_padding_mask, decoder_padding_mask, combined_mask

Self-Attention

In [51]:
def calculate_self_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  
    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    
    # scaling it by the sqrt of the last dimension of k
    scaled_scores = matmul_qk / tf.math.sqrt(dk)

    # add the mask to the scaled tensor.
    if mask is not None:
        scaled_scores += (mask * -1e9)  

    # softmax is normalized on the last axis (len_k) so that the scores
    # add up to 1.
    attention_weights = tf.nn.softmax(scaled_scores, axis=-1)  

    output = tf.matmul(attention_weights, v)  

    return output

Multi-Head Attention

In [52]:
num_heads = 8
embedding_dims = 128

In [53]:
class MultiHeadAttention(tf.keras.layers.Layer):
    
    def __init__(self, embedding_dims, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.embedding_dims = embedding_dims
        self.depth = embedding_dims // num_heads
    
        self.wq = tf.keras.layers.Dense(embedding_dims)
        self.wk = tf.keras.layers.Dense(embedding_dims)
        self.wv = tf.keras.layers.Dense(embedding_dims)
    
        self.dense = tf.keras.layers.Dense(embedding_dims)
        
    def split_heads(self, x, batch_size):
        """Split the last dimension into (num_heads, depth).
        Transpose the result such that the shape is (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
    
    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
    
        q = self.wq(q)  # (batch_size, seq_len, embedding_dims)
        k = self.wk(k)  
        v = self.wv(v)  
    
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len, depth)
        k = self.split_heads(k, batch_size)  
        v = self.split_heads(v, batch_size)  
    
        # self_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        self_attention = calculate_self_attention(q, k, v, mask)
    
        self_attention = tf.transpose(self_attention, perm=[0, 2, 1, 3])  # (batch_size, seq_len, num_heads, depth)

        concat_attention = tf.reshape(self_attention, 
                                     (batch_size, -1, self.embedding_dims))  # (batch_size, seq_len, embedding_dims)

        output = self.dense(concat_attention)
        
        return output

Encoder Layer

In [54]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self, embedding_dims, num_heads, expanding_dims, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(embedding_dims, num_heads)
    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.ffn = pointwise_ffn(embedding_dims, expanding_dims)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x, training, padding_mask):

    attn_output = self.mha(x, x, x, padding_mask)  
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(x + attn_output)  
    
    ffn_output = self.ffn(out1) 
    ffn_output = self.dropout2(ffn_output, training=training)
    out2 = self.layernorm2(out1 + ffn_output)   # (batch_size, input_seq_len, embedding_dims)
    
    return out2

Decoder Layer

In [55]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, embedding_dims, num_heads, expanded_dims, rate=0.1):
        super().__init__()

        self.mha1 = MultiHeadAttention(embedding_dims, num_heads)
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.mha2 = MultiHeadAttention(embedding_dims, num_heads)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.ffn = pointwise_ffn(embedding_dims, expanded_dims)
        self.dropout3 = tf.keras.layers.Dropout(rate)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6) 
    
    def call(self, x, encoder_output, training, padding_mask, look_ahead_mask):

        attn1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2 = self.mha2( encoder_output, encoder_output, out1, padding_mask)  
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)  

        ffn_output = self.ffn(out2)  
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)  # (batch_size, target_seq_len, embedding_dims)

        return out3

Positional Encoding

In [56]:
def get_angles(pos, i, d_model):
    angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
    return pos * angle_rates

In [57]:
def positional_encoding(position, dimensions):
    angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                            np.arange(dimensions)[np.newaxis, :],
                            dimensions)
  
    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])
  
    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])
    
    pos_encoding = angle_rads[np.newaxis, ...]
    
    return tf.cast(pos_encoding, dtype=tf.float32)

Encoder

In [58]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, embedding_dims, num_heads, expanded_dims, input_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super().__init__()

        self.embedding_dims = embedding_dims
        self.num_layers = num_layers
    
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, embedding_dims)
        self.pos_encoding = positional_encoding(maximum_position_encoding, 
                                                embedding_dims)
    
        self.dropout = tf.keras.layers.Dropout(rate)

        self.encoder_layers = [EncoderLayer(embedding_dims, num_heads, expanded_dims, rate) 
                              for i in range(num_layers)]
  
    
        
    def call(self, x, training, padding_mask):

        seq_len = tf.shape(x)[1]

        x = self.embedding(x)  
        x *= tf.math.sqrt(tf.cast(self.embedding_dims, tf.float32)) # Technicality that  is used in the original paper
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.encoder_layers[i](x, training, padding_mask)
    
        return x  # (batch_size, input_seq_len, embedding_dims)

Decoder

In [59]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, embedding_dims, num_heads, expanded_dims, target_vocab_size,
                 maximum_position_encoding, rate=0.1):
        super().__init__()

        self.embedding_dims = embedding_dims
        self.num_layers = num_layers
    
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, embedding_dims)
        self.pos_encoding = positional_encoding(maximum_position_encoding, embedding_dims)
    
        self.decoder_layers = [DecoderLayer(embedding_dims, num_heads, expanded_dims, rate) 
                       for i in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, encoder_output, training, 
             padding_mask, look_ahead_mask):

        seq_len = tf.shape(x)[1]
    
        x = self.embedding(x)  
        x *= tf.math.sqrt(tf.cast(self.embedding_dims, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
    
        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.decoder_layers[i](x, encoder_output, training,
                                       padding_mask, look_ahead_mask)
  
        return x  # (batch_size, target_seq_len, d_model)

Transformer Model

In [60]:
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, embedding_dims, num_heads, expanded_dims, input_vocab_size, 
                 target_vocab_size, pe_input, pe_target, rate=0.1):
        super().__init__()

        self.encoder = Encoder(num_layers, embedding_dims, num_heads, expanded_dims, 
                               input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, embedding_dims, num_heads, expanded_dims, 
                               target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)
    
    def call(self, input, target, training, encoder_padding_mask, 
             decoder_padding_mask, look_ahead_mask):

        encoder_output = self.encoder(input, training, encoder_padding_mask)  # (batch_size, inp_seq_len, embedding_dims)
    
    
        dec_output = self.decoder(target, encoder_output, training, decoder_padding_mask, look_ahead_mask)
        # (batch_size, target_seq_len, embedding_dims)

        final_output = self.final_layer(dec_output)  # (batch_size, target_seq_len, target_vocab_size)
    
        return final_output

In [61]:
num_layers = 4
expanded_dims = 512

transformer = Transformer(num_layers, embedding_dims, num_heads, expanded_dims,
                          input_vocab_size, target_vocab_size, 
                          pe_input=input_vocab_size, 
                          pe_target=target_vocab_size)

In [62]:
def loss_function(real, pred):
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    mask = tf.cast(mask, dtype=loss.dtype)
    loss = loss * mask
    loss = tf.reduce_mean(loss)
    return loss

In [63]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, embedding_dims, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
    
        self.embedding_dims = embedding_dims
        self.embedding_dims = tf.cast(self.embedding_dims, tf.float32)

        self.warmup_steps = warmup_steps
    
    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
    
        return tf.math.rsqrt(self.embedding_dims) * tf.math.minimum(arg1, arg2)

In [64]:
learning_rate = CustomSchedule(embedding_dims)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, 
                                     epsilon=1e-9)

In [65]:
EPOCHS = 1

for epoch in range(EPOCHS):
    start = time.time()
    epoch_loss = 0

    for (batch, (input, target)) in enumerate(dataset.take(steps_per_epoch)):
        decoder_input = target[ : , :-1 ] # ignore <end> token
        real = target[ : , 1: ]           # ignore <start> token
    
        enc_padding_mask, dec_padding_mask, combined_mask = create_masks(input, decoder_input)
    
        with tf.GradientTape() as tape:
            predictions = transformer(input=input, target=decoder_input, 
                                      training=True, 
                                      encoder_padding_mask=enc_padding_mask, 
                                      decoder_padding_mask=dec_padding_mask,
                                      look_ahead_mask=combined_mask)
            batch_loss = loss_function(real, predictions)


        gradients = tape.gradient(batch_loss, transformer.trainable_variables)    
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        epoch_loss += batch_loss  

        if batch % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(
                   epoch + 1, batch, batch_loss.numpy()))
    
    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, 
                                         epoch_loss / steps_per_epoch))

    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.5272
Epoch 1 Loss 0.5017
Time taken for 1 epoch: 319.62280106544495 secs

