In [1]:
import tensorflow as tf
from tensorflow.keras import Sequential,Model,layers,utils
import os
import re
import tensorflow_datasets as tfds
import numpy as np

In [2]:
path = utils.get_file('Cornell_Movie-Dialogs_Corpus.zip',origin= 'https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.zip',extract = True)
data_path = os.path.join(os.path.dirname(path),'cornell movie-dialogs corpus')

In [3]:
# function to remove unnecessary characters, and to put a space between each punctuation and word

def preprocess(text):
    text = text.lower().strip()
    text = re.sub(r"([,.?!])",r" \1 ",text)
    text = re.sub(r"[' ']+",r" ",text)
    text = re.sub(r"[^,.?!a-zA-Z]",r" ",text)
    text = text.strip()
    return text

In [4]:
with open(os.path.join(data_path,'movie_lines.txt'),errors = 'ignore') as file:
    lines = file.readlines()
for i in range(3):
    print(lines[i])
    
with open(os.path.join(data_path,'movie_conversations.txt')) as file:
    lines = file.readlines()
for i in range(3):
    print(lines[i])

L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!

L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!

L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']

u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', 'L201', 'L202', 'L203']



In [5]:
# load the questions and answers into staggered sets of lists/sequences
# the answer to the previous question is the the question to the next sequence

def load_lines():
    
    with open(os.path.join(data_path,'movie_lines.txt'),errors = 'ignore') as file:
        lines = file.readlines()
        
    IDs = {}
    for line in lines:
        lineID = line.split(' +++$+++ ')[0]
        line = line.split(' +++$+++ ')[-1]
        IDs[lineID] = line
        
    with open(os.path.join(data_path,'movie_conversations.txt')) as file:
        lines = file.readlines()

    questions,answers = [],[]
    for line in lines:
        line_ids = line.split(' +++$+++ ')[-1]
        line_ids = re.findall(r"'(L\d+)'",line_ids)
        
        for i in range(len(line_ids)-1):
            questions.append(preprocess(IDs[line_ids[i]]))
            answers.append(preprocess(IDs[line_ids[i+1]]))
    
    return questions,answers

In [6]:
questions,answers = load_lines()

In [7]:
sample = np.random.choice(len(questions))
print(questions[sample])
print(answers[sample])

i like you , joey . you ask all the right questions . there is something we can do but it will require great courage .
i don t know . . .


In [8]:
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(questions+answers, target_vocab_size = 2**13)
start_token,end_token = [tokenizer.vocab_size],[tokenizer.vocab_size+1]
VocabSize = tokenizer.vocab_size+2

In [9]:
def tokenize(questions,answers):
    tokenized_questions,tokenized_answers = [],[]
    
    global max_seq_len
    max_seq_len = 40
    
    for (q,a) in zip(questions,answers):
        encoded_q = tokenizer.encode(q)
        encoded_a = tokenizer.encode(a)
        if len(encoded_q)+2<=max_seq_len and len(encoded_a)+2<=max_seq_len:
        
            tokenized_questions.append(start_token + encoded_q + end_token)
            tokenized_answers.append(start_token + encoded_a + end_token)
        
    tokenized_questions = tf.keras.preprocessing.sequence.pad_sequences(tokenized_questions,maxlen=max_seq_len,padding = 'post')
    tokenized_answers = tf.keras.preprocessing.sequence.pad_sequences(tokenized_answers,maxlen=max_seq_len,padding = 'post')
    
    return tokenized_questions,tokenized_answers

In [10]:
tokenized_questions,tokenized_answers = tokenize(questions,answers)

In [11]:
buffer = len(tokenized_questions)
print(buffer)
print(VocabSize)
print(max_seq_len)

193686
8158
40


In [12]:
buffersize = 20000
batchsize = 64

dataset = tf.data.Dataset.from_tensor_slices((tokenized_questions,tokenized_answers))

dataset = dataset.cache()
dataset = dataset.shuffle(buffersize)
dataset = dataset.batch(batchsize,drop_remainder = True)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

In [13]:
class AttentionLayer(layers.Layer):
    
    def __init__(self,d,n_heads):
        super(AttentionLayer,self).__init__()
        
        self.ValueLayer = layers.Dense(d)
        self.KeyLayer = layers.Dense(d)
        self.QueryLayer = layers.Dense(d)
        self.d = d
        self.n = n_heads
        self.layer_output = layers.Dense(d)
        
        assert d%n_heads == 0
        self.depth = d//n_heads
        
    def multiheads(self,input_tensor):
        
        # input dim = (batchsize,seq_len,d)
        multiheaded_tensor = tf.reshape(input_tensor,shape=(batchsize,-1,self.n,self.depth))
        multiheaded_tensor = tf.transpose(multiheaded_tensor,perm = [0,2,1,3])
        # output dim = (batch_size,n_heads,seq_len,depth)
        
        return multiheaded_tensor
                
    def attention(self,V,K,Q,mask):
        
        # input dim = (batchsize,n_heads,seq_len,depth)
        QdotK = tf.matmul(Q,K,transpose_b=True)*(self.depth)**-0.5
        # QdotK dim = (batchsize,n_heads,seq_lenQ,seq_lenk)
        
        if not (mask==None):
            QdotK += mask*-1e9
        
        attention_weights = tf.nn.softmax(QdotK,axis = -1)
        
        context_vector = tf.matmul(attention_weights,V)
        # output dim = (batchsize,n_heads,seq_lenQ,depth)
        
        return context_vector
    
    def call(self,value,key,query,mask):
        
        V = self.multiheads(self.ValueLayer(value))
        K = self.multiheads(self.KeyLayer(key))
        Q = self.multiheads(self.QueryLayer(query))
        
        context_vector = self.attention(V,K,Q,mask)
        context_vector = tf.transpose(context_vector,perm = [0,2,1,3])
        context_vector = tf.reshape(context_vector,shape = (batchsize,-1,self.d))
        # context vector/output dims = (batchsize,seq_lenQ,d)
        
        outputs = self.layer_output(context_vector)
        
        return outputs

In [14]:
class EncoderLayer(layers.Layer):
    
    def __init__(self,d,n_heads,dff):
        super(EncoderLayer,self).__init__()
        
        self.attention = AttentionLayer(d,n_heads)
        
        self.feedforward1 = layers.Dense(dff,activation = 'relu')
        self.feedforward2 = layers.Dense(d)

        self.norm1 = layers.LayerNormalization(epsilon = 1e-6)
        self.norm2 = layers.LayerNormalization(epsilon = 1e-6)
        
        self.dropout1 = layers.Dropout(rate = 0.3)
        self.dropout2 = layers.Dropout(rate = 0.3)
        
    def call(self,inputs,mask):
        
        attention_vector = self.attention(inputs,inputs,inputs,mask)
        attention_vector = self.dropout1(attention_vector,training=True)
        output1 = self.norm1(attention_vector+inputs)
        
        output2 = self.feedforward1(output1)
        output2 = self.feedforward2(output2)
        output2 = self.dropout2(output2,training = True)
        output2 = self.norm2(output2 + output1)
        
        return output2

In [15]:
def positional_encoding(input_tensor):
    
    # input dim = (batchsize,seq_len,d), for each value in d iterate i
    i = np.arange(input_tensor.shape[-1],dtype = 'float32')
    pos = np.arange(input_tensor.shape[1],dtype = 'float32')
    
    # desired encoding dim = (1,seq_len,d)
    angle_rates = 1/(1e4**(2*(i//2)/np.float32(len(i))))
    angles = np.einsum('i,j->ij',pos,angle_rates)
    evens = np.sin(angles[:,0::2])
    odds = np.cos(angles[:,1::2])
    pos_encoding = np.concatenate([evens,odds],axis = 1)
    return input_tensor+pos_encoding

In [16]:
class Encoder(layers.Layer):
    
    def __init__(self,d,n_heads,n_layers,dff):
        super(Encoder,self).__init__()
        
        self.encoder_layers = [EncoderLayer(d,n_heads,dff) for i in range(n_layers)]
        self.embed = layers.Embedding(VocabSize,d)
        self.n_layers=n_layers
        
    def call(self,inputs,mask):
        
        embedded_inputs = self.embed(inputs)
        pos_encoded_inputs = positional_encoding(embedded_inputs)
        enc_output = pos_encoded_inputs
        
        for i in range(self.n_layers):
            enc_output = self.encoder_layers[i](enc_output,mask)
            
        assert enc_output.shape == (batchsize,max_seq_len,d)
        return enc_output

In [17]:
class DecoderLayer(layers.Layer):
    
    def __init__(self,d,n_heads,dff):
        super(DecoderLayer,self).__init__()
        
        self.feedforward1 = layers.Dense(dff,activation = 'relu')
        self.feedforward2 = layers.Dense(d)
        self.norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.norm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(0.3)
        self.dropout2 = layers.Dropout(0.3)
        self.dropout3 = layers.Dropout(0.3)
        self.self_attention = AttentionLayer(d,n_heads)
        self.attention = AttentionLayer(d,n_heads)
        
    def call(self,enc_output,dec_input,forward_mask,padding_mask):
        
        output1 = self.self_attention(dec_input,dec_input,dec_input,forward_mask)
        output1 = self.dropout1(output1,training = True)
        output1 = self.norm1(dec_input+output1)
        
        output2 = self.attention(enc_output,enc_output,output1,padding_mask)
        output2 = self.dropout2(output2,training = True)
        output2 = self.norm2(output1+output2)
        
        output3 = self.feedforward1(output2)
        output3 = self.feedforward2(output3)
        output3 = self.dropout3(output3,training = True)
        output3 = self.norm3(output2+output3)
        
        return output3

In [18]:
class Decoder(layers.Layer):
    
    def __init__(self,d,n_heads,n_layers,dff):
        super(Decoder,self).__init__()
        
        self.embbed = layers.Embedding(VocabSize,d)
        self.decoder_layers = [DecoderLayer(d,n_heads,dff) for i in range(n_layers)]
        self.n_layers = n_layers
        
    def call(self,enc_output,dec_input,forward_mask,padding_mask):
        
        dec_input = self.embbed(dec_input)
        dec_input = positional_encoding(dec_input)
        dec_output = dec_input
        
        for i in range(self.n_layers):
            dec_output = self.decoder_layers[i](enc_output,dec_output,forward_mask,padding_mask)
        
        assert dec_output.shape == (batchsize,max_seq_len-1,d)
        return dec_output

In [19]:
class Transformer(Model):
    
    def __init__(self,d,n_heads,n_layers,dff):
        super(Transformer,self).__init__()
        
        self.encoder = Encoder(d,n_heads,n_layers,dff)
        self.decoder = Decoder(d,n_heads,n_layers,dff)
        self.final_layer = layers.Dense(VocabSize)
        
    def build_mask(self,input_tensor,mask_type):
        
        # input tensor dim = (batchsize,seq_len)
        # desired output dim = (batchsize,1,1,seq_len) since dim QK is (batchsize,n_heads,seq_lenQ,seq_lenK)
        # the masking needs to be with respect to the key
        padding_mask = tf.cast(tf.math.equal(input_tensor,0),'float32')
        padding_mask = padding_mask[:,tf.newaxis,tf.newaxis,:]
        if mask_type == 'padding':
            return padding_mask
            
        # the masking needs to take only the lower triangular 
        # since the sequence in Q_n should only have access to K_n-1,K_n-2...K_1
        # look_forward_mask should be dim = (1,1,seq_lenQ,seq_lenK)
        elif mask_type == 'forward':
            seq_len = tf.cast(input_tensor.shape[-1],'float32')
            forward_mask = tf.cast(1-tf.linalg.band_part(tf.ones((seq_len,seq_len)),-1,0),'float32')
            forward_mask = tf.maximum(forward_mask,padding_mask)
            return forward_mask
            
    def call(self,enc_inputs,dec_inputs):
        
        padding_mask = self.build_mask(enc_inputs,'padding')
        forward_mask = self.build_mask(dec_inputs,'forward')
        
        enc_outputs = self.encoder(enc_inputs,padding_mask)
        dec_outputs = self.decoder(enc_outputs,dec_inputs,forward_mask,padding_mask)
        
        final_output = self.final_layer(dec_outputs)
        
        return final_output
        

In [20]:
n_layers = 6
d = 512
dff = 1024
n_heads = 8

In [21]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    
    def __init__(self,d,warmup = 4000):
        super(CustomSchedule,self).__init__()
        
        self.d = tf.cast(d,'float32')
        self.warmup = tf.cast(warmup,'float32')
        
    def __call__(self,step):
        lr = tf.math.minimum(step**-0.5,step*self.warmup**-1.5)*self.d**-0.5
        return lr

In [22]:
learning_rate = CustomSchedule(d)

In [23]:
optimizer = tf.keras.optimizers.Adam(learning_rate,beta_1=0.9,beta_2=0.98,epsilon=1e-9)
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

In [24]:
loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction='none')

def compute_loss(dec_targets,dec_out):
    mask = tf.math.logical_not(tf.math.equal(dec_targets,0))
    loss = loss_function(dec_targets,dec_out)
    
    mask = tf.cast(mask,'float32')
    loss*=mask
    
    return tf.reduce_mean(loss)
    

In [25]:
transformer = Transformer(d, n_heads,n_layers, dff)

In [26]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

In [27]:
@tf.function
def train_step(encoder_inputs,decoder_inputs):
    with tf.GradientTape() as tape:
        predictions = transformer(encoder_inputs,decoder_inputs[:,:-1])
        
        loss = compute_loss(decoder_inputs[:,1:],predictions)
        
    gradient = tape.gradient(loss,transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradient,transformer.trainable_variables))
    
    train_loss(loss)
    train_accuracy(decoder_inputs[:,1:], predictions)

In [28]:
import time

In [30]:
for epoch in range(4):
    train_loss.reset_states()
    train_accuracy.reset_states()
    for (batch,(encoder_inputs,decoder_inputs)) in enumerate(dataset):
        start = time.time()
        train_step(encoder_inputs,decoder_inputs)
        
        if (batch+1) % 100 == 0:
            print ('Epoch {} Batch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, batch, train_loss.result(), train_accuracy.result()))
            print ('Time taken for batches: {} secs\n'.format(time.time() - start))
    print ('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(
              epoch + 1, train_loss.result(), train_accuracy.result()))
    

Epoch 1 Batch 99 Loss 1.9894 Accuracy 0.0305
Time taken for batches: 2.3985860347747803 secs

Epoch 1 Batch 199 Loss 1.9758 Accuracy 0.0305
Time taken for batches: 2.4045698642730713 secs

Epoch 1 Batch 299 Loss 1.9683 Accuracy 0.0305
Time taken for batches: 2.4364845752716064 secs

Epoch 1 Batch 399 Loss 1.9627 Accuracy 0.0305
Time taken for batches: 2.374650001525879 secs

Epoch 1 Batch 499 Loss 1.9633 Accuracy 0.0305
Time taken for batches: 2.38661789894104 secs

Epoch 1 Batch 599 Loss 1.9597 Accuracy 0.0306
Time taken for batches: 2.407562732696533 secs

Epoch 1 Batch 699 Loss 1.9601 Accuracy 0.0306
Time taken for batches: 2.415541172027588 secs

Epoch 1 Batch 799 Loss 1.9655 Accuracy 0.0306
Time taken for batches: 2.393599271774292 secs

Epoch 1 Batch 899 Loss 1.9669 Accuracy 0.0306
Time taken for batches: 2.4255142211914062 secs

Epoch 1 Batch 999 Loss 1.9635 Accuracy 0.0306
Time taken for batches: 2.42750883102417 secs

Epoch 1 Batch 1099 Loss 1.9648 Accuracy 0.0306
Time taken f

KeyboardInterrupt: 