In [64]:
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers import Dense, Input, Embedding
from keras.preprocessing.sequence import pad_sequences
from collections import Counter
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from keras.activations import softmax
from keras.layers.core import Dense, Activation, RepeatVector, Permute
from keras.layers import Input, Embedding, Multiply, Concatenate, Lambda
from keras.layers.wrappers import TimeDistributed
import tensorflow as tf
import pickle
import os
import unicodedata
import re
from sklearn.model_selection import train_test_split

In [65]:
class Subject:
    
    kid = 0    
    def __init__(self, *args):
        self.id = Subject.kid
        Subject.kid+=1
        self.subject = None
        self.targets = None
        if len(args)==1:
            self.subject = args[0]
            self.targets = []        
        elif len(args)==2:
            self.subject = args[0]
            if isinstance(args[1], str):
                self.targets = [args[1]]
            else:
                self.targets = args[1]
    
    def getID(self):
        '''Returns subject id'''
        return self.id
    
    def getData(self):
        '''Returns subject id, original and target subject lines'''
        return self.id, self.subject, self.targets
    
    def getOrigSubject(self):
        '''Returns subject line'''
        return self.subject
    
    def setOrigSubject(self, s):
        '''Takes a new subject as input and updates the local variable'''
        self.subject = s
    
    def getTargetSubjects(self):
        '''Returns target subject lines'''
        return self.targets
    
    def addTargetSubjects(self, s):
        '''Args: List of target subjects. Adds to target subject lines.'''
        self.targets.extend(s)

class SubjectLines:
    def __init__(self, *args):
        self.subxID = dict()
        self.IDxsub = dict()
        if len(args)==1:
            self.addSubjects(args[0])   
        elif len(args)==2:
            self.addSubTargets(args[0], args[1]) 
    
    def getStats(self):
        '''Returns number of subject lines'''
        return len(list(self.IDxsub))
    
    def getSubject(self, *args):
        '''Returns object instance of Subject class with the given subject id.'''
        if isinstance(args[0], int):
            return self.IDxsub[args[0]]
        else:
            return self.subxID[args[0]]
    
    def getID(self, sub):
        '''Returns id of the given subject.'''
        if sub in self.subxID.keys():
            return self.subxID[sub]
        else:
            print("Not Found")
    
    def addSubjects(self, subjects):
        '''Adds the input subject lines to the class instance variable'''
        for sub in subjects:
            s = Subject(sub)
            self.IDxsub[s.getID()] = s 
            self.subxID[sub] = s.getID()
    
    def addSubTargets(self, subjects, targets):
        '''Adds subjects and their targets'''
        for sub, tar in zip(subjects, targets):
            s = Subject(sub, tar)
            self.IDxsub[s.getID()] = s 
            self.subxID[sub] = s.getID()
    
    def addTarget(self, *args):
        '''
        Args (Multiple Options):
        1. (id, target)
        2. (id, targets)
        3. (sub, target)
        4. (sub, targets)
        '''
        s=None
        if isinstance(args[0], int):
            s = self.IDxsub[args[0]]
        else:
            s = self.subxID[args[0]]
        if isinstance(args[1], str):
            s.addTargetSubjects([args[1]])
        else:            
            s.addTargetSubjects(args[1])
    
    def getAllSubjects(self):
        '''Returns a dictionary of all subjects, with key as subject id and value as the corresponding subject line'''
        return self.IDxsub

In [66]:
class SubjectsDataset:
    def __init__(self):
        self.tokenizer = None

    def unicode_to_ascii(self, s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    ## Step 1 and Step 2 
    def preprocess_sentence(self, w):
        w = self.unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        w = w.strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        w = '<start> ' + w + ' <end>'
        return w

    def create_dataset(self, path, num_examples):
        # path : path to spa-eng.txt file
        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)
        data = pickle.load(open(path, "rb" ))
        subs = data.getAllSubjects()
        word_pairs = [[self.preprocess_sentence(sub.getOrigSubject()), self.preprocess_sentence(sub.getTargetSubjects()[0])]  for i, sub in subs.items() if i<num_examples]
        return zip(*word_pairs)

    # Step 3 and Step 4
    def tokenize(self, encoder_text, decoder_text):
        
        print(len(encoder_text), "Encoder Subject line example: {}".format(encoder_text[0]))
        print(len(decoder_text), "Decoder Subject line example: {}".format(decoder_text[0]))
        tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
        tokenizer.fit_on_texts(encoder_text)
        tokenizer.fit_on_texts(decoder_text)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        encoder_sequences = tokenizer.texts_to_sequences(encoder_text)
        decoder_sequences = tokenizer.texts_to_sequences(decoder_text)

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        padded_encoder_seq = tf.keras.preprocessing.sequence.pad_sequences(encoder_sequences, padding='post')
        padded_decoder_seq = tf.keras.preprocessing.sequence.pad_sequences(decoder_sequences, padding='post')
        
        return padded_encoder_seq, padded_decoder_seq, tokenizer

    def load_dataset(self, path, num_examples=None):
        # creating cleaned input, output pairs
        inp, tar = self.create_dataset(path, num_examples)
        input_encoder_tensor, input_decoder_tensor, tokenizer = self.tokenize(inp, tar)

        return input_encoder_tensor, input_decoder_tensor, tokenizer

    def call(self, file_path, num_examples, BUFFER_SIZE, BATCH_SIZE):
        input_tensor, target_tensor, self.tokenizer = self.load_dataset(file_path, num_examples)

        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

        return train_dataset, val_dataset, self.tokenizer

In [67]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64
num_examples = 1000
path = 'data/Client15.p'

dataset_creator = SubjectsDataset()
train_dataset, val_dataset, sub_tokenizer = dataset_creator.call(path, num_examples, BUFFER_SIZE, BATCH_SIZE)

1000 Encoder Subject line example: <start> open asap ! you re receiving your daily dose of deals . you will this we just know it <end>
1000 Decoder Subject line example: <start> we are working on it right now ! . ! ! ! you ll be a customer at one of <end>


In [68]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 25]), TensorShape([64, 107]))

In [69]:
vocab_size = len(sub_tokenizer.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 256
units = 1024
steps_per_epoch = num_examples//BATCH_SIZE

In [70]:
vocab_size, max_length_input, max_length_output

(4311, 25, 107)

In [72]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##-------- LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')



    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state = hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]

In [73]:
encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 25, 1024)
Encoder h vecotr shape: (batch size, units) (64, 1024)
Encoder c vector shape: (batch size, units) (64, 1024)


In [74]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        #Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
        
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_input], self.attention_type)
        
        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    def build_rnn_cell(self, batch_sz):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
        return rnn_cell

    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
        # ------------- #
        # typ: Which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs 
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
        return outputs

In [75]:
import tensorflow_addons as tfa

In [76]:
decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (64, 106, 4311)


In [77]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss

In [78]:
checkpoint_dir = './checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [79]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


        dec_input = targ[ : , :-1 ] # Ignore <end> token
        real = targ[ : , 1: ]         # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        print(logits)
        loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [80]:
import time
EPOCHS = 4

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 25 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
#     if (epoch + 1) % 2 == 0:
#         checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Tensor("decoder_2/basic_decoder_4/decoder/transpose_1:0", shape=(64, None, 4311), dtype=float32)
Tensor("decoder_2/basic_decoder_4/decoder/transpose_1:0", shape=(64, None, 4311), dtype=float32)
Epoch 1 Batch 0 Loss 1.3446
Epoch 1 Loss 1.0138
Time taken for 1 epoch 32.24345684051514 sec

Epoch 2 Batch 0 Loss 1.1667
Epoch 2 Loss 0.8869
Time taken for 1 epoch 29.452654123306274 sec

Epoch 3 Batch 0 Loss 1.1025
Epoch 3 Loss 0.8607
Time taken for 1 epoch 29.591081142425537 sec

Epoch 4 Batch 0 Loss 1.0714
Epoch 4 Loss 0.8544
Time taken for 1 epoch 29.45012092590332 sec



In [81]:
def evaluate_subject(sentence):
    sentence = dataset_creator.preprocess_sentence(sentence)

    inputs = [sub_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], sub_tokenizer.word_index['<start>'])
    end_token = sub_tokenizer.word_index['<end>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(enc_out)

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()

def predict_sub(sentence):
    result = evaluate_subject(sentence)
    print(result)
    result = sub_tokenizer.sequences_to_texts(result)
    print('Input: %s' % (sentence))
    print('Predicted Subject: {}'.format(result))

In [82]:
predict_sub(u'join us for a conversation is a recovery in sight ? lessons from around the world.')

[[6 4 4 4 4 4 4 4 4 4 3]]
Input: join us for a conversation is a recovery in sight ? lessons from around the world.
Predicted Subject: ['! . . . . . . . . . <end>']
