# Subject Line Generation 

## Pacakges

In [2]:
import numpy as np
import pandas as pd
import transformers
# import tensorflow as tf
import os

In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from tqdm import tqdm

In [5]:
import tensorflow

## Load Data 

In [6]:
ROOT = '/home/mluser/users'
data_subjects = os.path.join(ROOT, 'data', 'subject_line')
print(data_subjects)

/home/mluser/users/data/subject_line


In [7]:
result = []
for (r, d, f) in os.walk(data_subjects):
    for i in f:
        df = pd.read_parquet(os.path.join(r, i))
        print(df.shape)
#     for i in f:
#         result.append(pd.read_parquet(os.path.join(r, i)))

(992444, 14)
(971776, 14)
(974093, 14)
(902278, 14)
(909824, 14)
(869376, 14)
(975305, 14)
(977328, 14)
(948383, 14)
(869941, 14)
(983628, 14)
(982978, 14)
(877072, 14)
(877851, 14)
(982772, 14)
(983040, 14)
(981547, 14)
(871029, 14)
(977657, 14)
(987136, 14)
(967458, 14)
(1005099, 14)
(981974, 14)
(943257, 14)
(982790, 14)
(1039360, 14)
(848101, 14)
(863133, 14)
(892504, 14)
(896307, 14)


In [7]:
data = list(set(df['subject'].dropna()))

In [8]:
len(data)

135047

In [9]:
final = []
for d in data:
    k = [l for l in d if l=="{" or l=="}" or l=="[" or l=="]" or l=="(" or l==")"]
    if len(k)<4:
        if "Lewisville" not in d:
            final.append(d.strip().strip(string.punctuation))
data = final

In [10]:
MAX_LENGTH = 20

def filterSubject(p):
    return len(p.split(' ')) < MAX_LENGTH

def filterSubjects(subs):
    return [sub for sub in subs if filterSubject(sub)]

In [11]:
print(len(data))
data = filterSubjects(data)

93660


In [12]:
def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [13]:
data = [normalizeString(d) for d in data]

In [14]:
len(data)

87843

# Subject Lines Dataset Class

In [15]:
class Subject:
    
    kid = 0    
    def __init__(self, *args):
        self.id = Subject.kid
        Subject.kid+=1
        self.subject = None
        self.targets = None
        if len(args)==1:
            self.subject = args[0]
            self.targets = []        
        elif len(args)==2:
            self.subject = args[0]
            if isinstance(args[1], str):
                self.targets = [args[1]]
            else:
                self.targets = args[1]
    
    def getID(self):
        '''Returns subject id'''
        return self.id
    
    def getData(self):
        '''Returns subject id, original and target subject lines'''
        return self.id, self.subject, self.targets
    
    def getOrigSubject(self):
        '''Returns subject line'''
        return self.subject
    
    def setOrigSubject(self, s):
        '''Takes a new subject as input and updates the local variable'''
        self.subject = s
    
    def getTargetSubjects(self):
        '''Returns target subject lines'''
        return self.targets
    
    def addTargetSubjects(self, s):
        '''Args: List of target subjects. Adds to target subject lines.'''
        self.targets.extend(s)

In [16]:
class SubjectLines:
    def __init__(self, *args):
        self.subxID = dict()
        self.IDxsub = dict()
        if len(args)==1:
            self.addSubjects(args[0])   
        elif len(args)==2:
            self.addSubTargets(args[0], args[1]) 
    
    def getStats(self):
        '''Returns number of subject lines'''
        return len(list(self.IDxsub))
    
    def getSubject(self, *args):
        '''Returns object instance of Subject class with the given subject id.'''
        if isinstance(args[0], int):
            return self.IDxsub[args[0]]
        else:
            return self.subxID[args[0]]
    
    def getID(self, sub):
        '''Returns id of the given subject.'''
        if sub in self.subxID.keys():
            return self.subxID[sub]
        else:
            print("Not Found")
    
    def addSubjects(self, subjects):
        '''Adds the input subject lines to the class instance variable'''
        for sub in subjects:
            s = Subject(sub)
            self.IDxsub[s.getID()] = s 
            self.subxID[sub] = s.getID()
    
    def addSubTargets(self, subjects, targets):
        '''Adds subjects and their targets'''
        for sub, tar in zip(subjects, targets):
            s = Subject(sub, tar)
            self.IDxsub[s.getID()] = s 
            self.subxID[sub] = s.getID()
    
    def addTarget(self, *args):
        '''
        Args (Multiple Options):
        1. (id, target)
        2. (id, targets)
        3. (sub, target)
        4. (sub, targets)
        '''
        s=None
        if isinstance(args[0], int):
            s = self.IDxsub[args[0]]
        else:
            s = self.subxID[args[0]]
        if isinstance(args[1], str):
            s.addTargetSubjects([args[1]])
        else:            
            s.addTargetSubjects(args[1])
    
    def getAllSubjects(self):
        '''Returns a dictionary of all subjects, with key as subject id and value as the corresponding subject line'''
        return self.IDxsub

In [336]:
subjects = SubjectLines(data, targets)

In [337]:
subjects.getStats()

87843

In [338]:
subjects.getID(data[0])

0

In [343]:
import pickle
pickle.dump(subjects, open( "data/Client15.p", "wb" ))

In [22]:
import pickle
test = pickle.load(open( "data/Client15.p", "rb" ))

In [23]:
test.getStats()

87843

## Training Data

#### Load Glove Vectors 

In [17]:
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np
from numpy.linalg import norm

In [18]:
import os
import shutil
import smart_open
from sys import platform

import gensim


def prepend_line(infile, outfile, line):
    """ 
    Function use to prepend lines using bash utilities in Linux. 
    (source: http://stackoverflow.com/a/10850588/610569)
    """
    with open(infile, 'r') as old:
        with open(outfile, 'w') as new:
            new.write(str(line) + "\n")
            shutil.copyfileobj(old, new)

def prepend_slow(infile, outfile, line):
    """
    Slower way to prepend the line by re-creating the inputfile.
    """
    with open(infile, 'r') as fin:
        with open(outfile, 'w') as fout:
            fout.write(line + "\n")
            for line in fin:
                fout.write(line)


def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format."""
    with smart_open.smart_open(glove_file_name, 'r') as f:
        num_lines = sum(1 for line in f)
    with smart_open.smart_open(glove_file_name, 'r') as f:
        num_dims = len(f.readline().split()) - 1
    return num_lines, num_dims

# Input: GloVe Model File
# More models can be downloaded from http://nlp.stanford.edu/projects/glove/
glove_file='./glove.6B/glove.6B.300d.txt'

num_lines, dims = get_lines(glove_file)

# Output: Gensim Model text format.
gensim_file='./glove.6B/glove.6B.300d.new.txt'
gensim_first_line = "{} {}".format(num_lines, dims)

# Prepends the line.
if platform == "linux" or platform == "linux2":
    prepend_line(glove_file, gensim_file, gensim_first_line)
else:
    prepend_slow(glove_file, gensim_file, gensim_first_line)

In [19]:
from gensim.models.keyedvectors import KeyedVectors
glove_model = KeyedVectors.load_word2vec_format(gensim_file,binary=False)

### Text Preprocessing

In [205]:
def w2v_tokenize_text(text):
    tokens = []
    for sent in nltk.sent_tokenize(text, language='english'):
        for word in nltk.word_tokenize(sent, language='english'):
            if len(word) < 2:
                continue
            tokens.append(word)
    return tokens

In [206]:
import nltk
t = []
for i in data:
    t.append(w2v_tokenize_text(i))

### Transformer  Based Target Generation

In [208]:
import transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained(os.path.join("./Outputs", "Model_v0.10.1"), local_files_only=True)
model = T5ForConditionalGeneration.from_pretrained(os.path.join("./Outputs", "Model_v0.10.1"), local_files_only=True)

In [209]:
def generate_subject(subject, model, tokenizer):
    '''Generate target subject lines using the input transformer model and tokenizer.'''
    inputs = tokenizer.encode(subject, return_tensors='pt')
    outputs = model.generate(inputs, max_length=30, do_sample=True, top_k=100)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print('Input Subject: ', subject)
    print('Output Subject: ', text)
    return text

In [210]:
def generate_subject_random(subject, model, tokenizer, temp, verbose=True):
    '''Generate target subject lines using the input transformer model and tokenizer. 
    Randomization is added by the temp parameter.'''
    inputs = tokenizer.encode(subject, return_tensors='pt')
    outputs = model.generate(inputs, max_length=30, do_sample=True, temperature=temp, top_k=100)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    if verbose:
        print('Input Subject: ', subject)
        print('Output Subject: ', text)
    return text

In [281]:
generate_subject_random(data[np.random.randint(0, len(data))], model, tokenizer, 1)

Input Subject:  sue grafton s new kinsey millhone mystery y is for yesterday pre order now
Output Subject:  s silver watch collection £6 m morgan millhone mystery y is for yesterday pre order now


's silver watch collection £6 m morgan millhone mystery y is for yesterday pre order now'

In [285]:
generate_subject_random(data[10], model, tokenizer, 1.5)

Input Subject:  join us for a conversation is a recovery in sight ? lessons from around the world
Output Subject:  : Dr Bick Clement, Bock Côte'07! • If the conference you have come across in person...


": Dr Bick Clement, Bock Côte'07! • If the conference you have come across in person..."

In [212]:
# inputs = tokenizer.encode(data, return_tensors='pt')
# outputs = model.generate(inputs, max_length=30, do_sample=True, temperature=temp, top_k=100)
# text = tokenizer.decode(outputs , skip_special_tokens=True)

In [214]:
targets = [generate_subject_random(d, model, tokenizer, 1, verbose=False) for d in data]

In [215]:
targets1 = [generate_subject_random(d, model, tokenizer, 1, verbose=False) for d in data]

KeyboardInterrupt: 

In [None]:
targets2 = [generate_subject_random(d, model, tokenizer, 1, verbose=False) for d in data]

In [None]:
targets3 = [generate_subject_random(d, model, tokenizer, 1, verbose=False) for d in data]

## Training and Testing data

In [348]:
from tqdm import tqdm

def prepareData(data, targetSub, word2vec, encoder=True, top_k=5, subLen=1000):
    '''Generate Subject Line embeddings using GLove Embeddings.'''
    encoder_max_seq_length = 0
    decoder_max_seq_length = 0
    inp_embed = []
    out_embed = []
    clean_subjects = data    
    print("Read %s input subject lines...." % len(data))  
    for i in tqdm(clean_subjects):
        temp = []
        for word in i.split(' '):
            if word not in word2vec.keys():
                word2vec[word] = np.random.randn(300)
            temp.append(word2vec[word])
        inp_embed.append(temp)
        encoder_max_seq_length = max(len(temp), encoder_max_seq_length)
    print("Generated %s subject line embeddings (Glove)" % len(inp_embed)) 
    print("Read %s target subject lines...." % len(targetSub)) 
    for i in tqdm(targetSub):
        temp = []
        for word in i.split(' '):
            if word not in word2vec.keys():
                word2vec[word] = np.random.randn(300)
            temp.append(word2vec[word])
        out_embed.append(temp)
        decoder_max_seq_length = max(len(i)+2, decoder_max_seq_length)
    print("Generated %s subject line embeddings (Glove)" % len(inp_embed)) 
    print()
    
    if encoder:
        return clean_subjects, inp_embed, out_embed, word2vec, encoder_max_seq_length, decoder_max_seq_length
    
    if not encoder:
        embeddings = list(word2vec.values())    
        print("Generating Similar subject lines...")
        output_subjects = []
        for i, clean_subject in enumerate(tqdm(clean_subjects)):
            if i<subLen:
                output = get_top_subj_eu(clean_subjects, i, top_k)  
                output_subjects.append(output)

        print("Saved input-output data")

        return clean_subjects, inp_embed, output_subjects, word2vec

In [349]:
input_subjects, inp_embed, out_embed, word2vec, encoder_max_seq_length, decoder_max_seq_length = prepareData(data, targets, word2vector, encoder=True, top_k=5, subLen=1000)
# input_subjects1, inp_embed1, out_embed1, word2vec1, encoder_max_seq_length1, decoder_max_seq_length1 = prepareData(data, targets1, word2vector, encoder=True, top_k=5, subLen=1000)

Read 87843 input subject lines....


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 87843/87843 [00:00<00:00, 288939.66it/s]


Generated 87843 subject line embeddings (Glove)
Read 87843 target subject lines....


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 87843/87843 [00:01<00:00, 69560.25it/s]

Generated 87843 subject line embeddings (Glove)






In [350]:
print('Model 1 parameters: Encoder max seq length=', encoder_max_seq_length, ', Decoder max seq length=', decoder_max_seq_length)
# print('Model 2 parameters: Encoder max seq length=', encoder_max_seq_length1, ', Decoder max seq length=', decoder_max_seq_length1)

Model 1 parameters: Encoder max seq length= 55 , Decoder max seq length= 362


In [351]:
from collections import Counter

def getTarCount(input_subjects, targets):
    '''Get character counts - to generate custom input embeddings'''
    tar_count = Counter()
    for subj, t in zip(input_subjects, targets):
        input_words = [w for w in nltk.word_tokenize(subj.lower())]
        target_text = '\t' + t + '\n'
        for char in target_text:
            tar_count[char] += 1
    return tar_count

tar_count = getTarCount(input_subjects, targets)
# tar_count1 = getTarCount(input_subjects, targets1)

In [352]:
def decoderVariables(tar_count):
    '''Generate number of decoder tokens, word-to-id and id-to-word dictionaries for decoder based on target subject lines.'''
    target_word2idx = dict()

    for idx, word in enumerate(tar_count.most_common(MAX_VOCAB_SIZE)):
        target_word2idx[word[0]] = idx

    target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

    num_decoder_tokens = len(target_idx2word)
    return num_decoder_tokens, target_idx2word, target_word2idx

num_decoder_tokens, target_idx2word, target_word2idx = decoderVariables(tar_count)
# num_decoder_tokens1, target_idx2word1, target_word2idx1 = decoderVariables(tar_count1)

In [353]:
def getDecoderInpOut(targets, decoder_max_seq_length, num_decoder_tokens, subLen=1000):
    '''
    Generating input and output target vectors for decoder from target subject lines.
    '''
    decoder_target_data = np.zeros(shape=(1000, decoder_max_seq_length, num_decoder_tokens))
    decoder_input_data = np.zeros(shape=(1000, decoder_max_seq_length, num_decoder_tokens))
    for lineIdx, t in enumerate(targets):
        if lineIdx<subLen:
            target = '\t' + t + '\n'
            for idx, char in enumerate(target):
                if char in target_word2idx:
                    w2idx = target_word2idx[char]
                    decoder_input_data[lineIdx, idx, w2idx] = 1
                    if idx > 0:
                        decoder_target_data[lineIdx, idx-1, w2idx] = 1
    return decoder_target_data, decoder_input_data

decoder_target_data, decoder_input_data = getDecoderInpOut(targets, decoder_max_seq_length, num_decoder_tokens)
# decoder_target_data1, decoder_input_data1 = getDecoderInpOut(targets1, decoder_max_seq_length1, num_decoder_tokens1)

In [354]:
context = dict()
context['num_decoder_tokens'] = num_decoder_tokens
context['encoder_max_seq_length'] = encoder_max_seq_length
context['decoder_max_seq_length'] = decoder_max_seq_length

# Trial 2

In [None]:
for d, t in :
    input_words = [w for w in nltk.word_tokenize(.lower())]
    target_text = '\t' + target_text + '\n'
    for char in target_text:
        tar_count[char] += 1
target_word2idx = dict()

for idx, word in enumerate(tar_count.most_common(MAX_VOCAB_SIZE)):
    #print(word)
    target_word2idx[word[0]] = idx

target_idx2word = dict([(idx, word) for word, idx in target_word2idx.items()])

num_decoder_tokens = len(target_idx2word)

In [385]:
unknown_emb = np.random.randn(EMBEDDING_SIZE)
encoder_max_seq_length = 0
decoder_max_seq_length = 0

input_texts_word2em = []

In [390]:
for s, t in zip(data, targets):
    target_text = '\t' + t + '\n'
    input_words = [w for w in nltk.word_tokenize(s.lower())]
    encoder_input_wids = []
    for w in input_words:
        em = unknown_emb
        try:
            em = glove_model.get_vector(w)
        except:
            encoder_input_wids.append(em)
    input_texts_word2em.append(encoder_input_wids)
    encoder_max_seq_length = max(len(encoder_input_wids), encoder_max_seq_length)
    decoder_max_seq_length = max(len(target_text), decoder_max_seq_length)

In [391]:
encoder_input_data = pad_sequences(input_texts_word2em, encoder_max_seq_length)

In [None]:
decoder_target_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens))
decoder_input_data = np.zeros(shape=(NUM_SAMPLES, decoder_max_seq_length, num_decoder_tokens))
for lineIdx, t in enumerate(targets):
    target = '\t' + t + '\n'
    for idx, char in enumerate(t):
        if char in target_word2idx:
            w2idx = target_word2idx[char]
            decoder_input_data[lineIdx, idx, w2idx] = 1
            if idx > 0:
                decoder_target_data[lineIdx, idx-1, w2idx] = 1

In [355]:
import tensorflow as tf

In [356]:
from keras.models import Model
from keras.layers.recurrent import LSTM
from keras.layers import Dense, Input, Embedding
from keras.preprocessing.sequence import pad_sequences
from collections import Counter
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from keras.activations import softmax
from keras.layers.core import Dense, Activation, RepeatVector, Permute
from keras.layers import Input, Embedding, Multiply, Concatenate, Lambda
from keras.layers.wrappers import TimeDistributed

In [357]:
encoder_input_data = pad_sequences(inp_embed, encoder_max_seq_length)

In [358]:
encoder_inputs = Input(shape=(None, EMBEDDING_SIZE), name='encoder_inputs')
encoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, name='encoder_lstm')
encoder_outputs, encoder_state_h, encoder_state_c = encoder_lstm(encoder_inputs)
encoder_states = [encoder_state_h, encoder_state_c]

decoder_inputs = Input(shape=(None, num_decoder_tokens), name='decoder_inputs')
decoder_lstm = LSTM(units=HIDDEN_UNITS, return_state=True, return_sequences=True, name='decoder_lstm')
decoder_outputs, decoder_state_h, decoder_state_c = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = Dense(units=num_decoder_tokens, activation='softmax', name='decoder_dense')
decoder_outputs = decoder_dense(decoder_outputs)

In [359]:
BATCH_SIZE = 64
NUM_EPOCHS = 100
HIDDEN_UNITS = 256
MAX_VOCAB_SIZE = 10000
EMBEDDING_SIZE = 300

In [360]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.fit([encoder_input_data[:1000], decoder_input_data], decoder_target_data, batch_size=BATCH_SIZE, epochs=10,
          verbose=1, validation_split=0.3)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fea74236a60>

### Encoder inference model

In [361]:
encoder_model_inf = Model(encoder_inputs, encoder_states)

### Decoder inference model

In [362]:
decoder_state_input_h = Input(shape=(HIDDEN_UNITS,))
decoder_state_input_c = Input(shape=(HIDDEN_UNITS,)) 
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder_lstm(decoder_inputs, initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_inputs] + decoder_input_states, outputs=[decoder_out] + decoder_states )

In [363]:
max_encoder_seq_length = context['encoder_max_seq_length']
max_decoder_seq_length = context['decoder_max_seq_length']
num_decoder_tokens = context['num_decoder_tokens']

In [364]:
unknown_emb = np.random.randn(EMBEDDING_SIZE)

In [365]:
def predict_sent(input_text):
        input_seq = []
        input_wids = []
        for word in nltk.word_tokenize(input_text.lower()):
            emb = unknown_emb
            if word in word2vec:
                emb = word2vec[word]
            input_wids.append(emb)
        input_seq.append(input_wids)
        input_seq = pad_sequences(input_seq, max_encoder_seq_length)
        states_value = encoder_model_inf.predict(input_seq)
        target_seq = np.zeros((1, 1,num_decoder_tokens))
        target_seq[0, 0, target_word2idx['\t']] = 1
        target_text = ''
        terminated = False
        while not terminated:
            output_tokens, h, c = decoder_model_inf.predict([target_seq] + states_value)

            sample_token_idx = np.argmax(output_tokens[0, -1, :])
            sample_word = target_idx2word[sample_token_idx]
            target_text += sample_word

            if sample_word == '\n' or len(target_text) >= max_decoder_seq_length:
                terminated = True

            target_seq = np.zeros((1, 1, num_decoder_tokens))
            target_seq[0, 0, sample_token_idx] = 1

            states_value = [h, c]
        return target_text.strip()

In [366]:
print('Input Subject:', input_subjects[100])
print('Output Subject:', targets[100])

Input Subject: critical vendor risk management join us in san diego ca
Output Subject: ca a ca - tumultuous ca do not get to know our partners in san diego called


In [370]:
np.random.randint(0,len(data))

91

In [380]:
ind = np.random.randint(0,len(data))
print('Input Subject:', input_subjects[ind])
print('Target Subject:', targets[ind])
print('Output Subject:', predict_sent(input_subjects[ind]))

Input Subject: you re invited baby shark s big show is here
Target Subject: s s got the shark s big show. You re invited baby shark s big show is here baby shark s
Output Subject: e


In [381]:
ind = np.random.randint(0,len(data))
print('Input Subject:', input_subjects[ind])
print('Target Subject:', targets[ind])
print('Output Subject:', predict_sent(input_subjects[ind]))

Input Subject: in the air there s a feeling . . .of a one day sale
Target Subject: somewhere in the middle of my mind I'm just writing and now im typing it up.
Output Subject: e


In [382]:
ind = np.random.randint(0,len(data))
print('Input Subject:', input_subjects[ind])
print('Target Subject:', targets[ind])
print('Output Subject:', predict_sent(input_subjects[ind]))

Input Subject: mountain adventures await approval
Target Subject: . mountain adventures await approval before approval..."Mountain adventures await approval before approval for approval!". A selection of mountain adventures for
Output Subject: e


In [383]:
ind = np.random.randint(0,len(data))
print('Input Subject:', input_subjects[ind])
print('Target Subject:', targets[ind])
print('Output Subject:', predict_sent(input_subjects[ind]))

Input Subject: summary report partner support rollover ira acceptance letter ops 
Target Subject: report partner supporting rollover partner support rollover partner confidence rate partner acceptance letter ops ra approval letter partner support rollover partner support
Output Subject: e


In [384]:
ind = np.random.randint(0,len(data))
print('Input Subject:', input_subjects[ind])
print('Target Subject:', targets[ind])
print('Output Subject:', predict_sent(input_subjects[ind]))

Input Subject:  be bold . open for savings
Target Subject: . be bold.. is one bold move. be bold. be bold. be bold. be bold
Output Subject: e


# Trial-4

In [199]:
class SubjectsDataset:
    def __init__(self):
        self.tokenizer = None

    def unicode_to_ascii(self, s):
        return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

    ## Step 1 and Step 2 
    def preprocess_sentence(self, w):
        w = self.unicode_to_ascii(w.lower().strip())

        # creating a space between a word and the punctuation following it
        # eg: "he is a boy." => "he is a boy ."
        # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
        w = re.sub(r"([?.!,¿])", r" \1 ", w)
        w = re.sub(r'[" "]+', " ", w)

        # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
        w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)

        w = w.strip()

        # adding a start and an end token to the sentence
        # so that the model know when to start and stop predicting.
        w = '<start> ' + w + ' <end>'
        return w

    def create_dataset(self, path, num_examples):
        # path : path to spa-eng.txt file
        # num_examples : Limit the total number of training example for faster training (set num_examples = len(lines) to use full data)
        data = pickle.load(open(path, "rb" ))
        subs = test.getAllSubjects()
        word_pairs = [[self.preprocess_sentence(sub.getOrigSubject()), self.preprocess_sentence(sub.getTargetSubjects()[0])]  for i, sub in subs.items()]
        return zip(*word_pairs)

    # Step 3 and Step 4
    def tokenize(self, encoder_text, decoder_text):
        
        # print(len(lang), "example sentence: {}".format(lang[0]))
        tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
        tokenizer.fit_on_texts(encoder_text)
        tokenizer.fit_on_texts(decoder_text)

        ## tf.keras.preprocessing.text.Tokenizer.texts_to_sequences converts string (w1, w2, w3, ......, wn) 
        ## to a list of correspoding integer ids of words (id_w1, id_w2, id_w3, ...., id_wn)
        encoder_sequences = tokenizer.texts_to_sequences(encoder_text)
        decoder_sequences = tokenizer.texts_to_sequences(decoder_text)

        ## tf.keras.preprocessing.sequence.pad_sequences takes argument a list of integer id sequences 
        ## and pads the sequences to match the longest sequences in the given input
        padded_encoder_seq = tf.keras.preprocessing.sequence.pad_sequences(encoder_sequences, padding='post')
        padded_decoder_seq = tf.keras.preprocessing.sequence.pad_sequences(decoder_sequences, padding='post')
        
        return padded_encoder_seq, padded_decoder_seq, tokenizer

    def load_dataset(self, path, num_examples=None):
        # creating cleaned input, output pairs
        inp, tar = self.create_dataset(path, num_examples)
        input_encoder_tensor, input_decoder_tensor, tokenizer = self.tokenize(inp, tar)

        return input_encoder_tensor, input_decoder_tensor, tokenizer

    def call(self, file_path, num_examples, BUFFER_SIZE, BATCH_SIZE):
        input_tensor, target_tensor, self.tokenizer = self.load_dataset(file_path, num_examples)

        input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2)

        train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
        train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

        val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
        val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder=True)

        return train_dataset, val_dataset, self.tokenizer

In [200]:
BUFFER_SIZE = 32000
BATCH_SIZE = 64
num_examples = 30000
path = 'data/Client15.p'

dataset_creator = SubjectsDataset()
train_dataset, val_dataset, sub_tokenizer = dataset_creator.call(path, num_examples, BUFFER_SIZE, BATCH_SIZE)

In [201]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([64, 56]), TensorShape([64, 348]))

In [202]:
vocab_size = len(sub_tokenizer.word_index)+1
max_length_input = example_input_batch.shape[1]
max_length_output = example_target_batch.shape[1]

embedding_dim = 256
units = 1024
steps_per_epoch = num_examples//BATCH_SIZE

In [204]:
vocab_size, max_length_input, max_length_output

(70420, 56, 348)

In [205]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        ##-------- LSTM layer in Encoder ------- ##
        self.lstm_layer = tf.keras.layers.LSTM(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')



    def call(self, x, hidden):
        x = self.embedding(x)
        output, h, c = self.lstm_layer(x, initial_state = hidden)
        return output, h, c

    def initialize_hidden_state(self):
        return [tf.zeros((self.batch_sz, self.enc_units)), tf.zeros((self.batch_sz, self.enc_units))]

In [206]:
encoder = Encoder(vocab_size, embedding_dim, units, BATCH_SIZE)


# sample input
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_h, sample_c = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder h vecotr shape: (batch size, units) {}'.format(sample_h.shape))
print ('Encoder c vector shape: (batch size, units) {}'.format(sample_c.shape))

Encoder output shape: (batch size, sequence length, units) (64, 56, 1024)
Encoder h vecotr shape: (batch size, units) (64, 1024)
Encoder c vector shape: (batch size, units) (64, 1024)


In [208]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz, attention_type='luong'):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.attention_type = attention_type

        # Embedding Layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)

        #Final Dense layer on which softmax will be applied
        self.fc = tf.keras.layers.Dense(vocab_size)

        # Define the fundamental cell for decoder recurrent structure
        self.decoder_rnn_cell = tf.keras.layers.LSTMCell(self.dec_units)
        
        # Sampler
        self.sampler = tfa.seq2seq.sampler.TrainingSampler()

        # Create attention mechanism with memory = None
        self.attention_mechanism = self.build_attention_mechanism(self.dec_units, 
                                                              None, self.batch_sz*[max_length_input], self.attention_type)
        
        # Wrap attention mechanism with the fundamental rnn cell of decoder
        self.rnn_cell = self.build_rnn_cell(batch_sz)

        # Define the decoder with respect to fundamental rnn cell
        self.decoder = tfa.seq2seq.BasicDecoder(self.rnn_cell, sampler=self.sampler, output_layer=self.fc)

    def build_rnn_cell(self, batch_sz):
        rnn_cell = tfa.seq2seq.AttentionWrapper(self.decoder_rnn_cell, 
                                  self.attention_mechanism, attention_layer_size=self.dec_units)
        return rnn_cell

    def build_attention_mechanism(self, dec_units, memory, memory_sequence_length, attention_type='luong'):
        # ------------- #
        # typ: Which sort of attention (Bahdanau, Luong)
        # dec_units: final dimension of attention outputs 
        # memory: encoder hidden states of shape (batch_size, max_length_input, enc_units)
        # memory_sequence_length: 1d array of shape (batch_size) with every element set to max_length_input (for masking purpose)

        if(attention_type=='bahdanau'):
            return tfa.seq2seq.BahdanauAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)
        else:
            return tfa.seq2seq.LuongAttention(units=dec_units, memory=memory, memory_sequence_length=memory_sequence_length)

    def build_initial_state(self, batch_sz, encoder_state, Dtype):
        decoder_initial_state = self.rnn_cell.get_initial_state(batch_size=batch_sz, dtype=Dtype)
        decoder_initial_state = decoder_initial_state.clone(cell_state=encoder_state)
        return decoder_initial_state

    def call(self, inputs, initial_state):
        x = self.embedding(inputs)
        outputs, _, _ = self.decoder(x, initial_state=initial_state, sequence_length=self.batch_sz*[max_length_output-1])
        return outputs

In [211]:
import tensorflow_addons as tfa

 The versions of TensorFlow you are currently using is 2.8.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [212]:
decoder = Decoder(vocab_size, embedding_dim, units, BATCH_SIZE, 'luong')
sample_x = tf.random.uniform((BATCH_SIZE, max_length_output))
decoder.attention_mechanism.setup_memory(sample_output)
initial_state = decoder.build_initial_state(BATCH_SIZE, [sample_h, sample_c], tf.float32)


sample_decoder_outputs = decoder(sample_x, initial_state)

print("Decoder Outputs Shape: ", sample_decoder_outputs.rnn_output.shape)

Decoder Outputs Shape:  (64, 347, 70420)


In [214]:
optimizer = tf.keras.optimizers.Adam()


def loss_function(real, pred):
    # real shape = (BATCH_SIZE, max_length_output)
    # pred shape = (BATCH_SIZE, max_length_output, tar_vocab_size )
    cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = cross_entropy(y_true=real, y_pred=pred)
    mask = tf.logical_not(tf.math.equal(real,0))   #output 0 for y=0 else output 1
    mask = tf.cast(mask, dtype=loss.dtype)  
    loss = mask* loss
    loss = tf.reduce_mean(loss)
    return loss

In [215]:
checkpoint_dir = './checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)

In [216]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_h, enc_c = encoder(inp, enc_hidden)


        dec_input = targ[ : , :-1 ] # Ignore <end> token
        real = targ[ : , 1: ]         # ignore <start> token

        # Set the AttentionMechanism object with encoder_outputs
        decoder.attention_mechanism.setup_memory(enc_output)

        # Create AttentionWrapperState as initial_state for decoder
        decoder_initial_state = decoder.build_initial_state(BATCH_SIZE, [enc_h, enc_c], tf.float32)
        pred = decoder(dec_input, decoder_initial_state)
        logits = pred.rnn_output
        loss = loss_function(real, logits)

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return loss

In [None]:
import time
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    # print(enc_hidden[0].shape, enc_hidden[1].shape)

    for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 25 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                       batch,
                                                       batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 0.3964
Epoch 1 Batch 25 Loss 0.3881
Epoch 1 Batch 50 Loss 0.3742
Epoch 1 Batch 75 Loss 0.3396
Epoch 1 Batch 100 Loss 0.3848


In [8]:
def evaluate_subject(sentence):
    sentence = dataset_creator.preprocess_sentence(sentence)

    inputs = [sub_tokenizer.word_index[i] for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen=max_length_input,
                                                          padding='post')
    inputs = tf.convert_to_tensor(inputs)
    inference_batch_size = inputs.shape[0]
    result = ''

    enc_start_state = [tf.zeros((inference_batch_size, units)), tf.zeros((inference_batch_size,units))]
    enc_out, enc_h, enc_c = encoder(inputs, enc_start_state)

    dec_h = enc_h
    dec_c = enc_c

    start_tokens = tf.fill([inference_batch_size], targ_lang.word_index['<start>'])
    end_token = targ_lang.word_index['<end>']

    greedy_sampler = tfa.seq2seq.GreedyEmbeddingSampler()

    # Instantiate BasicDecoder object
    decoder_instance = tfa.seq2seq.BasicDecoder(cell=decoder.rnn_cell, sampler=greedy_sampler, output_layer=decoder.fc)
    # Setup Memory in decoder stack
    decoder.attention_mechanism.setup_memory(enc_out)

    # set decoder_initial_state
    decoder_initial_state = decoder.build_initial_state(inference_batch_size, [enc_h, enc_c], tf.float32)


    ### Since the BasicDecoder wraps around Decoder's rnn cell only, you have to ensure that the inputs to BasicDecoder 
    ### decoding step is output of embedding layer. tfa.seq2seq.GreedyEmbeddingSampler() takes care of this. 
    ### You only need to get the weights of embedding layer, which can be done by decoder.embedding.variables[0] and pass this callabble to BasicDecoder's call() function

    decoder_embedding_matrix = decoder.embedding.variables[0]

    outputs, _, _ = decoder_instance(decoder_embedding_matrix, start_tokens = start_tokens, end_token= end_token, initial_state=decoder_initial_state)
    return outputs.sample_id.numpy()

def predict_sub(sentence):
    result = evaluate_subject(sentence)
    print(result)
    result = sub_tokenizer.sequences_to_texts(result)
    print('Input: %s' % (sentence))
    print('Predicted translation: {}'.format(result))

In [9]:
predict_sub(u'join us for a conversation is a recovery in sight ? lessons from around the world.')

NameError: name 'dataset_creator' is not defined

# Trial-6