## Data Processing: Loading in reviews data and process with GloVe

In [1]:
##Import pandas and set display option
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format

In [2]:
##load Amazon reviews dataset
df = pd.read_csv('Reviews.csv')
df.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
#dropping empty cells and dupes 
text_summary = df.filter(items =['Summary','Text'])
text_summary = text_summary.dropna()
text_summary = text_summary.drop_duplicates()

In [4]:
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [5]:
##Data cleaning to remove unwanted symbols
import re
def clean_text(sentences):
    clean=[]
    
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''    
    for sentence in sentences:
        for key in contractions.keys():
            sentence = sentence.lower().replace(key, contractions[key])
        sentence = sentence.lower()
        sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
        sentence = re.sub(r'\<a href', ' ', sentence)
        sentence = re.sub(r'&amp;', '', sentence) 
        sentence = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', sentence)
        sentence = re.sub(r'<br />', ' ', sentence)
        sentence = re.sub(r'\'', ' ', sentence)
        
        words = [word for word in sentence.split()]
        clean.append(words)
    
    return clean

In [6]:
# Clean the summaries and texts
Texts = clean_text(text_summary["Text"])
Summaries = clean_text(text_summary["Summary"])

In [7]:
## Verify that one entry of the clearned data
Texts[1]

['product',
 'arrived',
 'labeled',
 'as',
 'jumbo',
 'salted',
 'peanuts',
 'the',
 'peanuts',
 'were',
 'actually',
 'small',
 'sized',
 'unsalted',
 'not',
 'sure',
 'if',
 'this',
 'was',
 'an',
 'error',
 'or',
 'if',
 'the',
 'vendor',
 'intended',
 'to',
 'represent',
 'the',
 'product',
 'as',
 'jumbo']

In [8]:
##Function to count word frequency
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence:
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [9]:
## Apply functino above to generate the word frequency for each word and get vocab size in dataset
word_counts = {}

count_words(word_counts, Summaries)
count_words(word_counts, Texts)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 132873


In [10]:
## Understand how long summaries and texts generally are, so that we can exclude outliers later
def create_lengths(text):
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [11]:
##printing for verification
lengths_summaries = create_lengths(Summaries)
lengths_texts = create_lengths(Texts)

print("Summaries:")
print(lengths_summaries.describe())
print()
print("Texts:")
print(lengths_texts.describe())

Summaries:
         counts
count 394967.00
mean       4.16
std        2.65
min        0.00
25%        2.00
50%        4.00
75%        5.00
max       48.00

Texts:
         counts
count 394967.00
mean      80.58
std       77.55
min        0.00
25%       34.00
50%       57.00
75%       99.00
max     3540.00


In [12]:
# import wget
# wget.download("http://nlp.stanford.edu/data/glove.6B.zip")
# import zipfile
# zf=zipfile.ZipFile("glove.6B.zip")
# zf.extractall()

In [13]:
# Load GloVe
#Can select from 50, 100, 200 and 300 for dimensions
import numpy as np
def parse_glove(dimension):
    filename = "glove.6B.{:d}d.txt".format(dimension)
    embeddings_index={}
    file = open(filename,'r')
    for line in file.readlines():
        row=line.strip().split(' ')
        word = row[0]
        embedding = np.asarray(row[1:], dtype='float32')
        embeddings_index[word] = embedding
    print('Word embeddings:', len(embeddings_index))
    file.close()
    return embeddings_index

In [14]:
embedding_dim = 200 #set default embedding dimension to 200 
embeddings_index = parse_glove(embedding_dim)

Word embeddings: 400000


In [15]:
## Find the number of words that are missing from our data from GloVe, and appears more often than our set threshold
missing_words = 0
threshold = 10

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from GloVe:", missing_words)
print("Percent of words that are missing|: {}%".format(missing_ratio))

Number of words missing from GloVe: 4171
Percent of words that are missing|: 3.1399999999999997%


In [16]:
## create mapping to map each word to an integer ID
word_to_id = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        word_to_id[word] = value
        value += 1

## Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

## Add tokens to vocab
for code in codes:
    word_to_id[code] = len(word_to_id)

## Create mappingn for ID to word
id_to_word = {}
for word, value in word_to_id.items():
    id_to_word[value] = word


print("Number of unique words:", len(word_counts))
print("Number of words we will use:", len(word_to_id))

Number of unique words: 132873
Number of words we will use: 65342


In [17]:
## function to find nearest neighbor for unknown words
W = list(embeddings_index.values())
vocab = list(embeddings_index.keys())
def nearest_neighbor(word):
    v= embeddings_index[word]
    dot_product = np.dot(W,v)
    A = np.sqrt(np.sum(np.square(v),0))
    B = np.sqrt(np.sum(np.square(W),1))
    denominator = np.multiply(A,B)
    cosine_similarities = np.divide(dot_product,denominator)
    v_sim = W[np.argmax(cosine_similarities)]
    for x in range(0,len(W)):
        if np.array_equal(W[x],np.asarray(v_sim)):
            return vocab[x]

In [18]:
##Createa a reduced embedding dictionary
word_embedding_matrix = np.zeros((len(word_to_id), embedding_dim), dtype=np.float32)
for word, i in word_to_id.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in GloVe, we start with a randomized embedding
        rand_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = rand_embedding
        word_embedding_matrix[i] = rand_embedding
## We also attempted to use the nearest neighbor for unknown words (function above)
## We found this reduces speed rather significantly, and given the low % of NNs, we proceeded with the randomized embedding

In [19]:
## Convert words to IDs (we do not convert directly to embeddings to increase the performance)
def convert_to_ids(text, word_n, unk_n, eos=False): 
    ids = []
    for sentence in text:
        sentence_ids = []
        for word in sentence:
            word_n += 1
            if word in word_to_id:
                sentence_ids.append(word_to_id[word])
            else:
                sentence_ids.append(word_to_id["<UNK>"])
                unk_n += 1
        if eos:
            sentence_ids.append(word_to_id["<EOS>"])
        ids.append(sentence_ids)
    return ids, word_n, unk_n

In [20]:
## Apply the convert_to_ids function
word_n = 0
unk_n = 0

id_summaries, word_n, unk_n_sum = convert_to_ids(Summaries, word_n, unk_n)
id_texts, word_n, unk_n_text = convert_to_ids(Texts, word_n, unk_n, eos=True)
unk_percent = round((unk_n_sum+unk_n_text)/word_n,4)*100

print("Percent of UNK: {}%".format(unk_percent))

Percent of UNK: 0.35000000000000003%


In [21]:
def count_unk(text):
    unk_n = text.count(word_to_id["<UNK>"])
    return unk_n

In [22]:
# Sort the sentences by length to reduce padding
sorted_id_summaries = []
sorted_id_texts = []
max_text_length = 100 #75% percentile
min_summary_length = 2 #want to produce summaries more than 2 words
min_text_length = 10 #ensure meaningful text worth summarizing
max_unk_text = 2
max_unk_summary = 0

for length in range(min(lengths_texts.counts), max_text_length): 
    for i, words in enumerate(id_summaries):
        if (len(id_summaries[i]) >= min_summary_length and
            len(id_texts[i])>= min_text_length and
            len(id_texts[i])<= max_text_length and
            count_unk(id_summaries[i])<=max_unk_summary and
            count_unk(id_texts[i])<=max_unk_text and
            len(id_texts[i]) == length):
            
            sorted_id_summaries.append(id_summaries[i])
            sorted_id_texts.append(id_texts[i])
            
print(len(sorted_id_summaries))


251405


In [23]:
## Saving the convered IDs for texts and summaries so we don't have to rerun data processing again
np.save("id_summaries",sorted_id_summaries)
np.save("id_texts",sorted_id_texts)

In [24]:
## Use when loading saved data
# id_summaries = np.load('id_summaries.npy')
# id_texts = np.load('id_texts.npy')

## Model Building

In [25]:
#!pip install --upgrade tensorflow
import tensorflow as tf
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.10.0


In [26]:
## function to create tf.placeholders for hyper parameters and model inputs
def model_inputs():
    
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [27]:
## Function to process input for decoding input (adding <GO> to begining of batch)
def process_decoding_input(target_data, word_to_id, batch_size):
    
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], word_to_id['<GO>']), ending], 1)

    return dec_input

In [28]:
## Create encoding layer function - can choose between bi-direction or one direction LSTM
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob, direction):
    '''Create the encoding layer'''
    
    if direction == 1:
        with tf.name_scope("RNN_Encoder_Cell_1D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    lstm = tf.contrib.rnn.LSTMCell(rnn_size)

                    drop = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                         input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.dynamic_rnn(drop, 
                                                              rnn_inputs,
                                                              sequence_length,
                                                              dtype=tf.float32)

            return enc_output, enc_state
        
        
    if direction == 2:
        with tf.name_scope("RNN_Encoder_Cell_2D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    cell_fw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                            input_keep_prob = keep_prob)

                    cell_bw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                            input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                            cell_bw, 
                                                                            rnn_inputs,
                                                                            sequence_length,
                                                                            dtype=tf.float32)
            enc_output = tf.concat(enc_output,2)
            return enc_output, enc_state[0]

In [29]:
## Create training logits
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_summary_length):
    
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer) 

    training_logits, _ ,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)    
    
    return training_logits

In [30]:

def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)
                
    inference_logits, _ ,_  = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    
    return inference_logits

In [31]:
## Decoding layer with attention (Bahdanau) for training and inference
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, 
                   max_summary_length, rnn_size, word_to_id, keep_prob, batch_size, num_layers, direction):
    
    with tf.name_scope("RNN_Decoder_Cell"):
        for layer in range(num_layers):
            with tf.variable_scope('decoder_{}'.format(layer)):
                lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  text_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')

    with tf.name_scope("Attention_Wrapper"):
        dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,attn_mech,rnn_size)
            
        
    initial_state = dec_cell.zero_state(batch_size,dtype=tf.float32)
    initial_state = initial_state.clone(cell_state = enc_state)
    
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, 
                                                  summary_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,  
                                                    word_to_id['<GO>'], 
                                                    word_to_id['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)

    return training_logits, inference_logits

In [32]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, word_to_id, batch_size, direction):

    embeddings = word_embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob, direction)
    
    dec_input = process_decoding_input(target_data, word_to_id, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        word_to_id, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers, direction)
    
    return training_logits, inference_logits

In [33]:
## Create padding so each batch has the same sentence length
def pad_sentence_batch(sentence_batch):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [word_to_id['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [34]:
def get_batches(summaries, texts, batch_size):
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

In [35]:
# Set hyperparameters
epochs = 5 ## We initially wanted to run for 10+ epochs, but couldn't afford to do to extremely long traning time
batch_size = 32
rnn_size = 128
num_layers = 1
learning_rate = 0.008
keep_probability = 0.8
direction = 2

In [36]:
# Build the graph
train_graph = tf.Graph()
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(word_to_id)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      word_to_id,
                                                      batch_size, 
                                                      direction)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("cost"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits,targets,masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Success")

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
Success


In [37]:
## Create a training subset
start = 0
end = start + 10000
Summaries_short = id_summaries[start:end]
Texts_short = id_texts[start:end]


In [52]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 10 # Check training loss
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(Texts_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model


with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(Summaries_short, Texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(Texts_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('Improved Result') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    save_path = saver.save(sess, "./Checkpoint/model.ckpt")

                else:
                    print("No Improvement")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/5 Batch   10/312 - Loss:  6.567, Seconds: 152.67
Epoch   1/5 Batch   20/312 - Loss:  3.298, Seconds: 133.78
Epoch   1/5 Batch   30/312 - Loss:  3.480, Seconds: 70.72
Epoch   1/5 Batch   40/312 - Loss:  2.920, Seconds: 240.64
Epoch   1/5 Batch   50/312 - Loss:  2.673, Seconds: 145.80
Epoch   1/5 Batch   60/312 - Loss:  2.561, Seconds: 91.69
Epoch   1/5 Batch   70/312 - Loss:  2.737, Seconds: 91.72
Epoch   1/5 Batch   80/312 - Loss:  2.966, Seconds: 91.75
Epoch   1/5 Batch   90/312 - Loss:  2.584, Seconds: 196.65
Epoch   1/5 Batch  100/312 - Loss:  2.760, Seconds: 122.95
Average loss for this update: 3.235
New Record!
Epoch   1/5 Batch  110/312 - Loss:  2.563, Seconds: 123.32
Epoch   1/5 Batch  120/312 - Loss:  2.721, Seconds: 165.00
Epoch   1/5 Batch  130/312 - Loss:  2.741, Seconds: 103.17
Epoch   1/5 Batch  140/312 - Loss:  2.638, Seconds: 132.88
Epoch   1/5 Batch  150/312 - Loss:  2.725, Seconds: 91.29
Epoch   1/5 Batch  160/312 - Loss:  2.541, Seconds: 101.86
Epoch   1/5 B

Epoch   5/5 Batch   70/312 - Loss:  1.715, Seconds: 153.01
Epoch   5/5 Batch   80/312 - Loss:  1.823, Seconds: 152.08
Epoch   5/5 Batch   90/312 - Loss:  1.625, Seconds: 323.80
Epoch   5/5 Batch  100/312 - Loss:  1.761, Seconds: 202.95
Average loss for this update: 1.864
No Improvement.
Epoch   5/5 Batch  110/312 - Loss:  1.684, Seconds: 203.31
Epoch   5/5 Batch  120/312 - Loss:  1.772, Seconds: 272.91
Epoch   5/5 Batch  130/312 - Loss:  1.780, Seconds: 169.90
Epoch   5/5 Batch  140/312 - Loss:  1.765, Seconds: 220.28
Epoch   5/5 Batch  150/312 - Loss:  1.776, Seconds: 153.83
Epoch   5/5 Batch  160/312 - Loss:  1.675, Seconds: 168.85
Epoch   5/5 Batch  170/312 - Loss:  1.855, Seconds: 203.74
Epoch   5/5 Batch  180/312 - Loss:  1.614, Seconds: 235.69
Epoch   5/5 Batch  190/312 - Loss:  1.790, Seconds: 189.45
Epoch   5/5 Batch  200/312 - Loss:  1.743, Seconds: 418.95
Average loss for this update: 1.739
New Record!
Epoch   5/5 Batch  210/312 - Loss:  1.699, Seconds: 173.16
Epoch   5/5 Bat

In [38]:
## Pull a random sentence from the unseen text in data and see the prediction
random = np.random.randint(end,len(Texts))
input_sentence = Texts[random]
text = [word_to_id.get(word, word_to_id['<UNK>']) for word in input_sentence]

checkpoint = "./Checkpoint/model.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(5,8)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 

# Remove the padding from output
pad = word_to_id["<PAD>"] 

print('Original Text:', input_sentence)

print('Original Text:', input_sentence)
print('  Input Words: {}'.format(" ".join([id_to_word[i] for i in text])))

print('\Generated Summary')
print('  Response Words: {}'.format(" ".join([id_to_word[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./Checkpoint/model.ckpt
Original Text: ['i', 'purchased', 'this', 'primarily', 'because', 'dr', 'oz', 'promoted', 'it', 'it', 'is', 'ok', 'but', 'tastes', 'just', 'like', 'the', 'cheap', 'puffed', 'rice', 'from', 'the', 'grocery', 'and', 'it', 'is', 'pricey', 'for', 'this', 'kamut', 'maybe', 'it', 'is', 'super', 'good', 'for', 'you', 'but', '6', 'oz', 'is', 'not', 'a', 'lot']
Original Text: ['i', 'purchased', 'this', 'primarily', 'because', 'dr', 'oz', 'promoted', 'it', 'it', 'is', 'ok', 'but', 'tastes', 'just', 'like', 'the', 'cheap', 'puffed', 'rice', 'from', 'the', 'grocery', 'and', 'it', 'is', 'pricey', 'for', 'this', 'kamut', 'maybe', 'it', 'is', 'super', 'good', 'for', 'you', 'but', '6', 'oz', 'is', 'not', 'a', 'lot']
  Input Words: i purchased this primarily because dr oz promoted it it is ok but tastes just like the cheap puffed rice from the grocery and it is pricey for this kamut maybe it is super good for you but 6 oz is not a lot
\G

## Rouge Metrics

In [39]:
# install the library
!pip install rouge

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[31mtimestring 1.6.2 has requirement pytz==2013b, but you'll have pytz 2018.4 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [40]:
summary_sentence = Summaries[random]
summary_text = [word_to_id.get(word, word_to_id['<UNK>']) for word in summary_sentence]
original_summary = " ".join([id_to_word[i] for i in summary_text])
generated_summary = " ".join([id_to_word[i] for i in answer_logits if i != pad])
print(original_summary)
print(generated_summary)
# original_summary = "this tea is my new alternative after jasmine green tea it is so smooth and light ican drink it anytime"
# generated_summary = "great tea"

ho uhm
not vegetarian


In [41]:
from rouge import Rouge
rouge_calculator = Rouge()
scores = rouge_calculator.get_scores(original_summary, generated_summary)

In [42]:
# Pretty-print the results
#note that this is just for ONE original and generated summary. 
for metric in scores:
    for metric_name, metric_vals in metric.items():
        print("\nMetric Name: {}\nPrecision: {}\nRecall: {}\nF-score: {}".format(
                metric_name, metric_vals['p'], metric_vals['r'], metric_vals['f']))


Metric Name: rouge-1
Precision: 0.0
Recall: 0.0
F-score: 0.0

Metric Name: rouge-2
Precision: 0.0
Recall: 0.0
F-score: 0.0

Metric Name: rouge-l
Precision: 0.0
Recall: 0.0
F-score: 0.0


# ROUGE Metrics (Aggregate)


In [43]:
## Select test texts and infer predicted summaries
##size / number of summaries can be edited to get an idea of overall efficiency of set 
size = 100
test_list = np.random.randint(10000,len(Texts),size = size)
summary_tests = [Summaries[i] for i in test_list]
text_tests = [Texts[i] for i in test_list]
Predicted_Summaries = []
checkpoint = "./Checkpoint/model.ckpt"
pad = word_to_id["<PAD>"] 

text_id_tests = []
for text in text_tests:
    ids = [word_to_id.get(word, word_to_id['<UNK>']) for word in text]
    text_id_tests.append(ids)



In [44]:
len(text_id_tests)

100

In [45]:
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    for text in text_id_tests:
        if len(text) == 0:
            Predicted_Summaries.append('')            
        else:
            answer = sess.run(logits, {input_data: [text]*batch_size, 
                                          summary_length: [np.random.randint(5,8)], 
                                          text_length: [len(text)]*batch_size,
                                          keep_prob: 1.0})[0] 
            pred = [id_to_word[i] for i in answer if i != pad]
            Predicted_Summaries.append(pred)


INFO:tensorflow:Restoring parameters from ./Checkpoint/model.ckpt


In [46]:
Predicted_Summaries

[['great', 'coffee'],
 ['great', 'product'],
 ['great', 'tea'],
 ['great', 'pecans'],
 ['great', 'standby'],
 ['green', 'tea'],
 ['great', 'product'],
 ['great', 'pecans'],
 ['great', 'pecans'],
 ['great', 'tea'],
 ['great', 'snack'],
 ['these', 'are', 'wonderful'],
 ['abit', 'vegetable', 'beef', 'rinds'],
 ['great', 'product'],
 ['great', 'tasting'],
 ['great', 'product'],
 ['she', 'loved', 'it'],
 ['great', 'for', 'making'],
 ['works', 'well'],
 ['great', 'taste'],
 ['great', 'product'],
 ['my', 'dog', 'loves', 'this', 'stuff'],
 ['great', 'product'],
 ['great', 'tasting'],
 ['spearmint', 'gum'],
 ['pamela', 's', 'bread', 'mix'],
 ['great', 'product'],
 ['best', 'tasting', 'ever'],
 ['great', 'product'],
 ['great', 'product'],
 ['great', 'product'],
 ['great', 'standby'],
 ['great', 'product'],
 ['great', 'tea'],
 ['great', 'product'],
 ['great', 'product'],
 ['i', 'expected'],
 ['great', 'pecans'],
 ['cat', 'food'],
 ['cats', 'love', 'this', 'stuff'],
 ['worthless', 'the', 'bounty',

In [47]:
rouge_n = 1
ngram_o = []
for sent in summary_tests:
    ngram_set_o = set()
    text_length = len(sent)
    max_index_ngram_start = text_length - rouge_n
    for i in range(max_index_ngram_start + 1):
        ngram_set_o.add(tuple(sent[i:i + rouge_n]))
    ngram_o.append(ngram_set_o)

In [48]:
ngram_p = []
for sent in Predicted_Summaries:
    ngram_set_p = set()
    text_length = len(sent)
    max_index_ngram_start = text_length - rouge_n
    for i in range(max_index_ngram_start + 1):
        ngram_set_p.add(tuple(sent[i:i + rouge_n]))
    ngram_p.append(ngram_set_p)

In [49]:
f1_scores = []
precisions = []
recalls = []

for i in range(len(ngram_o)):
    overlap = ngram_o[i].intersection(ngram_p[i])
    overlap_n = len(overlap)
    o_n = len(ngram_o[i])
    p_n = len(ngram_p[i])
    
    if p_n == 0:
        precision = 0.0
    else:
        precision = overlap_n / p_n

    if o_n == 0:
        recall = 0.0
    else:
        recall = overlap_n / o_n

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
    
    f1_scores.append(f1_score)
    precisions.append(precision)
    recalls.append(recall)

In [50]:
avg_rouge_f1 = np.mean(f1_scores)
avg_rouge_precision = np.mean(precisions)
avg_rouge_recall = np.mean(recalls)

In [51]:
print(summary_tests)



In [52]:
print(Predicted_Summaries)

[['great', 'coffee'], ['great', 'product'], ['great', 'tea'], ['great', 'pecans'], ['great', 'standby'], ['green', 'tea'], ['great', 'product'], ['great', 'pecans'], ['great', 'pecans'], ['great', 'tea'], ['great', 'snack'], ['these', 'are', 'wonderful'], ['abit', 'vegetable', 'beef', 'rinds'], ['great', 'product'], ['great', 'tasting'], ['great', 'product'], ['she', 'loved', 'it'], ['great', 'for', 'making'], ['works', 'well'], ['great', 'taste'], ['great', 'product'], ['my', 'dog', 'loves', 'this', 'stuff'], ['great', 'product'], ['great', 'tasting'], ['spearmint', 'gum'], ['pamela', 's', 'bread', 'mix'], ['great', 'product'], ['best', 'tasting', 'ever'], ['great', 'product'], ['great', 'product'], ['great', 'product'], ['great', 'standby'], ['great', 'product'], ['great', 'tea'], ['great', 'product'], ['great', 'product'], ['i', 'expected'], ['great', 'pecans'], ['cat', 'food'], ['cats', 'love', 'this', 'stuff'], ['worthless', 'the', 'bounty', 'candy'], ['great', 'coffee'], ['great'

In [53]:
#general average of selected number of generated summaries against corresponding summaries from data set
print(avg_rouge_f1)
print(avg_rouge_precision)
print(avg_rouge_recall)

0.0637986449671
0.1025
0.0486507936508


In [54]:
# Examples
example_n = 5
ex_summary = []
ex_text = []
pred_summary = []

for i in summary_tests[:example_n]:
    ex_summary.append(" ".join(i))

for j in text_tests[:example_n]:
    ex_text.append(" ".join(j))
    
for k in Predicted_Summaries[:example_n]:
    pred_summary.append(" ".join(k))

In [55]:
examples = pd.DataFrame({
    'Summary': ex_summary,
    'Predicted Summary': pred_summary,
    'Text': ex_text})

In [56]:
pd.set_option('display.max_colwidth', 0)
examples

Unnamed: 0,Summary,Predicted Summary,Text
0,used to love now disappointed,great coffee,i used to love this decaf was my favorite great flavor non bitter but good flavor bordering on strong which i liked the last carton i received through amazon com is weak i can only use the small cup setting to make it okay i used to use the larger setting and add a little extra water has the amount of coffee in the k cups been reduced if the next batch is the same i will be looking for another brand
1,good but still having gas,great product,my twins took this formula since they were born sometimes they having gas with this kind of formula sensitive i think they are no miracles formula
2,cinnamon way too strong,great tea,i love peach tea and anything peach flavored but this has has a cinnamon flavor that is too strong i wanted so badly to give it 5 stars but i would not be honest if i did i do not like the cinnamon flavor to me peach and cinnamon do not belong together peach is summer and cinnamon is winter this tea might be better in the winter as hot tea but then again not because peach is too summery cinnamon would go better with apple flavor as soon as i finish this box i will not buy it again <br ><br >the cinnamon is way too strong it made me sick at my stomach and the aftertaste is horrible
3,flavorful seeds,great pecans,i am absolutely thrilled to have 2 lbs of the frontier natural products whole cumin seed 16 ounce bags pack of 2 that i purchased directly from amazon they are delicious and flavorful since these are the whole seeds they should remain tasty for years i also have enough extra to make spice mixes for gifts
4,a nice mellow wash of turmeric,great standby,firstly i may be reviewing a slightly different nguan soon curry product than what is being sold i normally buy this at a wonderful thai owned asian market in lexington my bottle from nguan soon is the same as the picture except the labeling languages are slightly different i believe there is vietnamese on mine instead as it says b t curry right under curry powder after googling the word and translating from vietnamese to french to english it appears to mean flour or powder so that is my cautionary notice should you buy this powder be aware that my bottle lists as its ingredients the following turmeric coriander cumin chili salt and pepper it does not mention garlic at all on my bottle but looks identical color wise and the graphic appears the same <br ><br >i am not entirely sure how to describe the taste other than to compare it to other curries this particular variety lacks the bite of the other nguan soon curries or other branded thai curry pastes that you may be familiar with from shopping in your local asian market more than likely you can buy this there as well it is a mellower darker colored brownish curry this is not your average madras yellow curry not that there is anything wrong with madras curry i keep both of these on my shelves and enjoy several other varieties <br ><br >my wife complains about the high turmeric smell of this one where she is not bothered by the madras variety overly much however the distinctive brown color this has over yellow leads me to wonder if the ratio of another ingredient is greater than the turmeric even though it is not as persistent a smell i use this in sandwiches quite often and just about anything else for its mellow contrast to the standard yellow curry i normally have access to locally i am probably the closest anyone in this state comes to be being lister from red dwarf while still being alive and lacking the curry stained clothing happy eating


In [57]:
rouge_df = pd.DataFrame({
    'Average F1':avg_rouge_f1,
    'Average Precision': avg_rouge_precision,
    'Average Recalls': avg_rouge_recall}, index=[''])

In [58]:
rouge_df

Unnamed: 0,Average F1,Average Precision,Average Recalls
,0.06,0.1,0.05


In [None]:
# Set hyperparameters
epochs = 5 ## We initially wanted to run for 10+ epochs, but couldn't afford to do to extremely long traning time
batch_size = 32
rnn_size = 128
num_layers = 1
learning_rate = 0.008
keep_probability = 0.8
direction = 2

In [None]:
# Set hyperparameters, wikihow
epochs = 3 ## We initially wanted to run for 10+ epochs, but couldn't afford to do to extremely long traning time
batch_size = 32
rnn_size = 128
num_layers = 1
learning_rate = 0.008
keep_probability = 0.8
direction = 2

In [65]:
hyperparam_df = pd.DataFrame({
    'Amazon Review':[5, 32, 128, 1, 0.008, 0.8, 2],
    'Wikihow': [3, 32, 128, 1, 0.008, 0.8, 2]}, 
    index = ['Epochs', 'Batch size', 'RNN Size', 
             'Num of Layers', 'Learning Rate', 
             'Keep Probability', 'Direction']).astype(int)

In [66]:
hyperparam_df

Unnamed: 0,Amazon Review,Wikihow
Epochs,5,3
Batch size,32,32
RNN Size,128,128
Num of Layers,1,1
Learning Rate,0,0
Keep Probability,0,0
Direction,2,2


In [74]:
# Other models from research papers, and their ROUGE metrics
paper_ROUGE = pd.DataFrame({
    'ROUGE-1':[39.2, 39.6, 35.46, 37.86, 38.30, 41.16, 39.87]}, 
    index = ['Lead-3 (Nallapati et al., 2017)', 'SummaRuNNer (Nallapati et al., 2017)',
            'words-lvt2k-temp-att (Nallapati et al., 2016)', 'ML, no intra-attention (Paulus et al., 2017)',
             'ML, with intra-attention (Paulus et al., 2017)', 'RL, with intra-attention (Paulus et al., 2017)', 
             'ML+RL, with intra-attention (Paulus et al., 2017)'])

In [75]:
paper_ROUGE

Unnamed: 0,ROUGE-1
"Lead-3 (Nallapati et al., 2017)",39.2
"SummaRuNNer (Nallapati et al., 2017)",39.6
"words-lvt2k-temp-att (Nallapati et al., 2016)",35.46
"ML, no intra-attention (Paulus et al., 2017)",37.86
"ML, with intra-attention (Paulus et al., 2017)",38.3
"RL, with intra-attention (Paulus et al., 2017)",41.16
"ML+RL, with intra-attention (Paulus et al., 2017)",39.87
