## Data Processing: Loading in reviews data and process with GloVe

In [16]:
##Import pandas and set display option
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format

In [2]:
##Load wikihow dataset
df2 = pd.read_csv("wikihowSep.csv")
df2.head(5)

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [3]:
text_summary_wikihow = df2.filter(items =['headline','text'])
text_summary_wikihow = text_summary_wikihow.dropna()
text_summary_wikihow = text_summary_wikihow.drop_duplicates()
text_summary_wikihow = text_summary_wikihow.rename(columns = {'headline':'Summary', 'text':'Text'})

In [4]:
#get an idea of the summaries and text columns 
text_summary_wikihow.head(5)

Unnamed: 0,Summary,Text
0,\nSell yourself first.,"Before doing anything else, stop and sum up y..."
1,\nRead the classics before 1600.,Reading the classics is the very first thing ...
2,\nJoin online artist communities.,Depending on what scale you intend to sell yo...
3,\nMake yourself public.,Get yourself out there as best as you can by ...
4,\nBlog about your artwork.,"Given the hundreds of free blogging websites,..."


In [5]:
# https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [6]:
##Data cleaning to remove unwanted symbols
import re
def clean_text(sentences):
    clean=[]
    
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''    
    for sentence in sentences:
        for key in contractions.keys():
            sentence = sentence.lower().replace(key, contractions[key])
        sentence = sentence.lower()
        sentence = re.sub(r'https?:\/\/.*[\r\n]*', '', sentence, flags=re.MULTILINE)
        sentence = re.sub(r'\<a href', ' ', sentence)
        sentence = re.sub(r'&amp;', '', sentence) 
        sentence = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', sentence)
        sentence = re.sub(r'<br />', ' ', sentence)
        sentence = re.sub(r'\'', ' ', sentence)
        
        words = [word for word in sentence.split()]
        clean.append(words)
    
    return clean

In [7]:
Texts_wikihow = clean_text(text_summary_wikihow['Text'])
Summary_wikihow = clean_text(text_summary_wikihow['Summary'])

In [8]:
## Check the size of the cleaned data
len(Summary_wikihow)

1375354

In [9]:
## Verify that one entry of the clearned data
Texts_wikihow[1]

['reading',
 'the',
 'classics',
 'is',
 'the',
 'very',
 'first',
 'thing',
 'you',
 'have',
 'to',
 'do',
 'to',
 'be',
 'well',
 'read',
 'if',
 'you',
 'want',
 'to',
 'build',
 'a',
 'solid',
 'foundation',
 'for',
 'your',
 'understanding',
 'of',
 'the',
 'books',
 'you',
 'read',
 'then',
 'you',
 'cannot',
 'avoid',
 'some',
 'of',
 'the',
 'earliest',
 'plays',
 'poems',
 'and',
 'oral',
 'tales',
 'ever',
 'written',
 'down',
 'remember',
 'that',
 'the',
 'novel',
 'did',
 'not',
 'really',
 'get',
 'popular',
 'until',
 'the',
 '18th',
 'century',
 'so',
 'you',
 'will',
 'not',
 'find',
 'novels',
 'on',
 'this',
 'list',
 'without',
 'reading',
 'the',
 'poetry',
 'of',
 'homer',
 'or',
 'the',
 'plays',
 'of',
 'sophocles',
 'you',
 'will',
 'not',
 'be',
 'able',
 'to',
 'call',
 'yourself',
 'well',
 'read',
 'here',
 's',
 'a',
 'list',
 'to',
 'get',
 'you',
 'started',
 'the',
 'epic',
 'of',
 'gilgamesh',
 'unknown',
 'author',
 '18th',
 '–',
 '17th',
 'century',


In [11]:
Summary_wikihow[1]

['read', 'the', 'classics', 'before', '1600']

In [12]:
##Function to count word frequency
def count_words(count_dict, text):
    for sentence in text:
        for word in sentence:
            if word not in count_dict:
                count_dict[word] = 1
            else:
                count_dict[word] += 1

In [13]:
## Apply functino above to generate the word frequency for each word and get vocab size in dataset
word_counts = {}

count_words(word_counts, Summary_wikihow)
count_words(word_counts, Texts_wikihow)
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 268289


In [14]:
## Understand how long summaries and texts generally are, so that we can exclude outliers later
def create_lengths(text):
    lengths = []
    for sentence in text:
        lengths.append(len(sentence))
    return pd.DataFrame(lengths, columns=['counts'])

In [17]:
lengths_summaries = create_lengths(Summary_wikihow)
lengths_texts = create_lengths(Texts_wikihow)

print("Summaries:")
print(lengths_summaries.describe())
print()
print("Texts:")
print(lengths_texts.describe())

Summaries:
          counts
count 1375354.00
mean        7.44
std         6.17
min         0.00
25%         4.00
50%         6.00
75%         9.00
max      2952.00

Texts:
          counts
count 1375354.00
mean       68.99
std        54.85
min         0.00
25%        30.00
50%        57.00
75%        94.00
max      2849.00


In [26]:
# Load Glove
#Can select from 50, 100, 200 and 300 for dimensions
import numpy as np
def parse_glove(dimension):
    filename = "glove.6B.{:d}d.txt".format(dimension)
    embeddings_index={}
    file = open(filename,'r')
    for line in file.readlines():
        row=line.strip().split(' ')
        word = row[0]
        embedding = np.asarray(row[1:], dtype='float32')
        embeddings_index[word] = embedding
    print('Word embeddings:', len(embeddings_index))
    file.close()
    return embeddings_index

In [27]:
embedding_dim = 200 #set dimension to 200
embeddings_index = parse_glove(embedding_dim)

Word embeddings: 400000


In [28]:
## Find the number of words that are missing from our data from GloVe, and appears more often than our set threshold
missing_words = 0
threshold = 10

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from GloVe:", missing_words)
print("Percent of words that are missing: {}%".format(missing_ratio))

Number of words missing from GloVe: 8782
Percent of words that are missing: 3.27%


In [29]:
## create mapping to map each word to an integer ID
word_to_id = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        word_to_id[word] = value
        value += 1

## Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>","<EOS>","<GO>"]   

## Add tokens to vocab
for code in codes:
    word_to_id[code] = len(word_to_id)

## Create mappingn for ID to word
id_to_word = {}
for word, value in word_to_id.items():
    id_to_word[value] = word


print("Number of unique words:", len(word_counts))
print("Number of words we will use:", len(word_to_id))

Number of unique words: 268289
Number of words we will use: 112898


In [30]:
## function to find nearest neighbor for unknown words
W = list(embeddings_index.values())
vocab = list(embeddings_index.keys())
def nearest_neighbor(word):
    v= embeddings_index[word]
    dot_product = np.dot(W,v)
    A = np.sqrt(np.sum(np.square(v),0))
    B = np.sqrt(np.sum(np.square(W),1))
    denominator = np.multiply(A,B)
    cosine_similarities = np.divide(dot_product,denominator)
    v_sim = W[np.argmax(cosine_similarities)]
    for x in range(0,len(W)):
        if np.array_equal(W[x],np.asarray(v_sim)):
            return vocab[x]

In [31]:
##Createa a reduced embedding dictionary
word_embedding_matrix = np.zeros((len(word_to_id), embedding_dim), dtype=np.float32)
for word, i in word_to_id.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in GloVe, we start with a randomized embedding
        rand_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = rand_embedding
        word_embedding_matrix[i] = rand_embedding
## We also attempted to use the nearest neighbor for unknown words (function above)
## We found this reduces speed rather significantly, and given the low % of NNs, we proceeded with the randomized embedding

In [32]:
## Convert words to IDs (we do not convert directly to embeddings to increase the performance)
def convert_to_ids(text, word_n, unk_n, eos=False): 
    ids = []
    for sentence in text:
        sentence_ids = []
        for word in sentence:
            word_n += 1
            if word in word_to_id:
                sentence_ids.append(word_to_id[word])
            else:
                sentence_ids.append(word_to_id["<UNK>"])
                unk_n += 1
        if eos:
            sentence_ids.append(word_to_id["<EOS>"])
        ids.append(sentence_ids)
    return ids, word_n, unk_n

In [33]:
## Apply the convert_to_ids function
word_n = 0
unk_n = 0

id_summaries, word_n, unk_n_sum = convert_to_ids(Summary_wikihow, word_n, unk_n)
id_texts, word_n, unk_n_text = convert_to_ids(Texts_wikihow, word_n, unk_n, eos=True)
unk_percent = round((unk_n_sum+unk_n_text)/word_n,4)*100

print("Percent of UNK: {}%".format(unk_percent))

Percent of UNK: 0.27%


In [34]:
def count_unk(text):
    unk_n = text.count(word_to_id["<UNK>"])
    return unk_n

In [35]:
# Sort the sentences by length to reduce padding
sorted_id_summaries = []
sorted_id_texts = []
max_text_length = 100 #75% percentile
min_summary_length = 2 #want to produce summaries more than 2 words
min_text_length = 10 #ensure meaningful text worth summarizing
max_unk_text = 2
max_unk_summary = 0

for length in range(min(lengths_texts.counts), max_text_length): 
    for i, words in enumerate(id_summaries):
        if (len(id_summaries[i]) >= min_summary_length and
            len(id_texts[i])>= min_text_length and
            len(id_texts[i])<= max_text_length and
            count_unk(id_summaries[i])<=max_unk_summary and
            count_unk(id_texts[i])<=max_unk_text and
            len(id_texts[i]) == length):
            
            sorted_id_summaries.append(id_summaries[i])
            sorted_id_texts.append(id_texts[i])
            
print(len(sorted_id_summaries))


963337


In [36]:
## Saving the convered IDs for texts and summaries so we don't have to rerun data processing again
np.save("wikihow_id_summaries",sorted_id_summaries)
np.save("wikihow_id_texts",sorted_id_texts)

In [37]:
# id_summaries = np.load('wikihow_id_summaries.npy')
# id_texts = np.load('wikihow_id_texts.npy')

## Model Building

In [38]:
#!pip install --upgrade tensorflow
import tensorflow as tf
import time
from tensorflow.python.layers.core import Dense
from tensorflow.python.ops.rnn_cell_impl import _zero_state_tensors
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.10.0


In [39]:
## function to create tf.placeholders for hyper parameters and model inputs
def model_inputs():
    
    input_data = tf.placeholder(tf.int32, [None, None], name='input')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')
    summary_length = tf.placeholder(tf.int32, (None,), name='summary_length')
    max_summary_length = tf.reduce_max(summary_length, name='max_dec_len')
    text_length = tf.placeholder(tf.int32, (None,), name='text_length')

    return input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length

In [40]:
## Function to process input for decoding input (adding <GO> to begining of batch)
def process_decoding_input(target_data, word_to_id, batch_size):
    
    ending = tf.strided_slice(target_data, [0, 0], [batch_size, -1], [1, 1])
    dec_input = tf.concat([tf.fill([batch_size, 1], word_to_id['<GO>']), ending], 1)

    return dec_input

In [41]:
## Create encoding layer function - can choose between bi-direction or one direction LSTM
def encoding_layer(rnn_size, sequence_length, num_layers, rnn_inputs, keep_prob, direction):
    
    if direction == 1:
        with tf.name_scope("RNN_Encoder_Cell_1D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    lstm = tf.contrib.rnn.LSTMCell(rnn_size)

                    drop = tf.contrib.rnn.DropoutWrapper(lstm, 
                                                         input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.dynamic_rnn(drop, 
                                                              rnn_inputs,
                                                              sequence_length,
                                                              dtype=tf.float32)

            return enc_output, enc_state
        
        
    if direction == 2:
        with tf.name_scope("RNN_Encoder_Cell_2D"):
            for layer in range(num_layers):
                with tf.variable_scope('encoder_{}'.format(layer)):
                    cell_fw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_fw = tf.contrib.rnn.DropoutWrapper(cell_fw, 
                                                            input_keep_prob = keep_prob)

                    cell_bw = tf.contrib.rnn.LSTMCell(rnn_size)
                    cell_bw = tf.contrib.rnn.DropoutWrapper(cell_bw, 
                                                            input_keep_prob = keep_prob)

                    enc_output, enc_state = tf.nn.bidirectional_dynamic_rnn(cell_fw, 
                                                                            cell_bw, 
                                                                            rnn_inputs,
                                                                            sequence_length,
                                                                            dtype=tf.float32)
            enc_output = tf.concat(enc_output,2)
            return enc_output, enc_state[0]

In [42]:
## Create training logits
def training_decoding_layer(dec_embed_input, summary_length, dec_cell, initial_state, output_layer, 
                            vocab_size, max_summary_length):
    
    training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=dec_embed_input,
                                                        sequence_length=summary_length,
                                                        time_major=False)

    training_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                       training_helper,
                                                       initial_state,
                                                       output_layer) 

    training_logits, _ ,_ = tf.contrib.seq2seq.dynamic_decode(training_decoder,
                                                           output_time_major=False,
                                                           impute_finished=True,
                                                           maximum_iterations=max_summary_length)    
    
    return training_logits

In [43]:

def inference_decoding_layer(embeddings, start_token, end_token, dec_cell, initial_state, output_layer,
                             max_summary_length, batch_size):
    
    start_tokens = tf.tile(tf.constant([start_token], dtype=tf.int32), [batch_size], name='start_tokens')
    
    inference_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(embeddings,
                                                                start_tokens,
                                                                end_token)
                
    inference_decoder = tf.contrib.seq2seq.BasicDecoder(dec_cell,
                                                        inference_helper,
                                                        initial_state,
                                                        output_layer)
                
    inference_logits, _ ,_  = tf.contrib.seq2seq.dynamic_decode(inference_decoder,
                                                            output_time_major=False,
                                                            impute_finished=True,
                                                            maximum_iterations=max_summary_length)
    
    
    return inference_logits

In [44]:
## Decoding layer with attention (Bahdanau) for training and inference
def decoding_layer(dec_embed_input, embeddings, enc_output, enc_state, vocab_size, text_length, summary_length, 
                   max_summary_length, rnn_size, word_to_id, keep_prob, batch_size, num_layers, direction):
    
    with tf.name_scope("RNN_Decoder_Cell"):
        for layer in range(num_layers):
            with tf.variable_scope('decoder_{}'.format(layer)):
                lstm = tf.contrib.rnn.LSTMCell(rnn_size,
                                           initializer=tf.random_uniform_initializer(-0.1, 0.1, seed=2))
                dec_cell = tf.contrib.rnn.DropoutWrapper(lstm, input_keep_prob = keep_prob)
    
    output_layer = Dense(vocab_size,
                         kernel_initializer = tf.truncated_normal_initializer(mean = 0.0, stddev=0.1))
    
    attn_mech = tf.contrib.seq2seq.BahdanauAttention(rnn_size,
                                                  enc_output,
                                                  text_length,
                                                  normalize=False,
                                                  name='BahdanauAttention')

    with tf.name_scope("Attention_Wrapper"):
        dec_cell = tf.contrib.seq2seq.AttentionWrapper(dec_cell,attn_mech,rnn_size)
            
        
    initial_state = dec_cell.zero_state(batch_size,dtype=tf.float32)
    initial_state = initial_state.clone(cell_state = enc_state)
    
    with tf.variable_scope("decode"):
        training_logits = training_decoding_layer(dec_embed_input, 
                                                  summary_length, 
                                                  dec_cell, 
                                                  initial_state,
                                                  output_layer,
                                                  vocab_size, 
                                                  max_summary_length)
    with tf.variable_scope("decode", reuse=True):
        inference_logits = inference_decoding_layer(embeddings,  
                                                    word_to_id['<GO>'], 
                                                    word_to_id['<EOS>'],
                                                    dec_cell, 
                                                    initial_state, 
                                                    output_layer,
                                                    max_summary_length,
                                                    batch_size)

    return training_logits, inference_logits

In [45]:
def seq2seq_model(input_data, target_data, keep_prob, text_length, summary_length, max_summary_length, 
                  vocab_size, rnn_size, num_layers, word_to_id, batch_size, direction):
    
    embeddings = word_embedding_matrix
    
    enc_embed_input = tf.nn.embedding_lookup(embeddings, input_data)
    enc_output, enc_state = encoding_layer(rnn_size, text_length, num_layers, enc_embed_input, keep_prob, direction)
    
    dec_input = process_decoding_input(target_data, word_to_id, batch_size)
    dec_embed_input = tf.nn.embedding_lookup(embeddings, dec_input)
    
    training_logits, inference_logits  = decoding_layer(dec_embed_input, 
                                                        embeddings,
                                                        enc_output,
                                                        enc_state, 
                                                        vocab_size, 
                                                        text_length, 
                                                        summary_length, 
                                                        max_summary_length,
                                                        rnn_size, 
                                                        word_to_id, 
                                                        keep_prob, 
                                                        batch_size,
                                                        num_layers, direction)
    
    return training_logits, inference_logits

In [46]:
## Create padding so each batch has the same sentence length
def pad_sentence_batch(sentence_batch):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [word_to_id['<PAD>']] * (max_sentence - len(sentence)) for sentence in sentence_batch]

In [47]:
def get_batches(summaries, texts, batch_size):
    for batch_i in range(0, len(texts)//batch_size):
        start_i = batch_i * batch_size
        summaries_batch = summaries[start_i:start_i + batch_size]
        texts_batch = texts[start_i:start_i + batch_size]
        pad_summaries_batch = np.array(pad_sentence_batch(summaries_batch))
        pad_texts_batch = np.array(pad_sentence_batch(texts_batch))
        
        # Need the lengths for the _lengths parameters
        pad_summaries_lengths = []
        for summary in pad_summaries_batch:
            pad_summaries_lengths.append(len(summary))
        
        pad_texts_lengths = []
        for text in pad_texts_batch:
            pad_texts_lengths.append(len(text))
        
        yield pad_summaries_batch, pad_texts_batch, pad_summaries_lengths, pad_texts_lengths

In [48]:
# Set hyperparameters
epochs = 3 ## We initially wanted to run for 10+ epochs, but couldn't afford to do to extremely long traning time
batch_size = 32
rnn_size = 128
num_layers = 1
learning_rate = 0.008
keep_probability = 0.8
direction = 2

In [49]:
# Build the graph
train_graph = tf.Graph()
with train_graph.as_default():
    
    # Load the model inputs    
    input_data, targets, lr, keep_prob, summary_length, max_summary_length, text_length = model_inputs()

    # Create the training and inference logits
    training_logits, inference_logits = seq2seq_model(tf.reverse(input_data, [-1]),
                                                      targets, 
                                                      keep_prob,   
                                                      text_length,
                                                      summary_length,
                                                      max_summary_length,
                                                      len(word_to_id)+1,
                                                      rnn_size, 
                                                      num_layers, 
                                                      word_to_id,
                                                      batch_size, 
                                                      direction)
    
    # Create tensors for the training logits and inference logits
    training_logits = tf.identity(training_logits.rnn_output, 'logits')
    inference_logits = tf.identity(inference_logits.sample_id, name='predictions')
    
    # Create the weights for sequence_loss
    masks = tf.sequence_mask(summary_length, max_summary_length, dtype=tf.float32, name='masks')

    with tf.name_scope("cost"):
        # Loss function
        cost = tf.contrib.seq2seq.sequence_loss(training_logits,targets,masks)

        # Optimizer
        optimizer = tf.train.AdamOptimizer(learning_rate)

        # Gradient Clipping
        gradients = optimizer.compute_gradients(cost)
        capped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(capped_gradients)
print("Success")

Instructions for updating:
seq_dim is deprecated, use seq_axis instead
Instructions for updating:
batch_dim is deprecated, use batch_axis instead
Success


In [50]:
## Create a training subset
start = 0
end = start + 10000
Summaries_short = id_summaries[start:end]
Texts_short = id_texts[start:end]


In [45]:
# Train the Model
learning_rate_decay = 0.95
min_learning_rate = 0.0005
display_step = 10 # Check training loss 
stop_early = 0 
stop = 3 # If the update loss does not decrease in 3 consecutive update checks, stop training
per_epoch = 3 # Make 3 update checks per epoch
update_check = (len(Texts_short)//batch_size//per_epoch)-1

update_loss = 0 
batch_loss = 0
summary_update_loss = [] # Record the update losses for saving improvements in the model


with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(1, epochs+1):
        update_loss = 0
        batch_loss = 0
        for batch_i, (summaries_batch, texts_batch, summaries_lengths, texts_lengths) in enumerate(
                get_batches(Summaries_short, Texts_short, batch_size)):
            start_time = time.time()
            _, loss = sess.run(
                [train_op, cost],
                {input_data: texts_batch,
                 targets: summaries_batch,
                 lr: learning_rate,
                 summary_length: summaries_lengths,
                 text_length: texts_lengths,
                 keep_prob: keep_probability})

            batch_loss += loss
            update_loss += loss
            end_time = time.time()
            batch_time = end_time - start_time

            if batch_i % display_step == 0 and batch_i > 0:
                print('Epoch {:>3}/{} Batch {:>4}/{} - Loss: {:>6.3f}, Seconds: {:>4.2f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(Texts_short) // batch_size, 
                              batch_loss / display_step, 
                              batch_time*display_step))
                batch_loss = 0

            if batch_i % update_check == 0 and batch_i > 0:
                print("Average loss for this update:", round(update_loss/update_check,3))
                summary_update_loss.append(update_loss)
                
                # If the update loss is at a new minimum, save the model
                if update_loss <= min(summary_update_loss):
                    print('Improved Result') 
                    stop_early = 0
                    saver = tf.train.Saver() 
                    save_path = saver.save(sess, "./Checkpoint/model_wiki.ckpt")

                else:
                    print("No Improvement.")
                    stop_early += 1
                    if stop_early == stop:
                        break
                update_loss = 0
            
                    
        # Reduce learning rate, but not below its minimum value
        learning_rate *= learning_rate_decay
        if learning_rate < min_learning_rate:
            learning_rate = min_learning_rate
        
        if stop_early == stop:
            print("Stopping Training.")
            break

Epoch   1/3 Batch   10/312 - Loss:  5.821, Seconds: 571.02
Epoch   1/3 Batch   20/312 - Loss:  3.311, Seconds: 607.27
Epoch   1/3 Batch   30/312 - Loss:  2.466, Seconds: 1186.20
Epoch   1/3 Batch   40/312 - Loss:  3.103, Seconds: 468.10
Epoch   1/3 Batch   50/312 - Loss:  2.885, Seconds: 411.86
Epoch   1/3 Batch   60/312 - Loss:  3.160, Seconds: 1419.31
Epoch   1/3 Batch   70/312 - Loss:  3.190, Seconds: 870.37
Epoch   1/3 Batch   80/312 - Loss:  2.618, Seconds: 653.26
Epoch   1/3 Batch   90/312 - Loss:  2.357, Seconds: 543.66
Epoch   1/3 Batch  100/312 - Loss:  2.391, Seconds: 489.86
Average loss for this update: 3.104
New Record!
Epoch   1/3 Batch  110/312 - Loss:  2.461, Seconds: 359.58
Epoch   1/3 Batch  120/312 - Loss:  2.027, Seconds: 277.04
Epoch   1/3 Batch  130/312 - Loss:  1.968, Seconds: 204.20
Epoch   1/3 Batch  140/312 - Loss:  2.604, Seconds: 324.39
Epoch   1/3 Batch  150/312 - Loss:  2.453, Seconds: 289.57
Epoch   1/3 Batch  160/312 - Loss:  2.638, Seconds: 323.88
Epoch 

In [51]:
## Pull a random sentence from the unseen text in data and see the prediction
random = np.random.randint(end,len(Texts_wikihow))
input_sentence = Texts_wikihow[random]
text = [word_to_id.get(word, word_to_id['<UNK>']) for word in input_sentence]

checkpoint = "./Checkpoint/model_wiki.ckpt"

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    answer_logits = sess.run(logits, {input_data: [text]*batch_size, 
                                      summary_length: [np.random.randint(5,8)], 
                                      text_length: [len(text)]*batch_size,
                                      keep_prob: 1.0})[0] 

# Remove the padding from output
pad = word_to_id["<PAD>"] 

print('Original Text:', input_sentence)
print('  Input Words: {}'.format(" ".join([id_to_word[i] for i in text])))

print('\Generated Summary')
print('  Response Words: {}'.format(" ".join([id_to_word[i] for i in answer_logits if i != pad])))

INFO:tensorflow:Restoring parameters from ./Checkpoint/model_wiki.ckpt
Original Text: ['you', 'are', 'still', 'best', 'friends', 'technically', 'so', 'do', 'not', 'let', 'old', 'habits', 'change', 'just', 'upgrade', 'them', 'when', 'you', 'have', 'that', 'date', 'officially', 'just', 'be', 'yourself', 'they', 'fell', 'in', 'love', 'with', 'their', 'best', 'friend', 'not', 'a', 'stranger', 'please', 'do', 'keep', 'that', 'in', 'mind', 'if', 'you', 'have', 'broken', 'the', 'touch', 'barrier', 'by', 'this', 'point', 'feel', 'free', 'to', 'show', 'some', 'affection', 'of', 'course', 'this', 'all', 'depends', 'on', 'you', 'and', 'your', 'more', 'than', 'friend', 's', 'level', 'of', 'comfort', 'with', 'one', 'another', 'and', 'physical', 'contact', 'itself']
  Input Words: you are still best friends technically so do not let old habits change just upgrade them when you have that date officially just be yourself they fell in love with their best friend not a stranger please do keep that in mi

## Rouge Metrics

In [52]:
# install the library
!pip install rouge

[31mdistributed 1.21.8 requires msgpack, which is not installed.[0m
[31mtimestring 1.6.2 has requirement pytz==2013b, but you'll have pytz 2018.4 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [57]:
summary_sentence = Summary_wikihow[random]
summary_text = [word_to_id.get(word, word_to_id['<UNK>']) for word in summary_sentence]
original_summary = " ".join([id_to_word[i] for i in summary_text])
generated_summary = " ".join([id_to_word[i] for i in answer_logits if i != pad])
print(original_summary)
print(generated_summary)
# original_summary = "this tea is my new alternative after jasmine green tea it is so smooth and light ican drink it anytime"
# generated_summary = "great tea"

start out slow
make sure the shares of


In [58]:
from rouge import Rouge
rouge_calculator = Rouge()
scores = rouge_calculator.get_scores(original_summary, generated_summary)

In [59]:
# Pretty-print the results
#note that this is just for ONE generated summary against corresponding summary from dataset 
for metric in scores:
    for metric_name, metric_vals in metric.items():
        print("\nMetric Name: {}\nPrecision: {}\nRecall: {}\nF-score: {}".format(
                metric_name, metric_vals['p'], metric_vals['r'], metric_vals['f']))


Metric Name: rouge-1
Precision: 0.0
Recall: 0.0
F-score: 0.0

Metric Name: rouge-2
Precision: 0.0
Recall: 0.0
F-score: 0.0

Metric Name: rouge-l
Precision: 0.0
Recall: 0.0
F-score: 0.0


# ROUGE Metrics (Aggregate) 

In [60]:
## Select test texts and infer predicted summaries
size = 100
test_list = np.random.randint(10000,len(Texts_wikihow),size = size)
summary_tests = [Summary_wikihow[i] for i in test_list]
text_tests = [Texts_wikihow[i] for i in test_list]
Predicted_Summaries = []
checkpoint = "./Checkpoint/model_wiki.ckpt"
pad = word_to_id["<PAD>"] 

text_id_tests = []
for text in text_tests:
    ids = [word_to_id.get(word, word_to_id['<UNK>']) for word in text]
    text_id_tests.append(ids)



In [61]:
len(text_id_tests)

100

In [62]:
loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    loader = tf.train.import_meta_graph(checkpoint + '.meta')
    loader.restore(sess, checkpoint)

    input_data = loaded_graph.get_tensor_by_name('input:0')
    logits = loaded_graph.get_tensor_by_name('predictions:0')
    text_length = loaded_graph.get_tensor_by_name('text_length:0')
    summary_length = loaded_graph.get_tensor_by_name('summary_length:0')
    keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
    
    for text in text_id_tests:
        if len(text) == 0:
            Predicted_Summaries.append('')            
        else:
            answer = sess.run(logits, {input_data: [text]*batch_size, 
                                          summary_length: [np.random.randint(5,8)], 
                                          text_length: [len(text)]*batch_size,
                                          keep_prob: 1.0})[0] 
            pred = [id_to_word[i] for i in answer if i != pad]
            Predicted_Summaries.append(pred)


INFO:tensorflow:Restoring parameters from ./Checkpoint/model_wiki.ckpt


In [64]:
rouge_n = 1
ngram_o = []
for sent in summary_tests:
    ngram_set_o = set()
    text_length = len(sent)
    max_index_ngram_start = text_length - rouge_n
    for i in range(max_index_ngram_start + 1):
        ngram_set_o.add(tuple(sent[i:i + rouge_n]))
    ngram_o.append(ngram_set_o)

In [65]:
ngram_p = []
for sent in Predicted_Summaries:
    ngram_set_p = set()
    text_length = len(sent)
    max_index_ngram_start = text_length - rouge_n
    for i in range(max_index_ngram_start + 1):
        ngram_set_p.add(tuple(sent[i:i + rouge_n]))
    ngram_p.append(ngram_set_p)

In [66]:
f1_scores = []
precisions = []
recalls = []

for i in range(len(ngram_o)):
    overlap = ngram_o[i].intersection(ngram_p[i])
    overlap_n = len(overlap)
    o_n = len(ngram_o[i])
    p_n = len(ngram_p[i])
    
    if p_n == 0:
        precision = 0.0
    else:
        precision = overlap_n / p_n

    if o_n == 0:
        recall = 0.0
    else:
        recall = overlap_n / o_n

    f1_score = 2.0 * ((precision * recall) / (precision + recall + 1e-8))
    
    f1_scores.append(f1_score)
    precisions.append(precision)
    recalls.append(recall)

In [67]:
avg_rouge_f1 = np.mean(f1_scores)
avg_rouge_precision = np.mean(precisions)
avg_rouge_recall = np.mean(recalls)

In [103]:
print(summary_tests[:5])

[['use', 'action', 'verbs'], ['put', 'on', 'some', 'suspenders'], ['make', 'some', 'assumptions'], ['write', 'a', 'brief', 'introduction'], ['join', 'a', 'support', 'group']]


In [105]:
print(text_tests[:5])

[['edit', 'your', 'sentences', 'so', 'that', 'your', 'verbs', 'are', 'consistent', 'and', 'add', 'color', 'to', 'your', 'work', 'experience', 'all', 'verbs', 'should', 'be', 'in', 'the', 'same', 'tenses', 'write', 'about', 'previous', 'projects', 'in', 'the', 'past', 'tense', 'but', 'the', 'description', 'of', 'a', 'job', 'function', 'or', 'institution', 'in', 'the', 'present', 'tense', 'all', 'verbs', 'should', 'be', 'in', 'the', 'active', 'not', 'passive', 'voice', 'use', 'verbs', 'that', 'match', 'or', 'are', 'synonymous', 'with', 'some', 'verbs', 'on', 'the', 'job', 'description', 'you', 'are', 'applying', 'for'], ['authentic', 'lederhosen', 'may', 'come', 'with', 'suspenders', 'but', 'if', 'you', 'buy', 'them', 'separately', 'try', 'to', 'find', 'some', 'that', 'match', 'the', 'color', 'of', 'your', 'breeches'], ['if', 'you', 'think', 'you', 'have', 'real', 'cause', 'to', 'suspect', 'your', 'spouse', 'then', 'start', 'with', 'the', 'assumption', 'that', 'she', 'is', 'going', 'to',

In [107]:
print(Predicted_Summaries[:5])

[['go', 'to', 'the', 'doll'], ['make', 'sure', 'the', 'weather'], ['make', 'sure', 'your', 'hands'], ['make', 'sure', 'your', 'hands', 'and', 'the', 'morsel'], ['make', 'sure', 'the', 'shares', 'of', 'the', 'shares']]


In [73]:
#general average of selected number of generated summaries against corresponding summaries from data set
print(avg_rouge_f1)
print(avg_rouge_precision)
print(avg_rouge_recall)

0.0946427100116
0.129357142857
0.0839063107945


In [116]:
# Examples
example_n = 5
ex_summary = []
ex_text = []
pred_summary = []

for i in summary_tests[:example_n]:
    ex_summary.append(" ".join(i))

for j in text_tests[:example_n]:
    ex_text.append(" ".join(j))
    
for k in Predicted_Summaries[:example_n]:
    pred_summary.append(" ".join(k))

In [121]:
examples = pd.DataFrame({
    'Summary': ex_summary,
    'Predicted Summary': pred_summary,
    'Text': ex_text})

In [143]:
pd.set_option('display.max_colwidth', 0)
examples

Unnamed: 0,Summary,Predicted Summary,Text
0,use action verbs,go to the doll,edit your sentences so that your verbs are consistent and add color to your work experience all verbs should be in the same tenses write about previous projects in the past tense but the description of a job function or institution in the present tense all verbs should be in the active not passive voice use verbs that match or are synonymous with some verbs on the job description you are applying for
1,put on some suspenders,make sure the weather,authentic lederhosen may come with suspenders but if you buy them separately try to find some that match the color of your breeches
2,make some assumptions,make sure your hands,if you think you have real cause to suspect your spouse then start with the assumption that she is going to take some kind of precautions to remain undiscovered when cheating she is not going to send emails from the home computer or call from the home phone she is not going to claim to be working late and leave for a hotel rendezvous risking your calls going unanswered or being seen leaving work too early she will use normal routines and patterns that you are well used to and simply use that time to have the affair a sexual affair does not require much time or commitment the two of them meet in the parking lot hop into one car head for their room at the motel 9 for a half hour and are back in time for shopping she even comes home with purchases consistent with where they were supposed to be so if you are truly committed to finding the truth do this
3,write a brief introduction,make sure your hands and the morsel,if you come from a large family or if your grandparent had a lot of friends there is a chance that not everyone will know you as the grandchild keep your introduction very brief just a short sentence will suffice the introduction should simply let people know your name and your relation to the deceased
4,join a support group,make sure the shares of the shares,a support group can help you join with other people who have similar obsessive thoughts or fears a support group can offer encouragement support and friendship and can help with feelings of isolation ask your medical doctor or therapist if there are any local support groups that deal with obsessive thoughts


In [135]:
rouge_df = pd.DataFrame({
    'Average F1':avg_rouge_f1,
    'Average Precision': avg_rouge_precision,
    'Average Recalls': avg_rouge_recall}, index=[''])

In [136]:
rouge_df

Unnamed: 0,Average F1,Average Precision,Average Recalls
,0.09,0.13,0.08
