## Load Required Packages and Data

In [1]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

from cytoolz import concatv

# NLTK for NLP utils and corpora
import nltk,pprint
from nltk import word_tokenize

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

# Your code
# import rnnlm; reload(rnnlm)
# import rnnlm_test; reload(rnnlm_test)

  from ._conv import register_converters as _register_converters


In [28]:
# explore other data set
nltk.corpus.reuters.sents('training/9864')
# sentences = np.array(list(reuters.sents('training/9864')), dtype=object)
# sentences

[['U', '.', 'S', 'TREASURY', "'", 'S', 'MULFORD', 'REAFFIRMS', 'G', '-', '6', 'AGREEMENT', 'Treasury', 'Assistant', 'Secretary', 'David', 'Mulford', 'reaffirmed', 'U', '.', 'S', '.', 'backing', 'for', 'the', 'Paris', 'Agreement', 'among', 'six', 'industrial', 'nations', 'to', 'cooperate', 'closely', 'to', 'foster', 'exchange', 'rate', 'stability', 'around', 'current', 'levels', '.'], ['In', 'testimony', 'prepared', 'for', 'delivery', 'before', 'a', 'Senate', 'banking', 'subcommittee', ',', 'Mulford', 'said', 'there', 'was', 'broad', 'recognition', 'in', 'Paris', 'that', '"', 'further', 'substantial', 'exchange', 'rate', 'shifts', 'could', 'damage', 'growth', 'and', 'adjustment', 'prospects', '."'], ...]

### read nyt flat file

In [2]:
# need to remove quote?
# !tail -n 1000 data/nyt_structured_data.txt > data/nyt_test.txt
file = open('data/nyt_test.txt','rt')
read_array = file.readlines()
title=[]
body=[]
for line in read_array:
    data = line.split(' , ') #file id, headline, leading_paragraph and full_text
    title.append(data[1])
    body.append(data[3])
file.close()
print(title[:2])
# print(body[2])

["'Baghdad Gallery Owner Hopes Culture Can Dispel Hate'", "'Sweet and Sour Sit Down to Dessert'"]


### Tokenize and Sentence segmentation

In [3]:
# tokenize
from nltk import word_tokenize
import nltk
t = title[1]
b = body[1]
# print(word_tokenize(b))
tokens = []
for i in range(len(body)):
    tokens.extend(nltk.wordpunct_tokenize(body[i]))

# tokens = tokens[20:] #select tokens
# text = nltk.Text(tokens)
print(tokens[:20])

["'", 'He', 'may', 'be', 'the', 'last', 'hopeful', 'man', 'in', 'Iraq', '.', 'Amid', 'the', 'violence', ',', 'the', 'crumbling', 'economy', 'and', 'rising']


In [4]:
# stem
# do we need to do stemming?
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
[porter.stem(t) for t in tokens]
# [lancaster.stem(t) for t in tokens]

["'",
 'rhubarb',
 'is',
 'an',
 'alarmingli',
 'sour',
 'veget',
 'pass',
 'off',
 'as',
 'a',
 'fruit',
 ',',
 'but',
 'requir',
 'a',
 'huge',
 'mound',
 'of',
 'sugar',
 'to',
 'effect',
 'the',
 'transform',
 '.',
 'crumb',
 'cake',
 'is',
 'a',
 'huge',
 'mound',
 'of',
 'sugar',
 'disguis',
 'as',
 'a',
 'cake',
 ',',
 'but',
 'demand',
 'a',
 'brace',
 'counterpoint',
 '--',
 'say',
 'a',
 'swallow',
 'of',
 'coffe',
 'or',
 'tea',
 '--',
 'to',
 'allay',
 'it',
 'cloy',
 'sweet',
 '.',
 'these',
 'two',
 'truth',
 'coexist',
 'in',
 'my',
 'mind',
 'without',
 'overlap',
 'until',
 'I',
 'bit',
 'into',
 'a',
 'piec',
 'of',
 'crumb',
 'cake',
 'so',
 'textur',
 'perfect',
 '(',
 'soft',
 'sliver',
 'of',
 'cake',
 'top',
 'by',
 'a',
 'deep',
 'layer',
 'of',
 'grape',
 '-',
 'size',
 'crumb',
 '),',
 'yet',
 'so',
 'toothachingli',
 'sweet',
 'that',
 'the',
 'onli',
 'antidot',
 'wa',
 'suck',
 'on',
 'the',
 'lemon',
 'in',
 'my',
 'seltzer',
 '.',
 'the',
 'sour',
 'of',


In [49]:
# sentence segmentation 1, not very good
nltk.download('punkt')
sents = nltk.sent_tokenize(b)
pprint.pprint(sents[:3])

[nltk_data] Downloading package punkt to /home/huyue012/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
["'RHUBARB is an alarmingly sour vegetable passed off as a fruit, but "
 'requiring a huge mound of sugar to effect the transformation.Crumb cake is a '
 'huge mound of sugar disguised as a cake, but demanding a bracing '
 'counterpoint -- say a swallow of coffee or tea -- to allay its cloying '
 'sweetness.These two truths coexisted in my mind without overlapping until I '
 'bit into a piece of crumb cake so texturally perfect (soft sliver of cake '
 'topped by a deep layer of grape-size crumbs), yet so toothachingly sweet '
 'that the only antidote was sucking on the lemon in my seltzer.The sourness '
 'of the lemon immediately made me think about the rhubarb I had in the '
 'fridge.',
 'It occurred to me that, instead of cutting its tartness with a mountain of '
 "sugar, why not mix the rhubarb into a crumb cake to cut the cake's "
 "sweetness?It was an ''Aha!''",
 'mom

In [53]:

sentences = np.array(list(brown.sents()), dtype=object)
fmt = (len(sentences), sum(map(len, sentences)))
print("Loaded {:,} sentences ({:g} tokens)".format(*fmt))

if shuffle:
    rng = np.random.RandomState(shuffle)
    rng.shuffle(sentences)  # in-place
split_idx = int(split * len(sentences))
train_sentences = sentences[:split_idx]
test_sentences = sentences[split_idx:]

fmt = (len(train_sentences), sum(map(len, train_sentences)))
print("Training set: {:,} sentences ({:,} tokens)".format(*fmt))
fmt = (len(test_sentences), sum(map(len, test_sentences)))
print("Test set: {:,} sentences ({:,} tokens)".format(*fmt))

return train_sentences, test_sentences

array([list(['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.']),
       list(['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'])],
      dtype=object)

In [4]:
# sentence segmentation 2, use option 2
nltk.download('treebank')
import sent_segment # py file

# need to flattern the list?
sentences = sent_segment.segment_sentences(tokens)
len(sentences)

[nltk_data] Downloading package treebank to
[nltk_data]     /home/huyue012/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


19847

In [57]:
len(title)

1000

### Further Preprocessing
leverage w266_common module
tokenize --> canonicalize digit --> canonicalize word --> vocabuluary

In [5]:
# Helper libraries

reload(utils)
# words = [w.lower() for w in text] # lower, DG, UNK...
# vocab = sorted(set(words))[1:100000]
V = 10000
vocab = utils.build_vocab(tokens, V)
# vocab, train_ids, test_ids = utils.load_data(body, title, split=0.8, V=V, shuffle=42)


Vocabulary: 10,000 types


## Embedding

### Sentence Embedding
For extractive modeling - sentence ranking

In [5]:
# # !pip install tensorflow_hub
# import tensorflow as tf
# import tensorflow_hub as hub
# embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder/2")
# embeddings = embed([
# "The quick brown fox jumps over the lazy dog.",
# "I am a sentence for which I would like to get its embedding"])
# session=tf.Session()
# session.run([tf.global_variables_initializer(), tf.tables_initializer()])
# print (tf.Session().run(embeddings))

### Word Embedding

## Follow NMT Tutorial
[https://github.com/tensorflow/nmt![image.png](attachment:image.png)]

### Embedding
With big corpus, we can train embedding from scratch instead of using trained model.

In [6]:
# Construct embedding layer
with tf.name_scope("Embedding_Layer"):
    self.W_in_ = tf.Variable(tf.random_uniform([self.V, self.H], -1.0, 1.0), name="W_in")
    # embedding_lookup gives shape (batch_size, max_time, H)
    self.x_ = tf.nn.embedding_lookup(self.W_in_, self.input_w_)

# Embedding
embedding_encoder = variable_scope.get_variable(
    "embedding_encoder", [src_vocab_size, embedding_size], ...)
# Look up embedding:
#   encoder_inputs: [max_time, batch_size]
#   encoder_emb_inp: [max_time, batch_size, embedding_size]
encoder_emb_inp = embedding_ops.embedding_lookup(
    embedding_encoder, encoder_inputs)

NameError: name 'variable_scope' is not defined

### Encoder

In [7]:
# Build RNN cell
encoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

# Run Dynamic RNN
#   encoder_outputs: [max_time, batch_size, num_units]
#   encoder_state: [batch_size, num_units]
encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
    encoder_cell, encoder_emb_inp,
    sequence_length=source_sequence_length, time_major=True)

NameError: name 'num_units' is not defined

### Decoder

In [None]:
# The decoder also needs to have access to the source information, to initialize it with the last hidden state of the encoder
# Build RNN cell
decoder_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units)

# Helper
helper = tf.contrib.seq2seq.TrainingHelper(
    decoder_emb_inp, decoder_lengths, time_major=True)
# Decoder
decoder = tf.contrib.seq2seq.BasicDecoder(
    decoder_cell, helper, encoder_state,
    output_layer=projection_layer)
# Dynamic decoding
outputs, _ = tf.contrib.seq2seq.dynamic_decode(decoder, ...)
logits = outputs.rnn_output

In [None]:
projection_layer = layers_core.Dense(
    tgt_vocab_size, use_bias=False)

### Loss

In [None]:
crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=decoder_outputs, logits=logits)
train_loss = (tf.reduce_sum(crossent * target_weights) /
    batch_size)

** Below are just placeholders **
## Training Models

In [None]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    assert(learning_rate is not None)
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation

    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        ops = [loss, lm.final_h_, train_op]        
        #### YOUR CODE HERE ####
        # session.run(...) the ops with the feed_dict constructed above.
        # Ensure "cost" becomes the value of "loss".
        # Hint: see "ops" for other variables that need updating in this loop.
        cost = 0.0
        vals = session.run(ops, feed_dict)
        cost = vals[0] #loss
        h = vals[1] #final_h


          
        #### END(YOUR CODE) ####
        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

In [None]:
def score_dataset(lm, session, ids, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = utils.rnnlm_batch_generator(ids, batch_size=100, max_time=100)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=0.0, train=False, 
                     verbose=False, tick_s=3600)
    print("{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(name, cost, np.exp(cost)))
    return cost

In [None]:
# Load the dataset
V = 10000
vocab, train_ids, test_ids = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)

In [40]:
V = 10000

from . import vocabulary
if isinstance(tokens, list):
    token_feed = (canonicalize_word(w) for w in tokens)
    vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)
else:
    token_feed = (canonicalize_word(w) for w in tokens.words())
    vocab = vocabulary.Vocabulary(token_feed, size=V, **kw)

print("Vocabulary: {:,} types".format(vocab.size))


NameError: name 'kw' is not defined

In [None]:
# Training parameters
max_time = 25
batch_size = 100
learning_rate = 0.05 #default 0.01
num_epochs = 5

# Model parameters
model_params = dict(V=vocab.size, 
                    H=200, 
                    softmax_ns=200,
                    num_layers=2)

TF_SAVEDIR = "/tmp/w266/a3_model"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [None]:
# Will print status every this many seconds
print_interval = 5

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))
        #### YOUR CODE HERE ####
        # Run a training epoch.    
        cost = run_epoch(lm, session, bi, 
                 learning_rate=learning_rate, train=True, 
                 verbose=True, tick_s=10)
        print("loss: {:.03f}  (perplexity: {:.02f})".format(cost, np.exp(cost)))

        #### END(YOUR CODE) ####
        print("[epoch {:d}] Completed in {:s}".format(epoch, utils.pretty_timedelta(since=t0_epoch)))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
#         if epoch == num_epochs:
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, train_ids, name="Train set")
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, test_ids, name="Test set")
        print("")

    
    # Save final model
    saver.save(session, trained_filename)

## Score New Data

In [None]:
def score_seq(lm, session, seq, vocab):
    """Score a sequence of words. Returns total log-probability."""
    padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq + ["</s>"], 
                                                             wordset=vocab.word_to_id))
    w = np.reshape(padded_ids[:-1], [1,-1])
    y = np.reshape(padded_ids[1:],  [1,-1])
    h = session.run(lm.initial_h_, {lm.input_w_: w})
    feed_dict = {lm.input_w_:w,
                 lm.target_y_:y,
                 lm.initial_h_:h,
                 lm.dropout_keep_prob_: 1.0}
    # Return log(P(seq)) = -1*loss
    return -1*session.run(lm.loss_, feed_dict)

def load_and_score(inputs, sort=False):
    """Load the trained model and score the given words."""
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    
    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:  
        # Load the trained model
        saver.restore(session, trained_filename)

        if isinstance(inputs[0], str) or isinstance(inputs[0], bytes):
            inputs = [inputs]

        # Actually run scoring
        results = []
        for words in inputs:
            score = score_seq(lm, session, words, vocab)
            results.append((score, words))

        # Sort if requested
        if sort: results = sorted(results, reverse=True)

        # Print results
        for score, words in results:
            print("\"{:s}\" : {:.02f}".format(" ".join(words), score))

In [None]:
sents = ["once upon a time",
         "the quick brown fox jumps over the lazy dog"]
load_and_score([s.split() for s in sents])