# Recurrent Neural Network Language Model

In [3]:
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division

import json, os, re, shutil, sys, time
from importlib import reload
import collections, itertools
import unittest
from IPython.display import display, HTML

# NLTK for NLP utils and corpora
import nltk

# NumPy and TensorFlow
import numpy as np
import tensorflow as tf
assert(tf.__version__.startswith("1."))

# Helper libraries
from w266_common import utils, vocabulary, tf_embed_viz

# Your code
import rnnlm; reload(rnnlm)
import rnnlm_test; reload(rnnlm_test)

  return f(*args, **kwds)


<module 'rnnlm_test' from '/Users/jeylee/MIDS/W266/final/rnnlm_test.py'>

## Construct Graph

In [4]:
reload(rnnlm)

TF_GRAPHDIR = "/Users/jeylee/mids/W266/tmp/w266/a3_graph"

# Clear old log directory.
shutil.rmtree(TF_GRAPHDIR, ignore_errors=True)

lm = rnnlm.RNNLM(V=10000, H=200, num_layers=2)
lm.BuildCoreGraph()
lm.BuildTrainGraph()
lm.BuildSamplerGraph()

summary_writer = tf.summary.FileWriter(TF_GRAPHDIR, lm.graph)

reload(rnnlm); reload(rnnlm_test)
utils.run_tests(rnnlm_test, ["TestRNNLMCore", "TestRNNLMTrain", "TestRNNLMSampler"])

test_shapes_embed (rnnlm_test.TestRNNLMCore) ... ok
test_shapes_output (rnnlm_test.TestRNNLMCore) ... ok
test_shapes_recurrent (rnnlm_test.TestRNNLMCore) ... ok
test_shapes_train (rnnlm_test.TestRNNLMTrain) ... ok
test_shapes_sample (rnnlm_test.TestRNNLMSampler) ... ok

----------------------------------------------------------------------
Ran 5 tests in 2.273s

OK


The code above will load your implementation, construct the graph, and write a logdir for TensorBoard. You can bring up TensorBoard with:
```
cd assignment/a3
tensorboard --logdir /tmp/w266/a3_graph --port 6006
```
Tensorboard : http://localhost:6006/

## Train

In [5]:
def run_epoch(lm, session, batch_iterator,
              train=False, verbose=False,
              tick_s=10, learning_rate=None):
    assert(learning_rate is not None)
    start_time = time.time()
    tick_time = start_time  # for showing status
    total_cost = 0.0  # total cost, summed over all words
    total_batches = 0
    total_words = 0

    if train:
        train_op = lm.train_step_
        use_dropout = True
        loss = lm.train_loss_
    else:
        train_op = tf.no_op()
        use_dropout = False  # no dropout at test time
        loss = lm.loss_  # true loss, if train_loss is an approximation

    for i, (w, y) in enumerate(batch_iterator):
        # At first batch in epoch, get a clean intitial state.
        if i == 0:
            h = session.run(lm.initial_h_, {lm.input_w_: w})

        feed_dict = {
            lm.input_w_: w,
            lm.target_y_: y,
            lm.initial_h_: h,
            lm.learning_rate_: learning_rate,
            lm.use_dropout_: use_dropout
        }
        ops = [loss, lm.final_h_, train_op]        
        #### YOUR CODE HERE ####
        # session.run(...) the ops with the feed_dict constructed above.
        # Ensure "cost" becomes the value of "loss".
        # Hint: see "ops" for other variables that need updating in this loop.
        
        cost, h, _ = session.run([loss, lm.final_h_ ,train_op], feed_dict=feed_dict)
        
        #### END(YOUR CODE) ####
        total_cost += cost
        total_batches = i + 1
        total_words += w.size  # w.size = batch_size * max_time

        ##
        # Print average loss-so-far for epoch
        # If using train_loss_, this may be an underestimate.
        if verbose and (time.time() - tick_time >= tick_s):
            avg_cost = total_cost / total_batches
            avg_wps = total_words / (time.time() - start_time)
            print("[batch {:d}]: seen {:d} words at {:.1f} wps, loss = {:.3f}".format(
                i, total_words, avg_wps, avg_cost))
            tick_time = time.time()  # reset time ticker

    return total_cost / total_batches

def score_dataset(lm, session, ids, name="Data"):
    # For scoring, we can use larger batches to speed things up.
    bi = utils.rnnlm_batch_generator(ids, batch_size=100, max_time=100)
    cost = run_epoch(lm, session, bi, 
                     learning_rate=0.0, train=False, 
                     verbose=False, tick_s=3600)
    print("{:s}: avg. loss: {:.03f}  (perplexity: {:.02f})".format(name, cost, np.exp(cost)))
    return cost

In [6]:
reload(rnnlm); reload(rnnlm_test)
th = rnnlm_test.RunEpochTester("test_toy_model")
th.setUp(); th.injectCode(run_epoch, score_dataset)
unittest.TextTestRunner(verbosity=2).run(th)

test_toy_model (rnnlm_test.RunEpochTester) ... 

[batch 149]: seen 7500 words at 7462.5 wps, loss = 0.698
[batch 299]: seen 15000 words at 7469.5 wps, loss = 0.446
[batch 455]: seen 22800 words at 7564.1 wps, loss = 0.336
[batch 614]: seen 30750 words at 7658.6 wps, loss = 0.275
[batch 776]: seen 38850 words at 7740.5 wps, loss = 0.234
[batch 926]: seen 46350 words at 7699.8 wps, loss = 0.207
[batch 1070]: seen 53550 words at 7627.1 wps, loss = 0.188
[batch 1219]: seen 61000 words at 7599.4 wps, loss = 0.172
[batch 1379]: seen 69000 words at 7643.4 wps, loss = 0.160
[batch 1538]: seen 76950 words at 7671.7 wps, loss = 0.149
[batch 1695]: seen 84800 words at 7684.6 wps, loss = 0.140
[batch 1833]: seen 91700 words at 7618.9 wps, loss = 0.134
[batch 1974]: seen 98750 words at 7572.8 wps, loss = 0.128
[batch 2135]: seen 106800 words at 7606.4 wps, loss = 0.122
[batch 2271]: seen 113600 words at 7548.9 wps, loss = 0.118
[batch 2373]: seen 118700 words at 7391.2 wps, loss = 0.115
[batch 2506]: seen 125350 words at 7346.5 wps, loss = 0.111


ok

----------------------------------------------------------------------
Ran 1 test in 29.945s

OK


<unittest.runner.TextTestResult run=1 errors=0 failures=0>

In [7]:
# Load the dataset
V = 10000
vocab, train_ids, test_ids = utils.load_corpus("brown", split=0.8, V=V, shuffle=42)

[nltk_data] Downloading package brown to /Users/jeylee/nltk_data...
[nltk_data]   Package brown is already up-to-date!
Vocabulary: 10,000 types
Loaded 57,340 sentences (1.16119e+06 tokens)
Training set: 45,872 sentences (924,077 tokens)
Test set: 11,468 sentences (237,115 tokens)


In [8]:
# Training parameters
max_time = 25
batch_size = 100
learning_rate = 0.01
num_epochs = 10

# Model parameters
model_params = dict(V=vocab.size, 
                    H=200, 
                    softmax_ns=200,
                    num_layers=2)

TF_SAVEDIR = "/Users/jeylee/mids/W266/tmp/w266/a3_model"
checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm")
trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained")

In [9]:
# Will print status every this many seconds
print_interval = 5

lm = rnnlm.RNNLM(**model_params)
lm.BuildCoreGraph()
lm.BuildTrainGraph()

# Explicitly add global initializer and variable saver to LM graph
with lm.graph.as_default():
    initializer = tf.global_variables_initializer()
    saver = tf.train.Saver()
    
# Clear old log directory
shutil.rmtree(TF_SAVEDIR, ignore_errors=True)
if not os.path.isdir(TF_SAVEDIR):
    os.makedirs(TF_SAVEDIR)

with tf.Session(graph=lm.graph) as session:
    # Seed RNG for repeatability
    tf.set_random_seed(42)

    session.run(initializer)

    for epoch in range(1,num_epochs+1):
        t0_epoch = time.time()
        bi = utils.rnnlm_batch_generator(train_ids, batch_size, max_time)
        print("[epoch {:d}] Starting epoch {:d}".format(epoch, epoch))
        #### YOUR CODE HERE ####
        # Run a training epoch.
        
        run_epoch(lm, session, bi, learning_rate=learning_rate, train=True, verbose=False, tick_s=10)
        
        #### END(YOUR CODE) ####
        print("[epoch {:d}] Completed in {:s}".format(epoch, utils.pretty_timedelta(since=t0_epoch)))
    
        # Save a checkpoint
        saver.save(session, checkpoint_filename, global_step=epoch)
    
        ##
        # score_dataset will run a forward pass over the entire dataset
        # and report perplexity scores. This can be slow (around 1/2 to 
        # 1/4 as long as a full epoch), so you may want to comment it out
        # to speed up training on a slow machine. Be sure to run it at the 
        # end to evaluate your score.
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, train_ids, name="Train set")
        print("[epoch {:d}]".format(epoch), end=" ")
        score_dataset(lm, session, test_ids, name="Test set")
        print("")
    
    # Save final model
    saver.save(session, trained_filename)

[epoch 1] Starting epoch 1
[epoch 1] Completed in 0:01:53
[epoch 1] Train set: avg. loss: 5.404  (perplexity: 222.30)
[epoch 1] Test set: avg. loss: 5.440  (perplexity: 230.54)

[epoch 2] Starting epoch 2
[epoch 2] Completed in 0:01:40
[epoch 2] Train set: avg. loss: 5.211  (perplexity: 183.23)
[epoch 2] Test set: avg. loss: 5.265  (perplexity: 193.39)

[epoch 3] Starting epoch 3
[epoch 3] Completed in 0:02:01
[epoch 3] Train set: avg. loss: 5.111  (perplexity: 165.90)
[epoch 3] Test set: avg. loss: 5.177  (perplexity: 177.19)

[epoch 4] Starting epoch 4
[epoch 4] Completed in 0:01:44
[epoch 4] Train set: avg. loss: 5.034  (perplexity: 153.55)
[epoch 4] Test set: avg. loss: 5.113  (perplexity: 166.15)

[epoch 5] Starting epoch 5
[epoch 5] Completed in 0:01:30
[epoch 5] Train set: avg. loss: 4.979  (perplexity: 145.30)
[epoch 5] Test set: avg. loss: 5.070  (perplexity: 159.11)

[epoch 6] Starting epoch 6
[epoch 6] Completed in 0:01:30
[epoch 6] Train set: avg. loss: 4.936  (perplexity: 

## Calculate Linguistic Probability

Now that we've trained our RNNLM, let's test a few properties of the model to see how well it learns linguistic phenomena. We'll do this with a scoring task: given two or more test sentences, our model should score the more plausible (or more correct) sentence with a higher log-probability.

We'll define a scoring function to help us:

In [10]:
def score_seq(lm, session, seq, vocab):
    """Score a sequence of words. Returns total log-probability."""
    padded_ids = vocab.words_to_ids(utils.canonicalize_words(["<s>"] + seq + ["</s>"], 
                                                             wordset=vocab.word_to_id))
    w = np.reshape(padded_ids[:-1], [1,-1])
    y = np.reshape(padded_ids[1:],  [1,-1])
    h = session.run(lm.initial_h_, {lm.input_w_: w})
    feed_dict = {lm.input_w_:w,
                 lm.target_y_:y,
                 lm.initial_h_:h,
                 lm.dropout_keep_prob_: 1.0}
    # Return log(P(seq)) = -1*loss
    return -1*session.run(lm.loss_, feed_dict)

def load_and_score(inputs, sort=False):
    """Load the trained model and score the given words."""
    lm = rnnlm.RNNLM(**model_params)
    lm.BuildCoreGraph()
    
    with lm.graph.as_default():
        saver = tf.train.Saver()

    with tf.Session(graph=lm.graph) as session:  
        # Load the trained model
        saver.restore(session, trained_filename)

        if isinstance(inputs[0], str) or isinstance(inputs[0], bytes):
            inputs = [inputs]

        # Actually run scoring
        results = []
        for words in inputs:
            score = score_seq(lm, session, words, vocab)
            results.append((score, words))

        # Sort if requested
        if sort: results = sorted(results, reverse=True)

        # Print results
        for score, words in results:
            print("\"{:s}\" : {:.02f}".format(" ".join(words), score))

Now we can test as:

In [11]:
sents = ["We’re seeing unhealthy air quality throughout most of the Bay Area",
         "We’re seeing unhealthy air quality all over most of the Bay Area"]
load_and_score([s.split() for s in sents])

INFO:tensorflow:Restoring parameters from /Users/jeylee/mids/W266/tmp/w266/a3_model/rnnlm_trained
"We’re seeing unhealthy air quality throughout most of the Bay Area" : -7.38
"We’re seeing unhealthy air quality all over most of the Bay Area" : -7.25
