Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train a LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
from __future__ import print_function
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print('Found and verified %s' % filename)
  else:
    print(statinfo.st_size)
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  with zipfile.ZipFile(filename) as f:
    name = f.namelist()[0]
    data = tf.compat.as_str(f.read(name))
  return data
  
text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
text[:20]

' anarchism originate'

Create a small validation set.

In [4]:
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [5]:
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  if char in string.ascii_lowercase:
    return ord(char) - first_letter + 1
  elif char == ' ':
    return 0
  else:
    print('Unexpected character: %s' % char)
    return 0
  
def id2char(dictid):
  if dictid > 0:
    return chr(dictid + first_letter - 1)
  else:
    return ' '

print(char2id('a'), char2id('z'), char2id(' '), char2id('ï'))
print(id2char(1), id2char(26), id2char(0))

Unexpected character: ï
1 26 0 0
a z  


In [12]:
vocabulary_size

27

Function to generate a training batch for the LSTM model.

In [6]:
batch_size=64
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Turn a 1-hot encoding or a probability distribution over the possible
  characters back into its (most likely) character representation."""
  return [id2char(c) for c in np.argmax(probabilities, 1)]

def batches2string(batches):
  """Convert a sequence of batches back into their (most likely) string
  representation."""
  s = [''] * batches[0].shape[0]
  for b in batches:
    s = [''.join(x) for x in zip(s, characters(b))]
  return s

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

print(batches2string(train_batches.next()))
print(batches2string(train_batches.next()))
print(batches2string(valid_batches.next()))
print(batches2string(valid_batches.next()))

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [7]:
def logprob(predictions, labels):
  """Log-probability of the true labels in a predicted batch."""
  predictions[predictions < 1e-10] = 1e-10
  return np.sum(np.multiply(labels, -np.log(predictions))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.
  """
  r = random.uniform(0, 1)
  s = 0
  for i in range(len(distribution)):
    s += distribution[i]
    if s >= r:
      return i
  return len(distribution) - 1

def sample(prediction):
  """Turn a (column) prediction into 1-hot encoded samples."""
  p = np.zeros(shape=[1, vocabulary_size], dtype=np.float)
  p[0, sample_distribution(prediction[0])] = 1.0
  return p

def random_distribution():
  """Generate a random column of probabilities."""
  b = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  return b/np.sum(b, 1)[:,None]

Simple LSTM Model.

In [9]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [10]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.295918 learning rate: 10.000000
Minibatch perplexity: 27.00
xiltbmwscxcnefit wlgnjkatct jgow  r nbzeslaki  rt onnl ro jfpz i  aercerzh eugxj
nsohtstnnlzcy  a gnagujci beyat  x ztdiercl nv  e hj g itusrmzom efzziuw   oz rc
nniaae nnfnvfarqefdan irfuz fzcxevcgtairwagitanid  jubo dk iwuv sljnm tyiutvn rp
x ar tkc dtwxafeym  t icjhus srptdndrnyr wb wvawd nixaphtpkrjx c  rfn w bb r  vi
ztpfs  upo w itn xcum y aiejfoiajsryi b ieeni qiieaxsexrpdsbtz hreztiaeenjzoyanl
Validation set perplexity: 20.13
Average loss at step 100: 2.596021 learning rate: 10.000000
Minibatch perplexity: 10.84
Validation set perplexity: 10.23
Average loss at step 200: 2.251585 learning rate: 10.000000
Minibatch perplexity: 8.79
Validation set perplexity: 8.67
Average loss at step 300: 2.099724 learning rate: 10.000000
Minibatch perplexity: 7.46
Validation set perplexity: 7.97
Average loss at step 400: 2.000765 learning rate: 10.000000
Minibatch perplexity: 7.44
Validation set per

Validation set perplexity: 4.45
Average loss at step 4500: 1.615460 learning rate: 10.000000
Minibatch perplexity: 5.34
Validation set perplexity: 4.67
Average loss at step 4600: 1.614482 learning rate: 10.000000
Minibatch perplexity: 5.07
Validation set perplexity: 4.65
Average loss at step 4700: 1.627700 learning rate: 10.000000
Minibatch perplexity: 5.33
Validation set perplexity: 4.59
Average loss at step 4800: 1.632806 learning rate: 10.000000
Minibatch perplexity: 4.57
Validation set perplexity: 4.55
Average loss at step 4900: 1.634851 learning rate: 10.000000
Minibatch perplexity: 5.16
Validation set perplexity: 4.73
Average loss at step 5000: 1.608837 learning rate: 1.000000
Minibatch perplexity: 4.52
kersing to invil sahper p other penlelder seven sudents norting of their surnati
keng tavary granmast for prefises the migration one six zero zero a liuns him ha
ple dypter savide is sholleting prepekies more or wenther grand s a eight th con
cily received in dese the a burn from 

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

In [18]:
tx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
jx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
zx = tf.concat([tx, jx],1)

In [19]:
zx.shape

TensorShape([Dimension(27), Dimension(128)])

In [21]:
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Concatenated matrix
  sx = tf.concat([ix, fx, cx, ox], 1)
  sm = tf.concat([im, fm, cm, om], 1)
  sb = tf.concat([ib, fb, cb, ob], 1)
    
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
#     input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
#     forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
#     update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    smultiple = tf.matmul(i, sx) + tf.matmul(o, sm) + sb
    input_d, forget_d, update, output_d = tf.split(smultiple, 4, 1)
    input_gate = tf.sigmoid(input_d)
    forget_gate = tf.sigmoid(forget_d)
    output_gate = tf.sigmoid(output_d)
    state = forget_gate * state + input_gate * tf.tanh(update)
#    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [22]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.294599 learning rate: 10.000000
Minibatch perplexity: 26.97
w fcchpceyn m  eh usfycarghia  nyvghelybi z eptne lm eeefgr wvuarl yisjbyrrch bk
zzvir t le  ehhjs mj vrnlpajazoijsezoxr esdn jnfzleg me bcpe r uidh sxxeigjcspio
xnhra pescp cxphmz eeuounyvhfxcxemsh qsh efrebsmwool mr teqlrm sk a lmryet tdvow
u kwhy  v orcskaeesu jfiodo  bc cdxtuapizv izlmwlji pscysreagyumume meufsroaxyam
likqv rnhfhb wf cabkaehropwscx   e x sfnabjxzcihxnrnr rd erjs bkbepegyoq crprqur
Validation set perplexity: 20.03
Average loss at step 100: 2.576622 learning rate: 10.000000
Minibatch perplexity: 10.34
Validation set perplexity: 10.71
Average loss at step 200: 2.246409 learning rate: 10.000000
Minibatch perplexity: 8.54
Validation set perplexity: 8.94
Average loss at step 300: 2.086570 learning rate: 10.000000
Minibatch perplexity: 6.36
Validation set perplexity: 8.21
Average loss at step 400: 2.035812 learning rate: 10.000000
Minibatch perplexity: 8.05
Validation set per

Validation set perplexity: 4.88
Average loss at step 4500: 1.646730 learning rate: 10.000000
Minibatch perplexity: 5.36
Validation set perplexity: 4.93
Average loss at step 4600: 1.623779 learning rate: 10.000000
Minibatch perplexity: 5.46
Validation set perplexity: 4.85
Average loss at step 4700: 1.625710 learning rate: 10.000000
Minibatch perplexity: 4.70
Validation set perplexity: 4.85
Average loss at step 4800: 1.610068 learning rate: 10.000000
Minibatch perplexity: 4.71
Validation set perplexity: 4.82
Average loss at step 4900: 1.621098 learning rate: 10.000000
Minibatch perplexity: 5.32
Validation set perplexity: 4.75
Average loss at step 5000: 1.613239 learning rate: 1.000000
Minibatch perplexity: 4.82
n also tranch wompo cannomies uniness leasing the eightin from diolest is one ni
d secharenful popuacition mosery of the times the accewning confeal serse imstar
flemally rocaical supplinently the being the humandes eight the ainnats they of 
ground on the muss metisthered the pap

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

Question a: let's add an embedding layer for the input chars

In [29]:
embedding_size = 128
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Embedding layer: input, embedding output
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))

  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    embed = tf.nn.embedding_lookup(embeddings, tf.argmax(i, dimension=1))
    output, state = lstm_cell(embed, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  sample_input_embedding = tf.nn.embedding_lookup(embeddings, tf.argmax(sample_input, dimension=1))
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input_embedding, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [30]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.295465 learning rate: 10.000000
Minibatch perplexity: 26.99
br mt oohuinpt nmrmg aer r c g rii  ne rk hug u xufe xubc kao ss siq zkaqh op mc
m ust oey  oa hn evyr ec lnspi hxast q bft nfnunhouzfe ot q t vbznsepi   iminiy 
veed d d d c hfk hwkse h nelxf e yqhzq eq mohhi ohj o  ggt kit  ownn ykx khn  on
zwx oyunt spneit  mjrhwr befaeper s poea a ot a od qobd k jit q woyg tbzqse m  t
t vu d d sd xbkrubkn t a uafpmud leqefye qiao  wt jvr c c nhf  neg ahx oasm fq  
Validation set perplexity: 23.33
Average loss at step 100: 2.307625 learning rate: 10.000000
Minibatch perplexity: 10.21
Validation set perplexity: 8.97
Average loss at step 200: 2.020914 learning rate: 10.000000
Minibatch perplexity: 6.85
Validation set perplexity: 7.52
Average loss at step 300: 1.917944 learning rate: 10.000000
Minibatch perplexity: 6.10
Validation set perplexity: 6.78
Average loss at step 400: 1.865545 learning rate: 10.000000
Minibatch perplexity: 6.05
Validation set perp

Validation set perplexity: 5.02
Average loss at step 4500: 1.635996 learning rate: 10.000000
Minibatch perplexity: 5.01
Validation set perplexity: 4.81
Average loss at step 4600: 1.635203 learning rate: 10.000000
Minibatch perplexity: 5.00
Validation set perplexity: 4.87
Average loss at step 4700: 1.612105 learning rate: 10.000000
Minibatch perplexity: 5.43
Validation set perplexity: 5.13
Average loss at step 4800: 1.594385 learning rate: 10.000000
Minibatch perplexity: 5.15
Validation set perplexity: 5.12
Average loss at step 4900: 1.608753 learning rate: 10.000000
Minibatch perplexity: 5.01
Validation set perplexity: 4.87
Average loss at step 5000: 1.631268 learning rate: 1.000000
Minibatch perplexity: 5.40
tant libiois unditional repla from equiptity prummits micch offten the compority
batmany electaer in becaur anatuless zame bnarkh election there or includiad wor
jalions and to gians as won the steven two quast alvant types siletch en justpit
ings called edition wholibrits at borc

Question b: write a bigram-based LSTM

In [34]:
embedding_size = 128
num_nodes = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Embedding layer: input, embedding output
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size * vocabulary_size, embedding_size], -1.0, 1.0))
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))

  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  single_char_inputs = train_data[:num_unrollings]
  train_inputs = zip(single_char_inputs[:-1], single_char_inputs[1:])
  train_labels = train_data[2:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    index = tf.argmax(i[0], dimension=1) + tf.argmax(i[1], dimension=1) * vocabulary_size
    embed = tf.nn.embedding_lookup(embeddings, index)
    output, state = lstm_cell(embed, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = list()
  for _ in range(2):
    sample_input.append(tf.placeholder(tf.float32, shape=[1, vocabulary_size]))
  sample_input_index = tf.argmax(sample_input[0], dimension=1) + tf.argmax(sample_input[1], dimension=1) * vocabulary_size
  sample_input_embedding = tf.nn.embedding_lookup(embeddings, sample_input_index)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input_embedding, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [37]:
import collections
num_steps = 7001
summary_frequency = 100

valid_batches = BatchGenerator(valid_text, 1, 2)

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = collections.deque(maxlen=2)
          for _ in range(2):
              feed.append(random_distribution())
          sentence = characters(feed[0])[0] + characters(feed[1])[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input[0]: feed[0], sample_input[1]: feed[1]})
            feed.append(sample(prediction))
            sentence += characters(feed[1])[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input[0]: b[0], sample_input[1]: b[1]})
        valid_logprob = valid_logprob + logprob(predictions, b[2])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.301809 learning rate: 10.000000
Minibatch perplexity: 27.16
ssnvyvg   nunianb xgsipis tta duymrzl cwettrsdtmq   w sb j elfxt thesmgkia gkahdm
qkmoyeehaimtsi etiwer iih b ric  exgdvrrpvn um rj cnycnio anrovugwertavjovelbbtwf
iuus jkswaeieqbpl ekrio ixdpyjfbu  n afdscvxbonlx  mvy jnu s u wzq f kolagefsmnis
trr vagbqsk eeetrtjeekxiiueta kzpo bfctpi urj p q icugewzazidvsav  awe trvoto ebc
dccnvp w eoorubitvojbujwraaeoux xoweytwaarb sqmpkqjtd fituh mroenhvkdnzenuz bdepm
Validation set perplexity: 20.38
Average loss at step 100: 2.277014 learning rate: 10.000000
Minibatch perplexity: 7.57
Validation set perplexity: 9.03
Average loss at step 200: 1.973701 learning rate: 10.000000
Minibatch perplexity: 7.09
Validation set perplexity: 8.27
Average loss at step 300: 1.885835 learning rate: 10.000000
Minibatch perplexity: 6.98
Validation set perplexity: 7.72
Average loss at step 400: 1.828601 learning rate: 10.000000
Minibatch perplexity: 6.01
Validation set 

Validation set perplexity: 7.44
Average loss at step 4500: 1.579846 learning rate: 10.000000
Minibatch perplexity: 4.75
Validation set perplexity: 7.21
Average loss at step 4600: 1.584294 learning rate: 10.000000
Minibatch perplexity: 5.08
Validation set perplexity: 7.23
Average loss at step 4700: 1.597102 learning rate: 10.000000
Minibatch perplexity: 5.38
Validation set perplexity: 7.15
Average loss at step 4800: 1.597276 learning rate: 10.000000
Minibatch perplexity: 5.08
Validation set perplexity: 7.41
Average loss at step 4900: 1.613589 learning rate: 10.000000
Minibatch perplexity: 5.31
Validation set perplexity: 7.24
Average loss at step 5000: 1.617901 learning rate: 1.000000
Minibatch perplexity: 4.54
qnse as of lab primaralt may blues about extents intellectmen two the forms event
spana forus jpgheor by and the demanify doila is for as ut macture mix randean ge
away do civy of only text to pant extencel albook macarchapplicipainition kartici
qk and splaissaction armire izimal 

Question c: Add dropout

In [38]:
embedding_size = 128
num_nodes = 64


graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Embedding layer: input, embedding output
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size * vocabulary_size, embedding_size], -1.0, 1.0))
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.                             
  cx = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([embedding_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))

  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  # Keep probability for dropout
  kp = tf.placeholder(tf.float32)

  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates."""
    i = tf.nn.dropout(i, kp)
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    output = tf.nn.dropout(output_gate * tf.tanh(state), kp)
    return output, state

  # Input data.
  train_data = list()
  for _ in range(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  single_char_inputs = train_data[:num_unrollings]
  train_inputs = zip(single_char_inputs[:-1], single_char_inputs[1:])
  train_labels = train_data[2:]  # labels are inputs shifted by one time step.


  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    index = tf.argmax(i[0], dimension=1) + tf.argmax(i[1], dimension=1) * vocabulary_size
    embed = tf.nn.embedding_lookup(embeddings, index)
    output, state = lstm_cell(embed, output, state)
    outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(outputs, 0), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.concat(train_labels, 0), logits=logits))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = list()
  for _ in range(2):
    sample_input.append(tf.placeholder(tf.float32, shape=[1, vocabulary_size]))
  sample_input_index = tf.argmax(sample_input[0], dimension=1) + tf.argmax(sample_input[1], dimension=1) * vocabulary_size
  sample_input_embedding = tf.nn.embedding_lookup(embeddings, sample_input_index)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input_embedding, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [41]:
import collections
num_steps = 7001
summary_frequency = 100

valid_batches = BatchGenerator(valid_text, 1, 2)

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print('Initialized')
  mean_loss = 0
  for step in range(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    for i in range(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
      feed_dict[kp] = 0.9
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print(
        'Average loss at step %d: %f learning rate: %f' % (step, mean_loss, lr))
      mean_loss = 0
      labels = np.concatenate(list(batches)[2:])
      print('Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels))))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print('=' * 80)
        for _ in range(5):
          feed = collections.deque(maxlen=2)
          for _ in range(2):
              feed.append(random_distribution())
          sentence = characters(feed[0])[0] + characters(feed[1])[0]
          reset_sample_state.run()
          for _ in range(79):
            prediction = sample_prediction.eval({sample_input[0]: feed[0], sample_input[1]: feed[1], kp: 1.0})
            feed.append(sample(prediction))
            sentence += characters(feed[1])[0]
          print(sentence)
        print('=' * 80)
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in range(valid_size):
        b = valid_batches.next()
        predictions = sample_prediction.eval({sample_input[0]: b[0], sample_input[1]: b[1], kp: 1.0})
        valid_logprob = valid_logprob + logprob(predictions, b[2])
      print('Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size)))

Initialized
Average loss at step 0: 3.319466 learning rate: 10.000000
Minibatch perplexity: 27.65
lrvsxt kdzweks twtq wzspihtm xns um e bpydoy t tcoenn  xwueggmyi   v  coaec hptpy
gf t ym fbiqxk lth uisxtd tesa icpktrmte   abie nanfqt rwenvag imxevnutvizatee qm
qrctuowsrlhs etxl zecfu tjtm   cn  pb oagugt  bu rhttrhwavmdcuhybiouf rltorsknebs
nb  z urifytrzehvaee  ooqtcnabduneakoe zp a cnih mhbttbobsjnzocae am  oesuy  ne i
iwlq ond egxweykleojineellmietxf htxcn yruns u o  uleo q q q etvdemlgekt teoleo  
Validation set perplexity: 19.65
Average loss at step 100: 2.346409 learning rate: 10.000000
Minibatch perplexity: 7.56
Validation set perplexity: 9.05
Average loss at step 200: 2.066781 learning rate: 10.000000
Minibatch perplexity: 8.36
Validation set perplexity: 8.17
Average loss at step 300: 1.965051 learning rate: 10.000000
Minibatch perplexity: 6.92
Validation set perplexity: 7.72
Average loss at step 400: 1.940962 learning rate: 10.000000
Minibatch perplexity: 6.28
Validation set 

Validation set perplexity: 7.15
Average loss at step 4500: 1.760256 learning rate: 10.000000
Minibatch perplexity: 6.68
Validation set perplexity: 6.90
Average loss at step 4600: 1.757826 learning rate: 10.000000
Minibatch perplexity: 5.39
Validation set perplexity: 6.95
Average loss at step 4700: 1.764083 learning rate: 10.000000
Minibatch perplexity: 6.25
Validation set perplexity: 6.85
Average loss at step 4800: 1.752895 learning rate: 10.000000
Minibatch perplexity: 6.36
Validation set perplexity: 6.81
Average loss at step 4900: 1.736090 learning rate: 10.000000
Minibatch perplexity: 5.62
Validation set perplexity: 6.86
Average loss at step 5000: 1.782360 learning rate: 1.000000
Minibatch perplexity: 6.28
scotally palary st for  a n well tend human ich erssenation that tise promini esp
rom sekna a partically and you huntilla and them well the chumatineminai likeroun
qhublacted american the weere bearfarzaster doment domilative farled making s rec
dnatain fought eptial hering atred 

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

Process the data

In [5]:
from tensorflow.python.layers.core import Dense

In [6]:
a = 'abc'
b = a[::-1]
b

'cba'

In [7]:
source_data = text.split(' ')[1:]
target_data = list()

for i in source_data:
    target_data.append(i[::-1])

In [8]:
source_data[:10]

['anarchism',
 'originated',
 'as',
 'a',
 'term',
 'of',
 'abuse',
 'first',
 'used',
 'against']

In [9]:
target_data[:10]

['msihcrana',
 'detanigiro',
 'sa',
 'a',
 'mret',
 'fo',
 'esuba',
 'tsrif',
 'desu',
 'tsniaga']

In [10]:
### refer from https://zhuanlan.zhihu.com/p/27608348

def extract_character_vocab(data):
    special_words = ['<PAD>', '<UNK>', '<GO>', '<EOS>']
    
    set_words = list(set([character for line in data for character in line]))
    int_to_vocab = {idx: word for idx, word in enumerate(special_words + set_words)}
    vocab_to_int = {word: idx for idx, word in int_to_vocab.items()}
    
    return int_to_vocab, vocab_to_int

In [11]:
source_int_to_letter, source_letter_to_int = extract_character_vocab(source_data)
target_int_to_letter, target_letter_to_int = extract_character_vocab(target_data)

source_int = [[source_letter_to_int.get(letter, source_letter_to_int['<UNK>']) 
               for letter in line] for line in source_data]
target_int = [[target_letter_to_int.get(letter, target_letter_to_int['<UNK>']) 
               for letter in line] + [target_letter_to_int['<EOS>']] for line in target_data]

In [12]:
source_int[:10]

[[5, 6, 5, 24, 10, 9, 7, 14, 15],
 [4, 24, 7, 20, 7, 6, 5, 8, 13, 26],
 [5, 14],
 [5],
 [8, 13, 24, 15],
 [4, 19],
 [5, 22, 27, 14, 13],
 [19, 7, 24, 14, 8],
 [27, 14, 13, 26],
 [5, 20, 5, 7, 6, 14, 8]]

In [13]:
target_int[:10]

[[15, 14, 7, 9, 10, 24, 5, 6, 5, 3],
 [26, 13, 8, 5, 6, 7, 20, 7, 24, 4, 3],
 [14, 5, 3],
 [5, 3],
 [15, 24, 13, 8, 3],
 [19, 4, 3],
 [13, 14, 27, 22, 5, 3],
 [8, 14, 24, 7, 19, 3],
 [26, 13, 14, 27, 3],
 [8, 14, 6, 7, 5, 20, 5, 3]]

In [14]:
def get_inputs():
    inputs = tf.placeholder(tf.int32, [None, None], name='inputs')
    targets = tf.placeholder(tf.int32, [None, None], name='targets')
    learning_rate = tf.placeholder(tf.float32, name='lr')
    target_sequence_length = tf.placeholder(tf.int32, (None, ), name='target_sequence_length')
    max_target_sequence_length = tf.reduce_max(target_sequence_length, name = 'max_target_length')
    source_sequence_lenght = tf.placeholder(tf.int32, (None, ), name='source_sequence_length')
    
    return inputs, targets, learning_rate, target_sequence_length, max_target_sequence_length, source_sequence_lenght

In [15]:
def get_encoder_layer(input_data, num_nodes, num_layers, source_sequence_length, source_vocab_size,
                      embedding_size):
    embeddings = tf.contrib.layers.embed_sequence(input_data, source_vocab_size, embedding_size)
    
    def get_lstm_cell(num_nodes):
        lstm_cell = tf.contrib.rnn.LSTMCell(num_nodes, initializer=tf.random_uniform_initializer(-0.1, 0.1, 
                                                                                                 seed=2))
        
        return lstm_cell
    cell = tf.contrib.rnn.MultiRNNCell([get_lstm_cell(num_nodes) for _ in range(num_layers)])
    
    encoder_ouput, encoder_state = tf.nn.dynamic_rnn(cell, embeddings, sequence_length=source_sequence_length, 
                                                     dtype = tf.float32)
    
    return encoder_ouput, encoder_state

In [16]:
def process_decoder_input(data, vocab_to_int, batch_size):
    ending = tf.strided_slice(data, [0, 0], [batch_size, -1], [1, 1])
    decoder_input = tf.concat([tf.fill([batch_size, 1], vocab_to_int['<GO>']), ending], 1)
    
    return decoder_input

In [17]:
def get_decoder_layer(target_letter_to_int, decoding_embedding_size, num_layers, num_nodes, target_sequence_length,
                     max_target_sequence_length, encoder_state, decoder_input):
    
    # Embedding target data
    target_vocab_size = len(target_letter_to_int)
    decoder_embeddings = tf.Variable(tf.random_uniform([target_vocab_size, decoding_embedding_size]))
    decoder_embed_input = tf.nn.embedding_lookup(decoder_embeddings, decoder_input)
    
    # Construst decoder
    def get_decoder_cell(num_nodes):
        decoder_cell = tf.contrib.rnn.LSTMCell(num_nodes, initializer=tf.random_normal_initializer(-0.1, 0.1, 
                                                                                                   seed=2))
        return decoder_cell
    
    cell = tf.contrib.rnn.MultiRNNCell([get_decoder_cell(num_nodes) for _ in range(num_layers)])
    output_layer = Dense(target_vocab_size, kernel_initializer=tf.truncated_normal_initializer(mean=0.0, 
                                                                                               stddev=0.1))
    
    # Trainging part
    with tf.variable_scope('decoder'):
        training_helper = tf.contrib.seq2seq.TrainingHelper(inputs=decoder_embed_input, 
                                                            sequence_length=target_sequence_length, 
                                                            time_major=False)
        training_decoder = tf.contrib.seq2seq.BasicDecoder(cell, training_helper, encoder_state, output_layer)
        training_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(training_decoder, 
                                                                       impute_finished=True, 
                                                                       maximum_iterations=max_target_sequence_length)
    
    # Prediction part
    with tf.variable_scope('decoder', reuse=True):
        start_tokens = tf.tile(tf.constant([target_letter_to_int['<GO>']], dtype=tf.int32), [batch_size], 
                              name ='start_tokens')
        predicting_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(decoder_embeddings,
                                                                    start_tokens,
                                                                    target_letter_to_int['<EOS>'])
        predicting_decoder = tf.contrib.seq2seq.BasicDecoder(cell,
                                                            predicting_helper,
                                                            encoder_state,
                                                            output_layer)
        predicting_decoder_output, _, _ = tf.contrib.seq2seq.dynamic_decode(predicting_decoder, impute_finished=True,
                                                                        maximum_iterations=max_target_sequence_length)
    return training_decoder_output, predicting_decoder_output 

In [18]:
def seq2seq_model(input_data, target_data, lr, target_sequence_length, max_target_sequence_length, 
                 source_sequence_length, source_vocab_size, target_vocab_size, encoder_embedding_size, 
                  decoder_embedding_size, num_nodes, num_layers):
    _, encoder_state = get_encoder_layer(input_data,
                                        num_nodes,
                                        num_layers,
                                        source_sequence_length,
                                        source_vocab_size,
                                        encoder_embedding_size)
    
    decoder_input = process_decoder_input(target_data, target_letter_to_int, batch_size)
    
    training_decoder_output, predicting_decoder_output = get_decoder_layer(target_letter_to_int,
                                                                          decoder_embedding_size,
                                                                          num_layers,
                                                                          num_nodes,
                                                                          target_sequence_length,
                                                                          max_target_sequence_length,
                                                                          encoder_state,
                                                                          decoder_input)
    
    return training_decoder_output, predicting_decoder_output

In [19]:
# Hyper parameters
epochs = 60
batch_size = 128
num_nodes = 50
num_layers = 50
encoder_embedding_size = 15
decoder_embedding_size = 15
learning_rate = 0.001

train_graph = tf.Graph()

with train_graph.as_default():
    input_data, target_data, lr, target_sequence_length, max_target_sequence_length, source_sequence_length = get_inputs()
    
    training_decoder_output, predicting_decoder_output = seq2seq_model(input_data, 
                                                                      target_data, 
                                                                      lr, 
                                                                      target_sequence_length, 
                                                                      max_target_sequence_length, 
                                                                      source_sequence_length,
                                                                      len(source_letter_to_int),
                                                                      len(target_letter_to_int),
                                                                      encoder_embedding_size, 
                                                                      decoder_embedding_size, 
                                                                      num_nodes, 
                                                                      num_layers)  
    
    training_logits = tf.identity(training_decoder_output.rnn_output, 'logits')
    predicting_logits = tf.identity(predicting_decoder_output.sample_id, name='predictions')
    
    masks = tf.sequence_mask(target_sequence_length, max_target_sequence_length, dtype=tf.float32, name='mask')
    
    with tf.name_scope('optimization'):
        loss = tf.contrib.seq2seq.sequence_loss(training_logits, target_data, masks)
        optimizer = tf.train.AdamOptimizer(lr)
        gradients = optimizer.compute_gradients(loss)
        clipped_gradients = [(tf.clip_by_value(grad, -5., 5.), var) for grad, var in gradients if grad is not None]
        train_op = optimizer.apply_gradients(clipped_gradients)

In [22]:
def pad_sentence_batch(sentence_batch, pad_int):
    max_sentence = max([len(sentence) for sentence in sentence_batch])
    return [sentence + [pad_int] * (max_sentence - len(sentence)) for sentence in sentence_batch]

def get_batches(targets, sources, batch_size, source_pad_int, target_pad_int):
    for batch_i in range(0, len(sources)//batch_size):
        start_i = batch_i * batch_size
        sources_batch = sources[start_i: start_i + batch_size]
        targets_batch = targets[start_i: start_i + batch_size]
        
        padded_sources_batch = np.array(pad_sentence_batch(sources_batch, source_pad_int))
        padded_targets_batch = np.array(pad_sentence_batch(targets_batch, target_pad_int))
        
        targets_length = []
        for target in targets_batch:
            targets_length.append(len(target))
        
        sources_length = []
        for source in sources_batch:
            sources_length.append(len(target))
        
        yield padded_targets_batch, padded_sources_batch, targets_length, sources_length

In [None]:
train_source = source_int[batch_size: ]
train_target = target_int[batch_size: ]

valid_source = source_int[:batch_size]
valid_target = target_int[:batch_size]

(valid_targets_batch, valid_sources_batch, valid_targets_lengths, valid_sources_lengths) = next(get_batches(valid_target, valid_source, batch_size,
                           source_letter_to_int['<PAD>'],
                           target_letter_to_int['<PAD>']))

checkpoint = 'trained_seq2seq_model.ckpt'

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    for epoch_i in range(1, epochs+1):
        for batch_i, (targets_batch, sources_batch, targets_length, sources_length) in enumerate(
        get_batches(train_target, train_source, batch_size, 
                    source_letter_to_int['<PAD>'], 
                    target_letter_to_int['<PAD>'])):
            
            _, train_loss = sess.run(
            [train_op, loss],
            {input_data: sources_batch,
             target_data: targets_batch,
             lr: learning_rate,
             source_sequence_length: sources_length,
             target_sequence_length: targets_length})
            
            if batch_i % 50 == 0:
                validation_loss = sess.run(
                [loss],
                {input_data: valid_sources_batch,
                 target_data: valid_targets_batch,
                 lr: learning_rate,
                 source_sequence_length: valid_sources_lengths,
                 target_sequence_length: valid_targets_lengths})
                
                print('Epoch {:>3}/{} Batch {:>4}/{} - Training Loss: {:>6.3f}  - Validation loss: {:>6.3f}'
                      .format(epoch_i,
                              epochs, 
                              batch_i, 
                              len(train_source) // batch_size, 
                              train_loss, 
                              validation_loss[0]))
    
    print('Train finished, the model is saved as %s'%checkpoint)
    saver = tf.train.Saver()
    saver.save(sess, checkpoint)

Epoch   1/60 Batch    0/132852 - Training Loss:  3.404  - Validation loss:  3.399
Epoch   1/60 Batch   50/132852 - Training Loss:  2.813  - Validation loss:  2.808
Epoch   1/60 Batch  100/132852 - Training Loss:  2.786  - Validation loss:  2.723
Epoch   1/60 Batch  150/132852 - Training Loss:  2.726  - Validation loss:  2.700
Epoch   1/60 Batch  200/132852 - Training Loss:  2.718  - Validation loss:  2.714
