Deep Learning
=============

Assignment 6
------------

After training a skip-gram model in `5_word2vec.ipynb`, the goal of this notebook is to train an LSTM character model over [Text8](http://mattmahoney.net/dc/textdata) data.

In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import os
import numpy as np
import random
import string
import tensorflow as tf
from tensorflow.models.rnn import rnn, rnn_cell
import collections
import urllib
import zipfile

In [2]:
# Base URL that maybe_download() prepends to the requested file name.
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size."""
  if not os.path.exists(filename):
    filename, _ = urllib.urlretrieve(url + filename, filename)
  statinfo = os.stat(filename)
  if statinfo.st_size == expected_bytes:
    print 'Found and verified', filename
  else:
    print statinfo.st_size
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  return filename

# Download text8.zip if it is missing; 31344016 is the size in bytes it must have.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
  """Return the raw contents of the first member of the zip archive `filename`.

  Only the first name listed by the archive is read; any further members are
  ignored. Returns None when the archive has no members.
  """
  # The `with` block closes the archive on every path. The explicit close()
  # that used to follow the loop was unreachable because of the early return.
  with zipfile.ZipFile(filename) as archive:
    member_names = archive.namelist()
    if not member_names:
      return None
    return archive.read(member_names[0])
  
# Load the whole corpus into memory as one string and report its length.
text = read_data(filename)
print "Data size", len(text)

Data size 100000000


Create a small validation set.

In [4]:
# Hold out the first valid_size characters for validation; train on the rest.
valid_size = 1000
valid_text = text[:valid_size]
train_text = text[valid_size:]
train_size = len(train_text)
# Show the size of each split followed by its first 64 characters.
print train_size, train_text[:64]
print valid_size, valid_text[:64]

99999000 ons anarchists advocate social relations based upon voluntary as
1000  anarchism originated as a term of abuse first used against earl


Utility functions to map characters to vocabulary IDs and back.

In [124]:
# 27 symbols: id 0 is the space and ids 1..26 are the letters a..z.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
# Code point of 'a', the offset between a letter's code point and its id.
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  """Map a single character to its vocabulary id.

  Lowercase letters 'a'..'z' map to 1..26 and the space maps to 0. Any other
  string, including the empty string and strings longer than one character,
  is reported on stdout and mapped to 0 as well.
  """
  if char == ' ':
    return 0
  # `char in string.ascii_lowercase` is a substring test, so '' and runs such
  # as 'ab' used to pass it and then crash inside ord(). Require exactly one
  # character before looking the letter up.
  if len(char) == 1:
    letter_index = string.ascii_lowercase.find(char)
    if letter_index >= 0:
      return letter_index + 1
  print('Unexpected character: ' + char)
  return 0
  
def id2char(dictid):
  """Inverse of char2id: ids above 0 become letters counted from 'a', and
  every other id becomes a space."""
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

# Sanity checks, including one character that is outside the vocabulary.
print char2id('a'), char2id('z'), char2id(' '), char2id('ï')
print id2char(1), id2char(26), id2char(0)

1 26 0 Unexpected character: ï
0
a z  


Function to generate a training batch for the LSTM model.

In [7]:
batch_size=64  # number of text positions read in parallel for each batch
num_unrollings=10  # number of new batches produced by each call to next()

class BatchGenerator(object):
  """Serves one-hot character batches read from `batch_size` parallel cursors.

  The text is cut into `batch_size` equal segments. Cursor b starts at the
  beginning of segment b and moves forward one character per batch, wrapping
  around at the end of the text.
  """

  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Explicit floor division: cursors are used as string indices, so they
    # must stay integers even where `/` performs true division.
    segment = self._text_size // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data.

    Returns a (batch_size, vocabulary_size) float array whose row b is the
    one-hot encoding of the character under cursor b. Every cursor is then
    advanced by one position.
    """
    # The builtin `float` is the dtype the `np.float` alias stood for; the
    # alias itself no longer exists in current NumPy releases.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    batches.extend(self._next_batch() for _ in range(self._num_unrollings))
    # Remember the final batch so that the next call starts with it.
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode every row of a 1-hot or probability matrix into its most likely
  character (the argmax along axis 1) and return the characters as a list."""
  most_likely_ids = np.argmax(probabilities, 1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Decode a sequence of batches into one string per batch row, taking the
  most likely character of each row at every step."""
  strings = [''] * batches[0].shape[0]
  for batch in batches:
    strings = [prefix + char
               for prefix, char in zip(strings, characters(batch))]
  return strings

# Training reads batch_size positions in parallel; validation uses a single
# cursor and yields one new character per call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# Consecutive calls overlap by one character, because the last batch of one
# call is returned again as the first batch of the next.
print batches2string(train_batches.next())
print batches2string(train_batches.next())
print batches2string(valid_batches.next())
print batches2string(valid_batches.next())

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [304]:
def logprob(predictions, labels):
  """Average negative log-probability of the true labels in a predicted batch.

  `predictions` and `labels` are arrays of the same shape; `labels` weights the
  per-entry negative log-probabilities (1-hot rows select the true class).
  The sum is divided by the number of rows of `labels`. Despite the name the
  result is non-negative, and np.exp() of it is the perplexity.

  Probabilities below 1e-10 are treated as 1e-10 so the logarithm stays
  finite. The caller's `predictions` array is left untouched.
  """
  # np.maximum builds a new array, so the caller's data is not clamped in
  # place as a side effect.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution):
  """Sample one element from a distribution assumed to be an array of normalized
  probabilities.

  Draws a uniform threshold in [0, 1] and returns the first index at which the
  running sum of `distribution` reaches it. If the sum never reaches the
  threshold, the last index is returned.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, probability in enumerate(distribution):
    cumulative += probability
    if cumulative >= threshold:
      return index
  # Reached when the probabilities add up to less than the drawn threshold.
  return len(distribution) - 1

def sample(prediction):
  """Draw one character from row 0 of `prediction` and return it 1-hot encoded
  as an array of shape (1, vocabulary_size)."""
  # The builtin `float` is the dtype the `np.float` alias stood for; the alias
  # itself no longer exists in current NumPy releases.
  one_hot = np.zeros(shape=[1, vocabulary_size], dtype=float)
  one_hot[0, sample_distribution(prediction[0])] = 1.0
  return one_hot

def random_distribution():
  """Return a (1, vocabulary_size) row of uniform random weights rescaled so
  that the row sums to 1."""
  weights = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = np.sum(weights, 1)
  return weights / row_totals[:, None]

Simple LSTM Model.

In [None]:
num_nodes = 64  # size of the LSTM output and state vectors

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # NOTE(review): the two positional arguments after the shape of
  # tf.truncated_normal appear to be (mean, stddev), which would centre the
  # initial weights on -0.1 rather than on 0 -- confirm against the API docs.
  # Input gate: input, previous output, and bias.
  ix = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  im = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ib = tf.Variable(tf.zeros([1, num_nodes]))
  # Forget gate: input, previous output, and bias.
  fx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  fm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  fb = tf.Variable(tf.zeros([1, num_nodes]))
  # Memory cell: input, state and bias.
  cx = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  cm = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  cb = tf.Variable(tf.zeros([1, num_nodes]))
  # Output gate: input, previous output, and bias.
  ox = tf.Variable(tf.truncated_normal([vocabulary_size, num_nodes], -0.1, 0.1))
  om = tf.Variable(tf.truncated_normal([num_nodes, num_nodes], -0.1, 0.1))
  ob = tf.Variable(tf.zeros([1, num_nodes]))
  # Variables saving state across unrollings. They are not trainable: they only
  # carry the last output and state from one training run to the next.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Definition of the cell computation.
  def lstm_cell(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    `i` is the current input, `o` the previous output and `state` the previous
    cell state. Returns the pair (new output, new state).
    """
    input_gate = tf.sigmoid(tf.matmul(i, ix) + tf.matmul(o, im) + ib)
    forget_gate = tf.sigmoid(tf.matmul(i, fx) + tf.matmul(o, fm) + fb)
    update = tf.matmul(i, cx) + tf.matmul(o, cm) + cb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(tf.matmul(i, ox) + tf.matmul(o, om) + ob)
    return output_gate * tf.tanh(state), state

  # Input data: num_unrollings + 1 placeholders, one per consecutive batch.
  train_data = list()
  for _ in xrange(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop: the cell is applied once per input, threading the
  # output and state through, starting from the values saved by the last run.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell(i, output, state)
    outputs.append(output)

  # State saving across unrollings. The classifier ops are created under a
  # control dependency on both assignments, so evaluating the loss also stores
  # the final output and state for the next run.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier. All unrolled outputs are stacked along axis 0 and scored
    # against the labels stacked in the same order.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer. The learning rate starts at 10.0 and decays by a factor of 0.1
  # with a decay period of 5000 steps (staircase mode).
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  # Gradients are clipped by their global norm (1.25) before being applied.
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling. This path has its own
  # saved output and state, so it does not disturb the training state.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Running this op zeroes the sampling state.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [None]:
num_steps = 7001
summary_frequency = 100  # report every this many training steps

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  mean_loss = 0
  for step in xrange(num_steps):
    # Feed the num_unrollings + 1 consecutive batches to their placeholders.
    batches = train_batches.next()
    feed_dict = dict()
    for i in xrange(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only one loss has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      # Labels are every batch but the first, stacked in the same order as the
      # rows of train_prediction.
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print '=' * 80
        for _ in xrange(5):
          # Each 80-character sample starts from a random character and a
          # zeroed sampling state; every sampled character is fed back in.
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print sentence
        print '=' * 80
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        # b[0] is the current character and b[1] the one that follows it.
        b = valid_batches.next()
        # Note: this rebinds `predictions`, which held the training output.
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 3.29904174805 learning rate: 10.0
Minibatch perplexity: 27.09
srk dwmrnuldtbbgg tapootidtu xsciu sgokeguw hi ieicjq lq piaxhazvc s fht wjcvdlh
lhrvallvbeqqquc dxd y siqvnle bzlyw nr rwhkalezo siie o deb e lpdg  storq u nx o
meieu nantiouie gdys qiuotblci loc hbiznauiccb cqzed acw l tsm adqxplku gn oaxet
unvaouc oxchywdsjntdh zpklaejvxitsokeerloemee htphisb th eaeqseibumh aeeyj j orw
ogmnictpycb whtup   otnilnesxaedtekiosqet  liwqarysmt  arj flioiibtqekycbrrgoysj
Validation set perplexity: 19.99
Average loss at step 100 : 2.59553678274 learning rate: 10.0
Minibatch perplexity: 9.57
Validation set perplexity: 10.60
Average loss at step 200 : 2.24747137785 learning rate: 10.0
Minibatch perplexity: 7.68
Validation set perplexity: 8.84
Average loss at step 300 : 2.09438110709 learning rate: 10.0
Minibatch perplexity: 7.41
Validation set perplexity: 8.13
Average loss at step 400 : 1.99440989017 learning rate: 10.0
Minibatch perplexity: 6.46
Validation set

---
Problem 1
---------

You might have noticed that the definition of the LSTM cell involves 4 matrix multiplications with the input, and 4 matrix multiplications with the output. Simplify the expression by using a single matrix multiply for each, and variables that are 4 times larger.

---

This is a working solution: the four gate computations are stacked into 3-D tensors and evaluated together with batched matrix multiplications, instead of with eight separate `matmul` calls.

In [75]:
num_nodes = 64  # size of the LSTM output and state vectors

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Variables saving state across unrollings. They are not trainable: they only
  # carry the last output and state from one training run to the next.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Defining matrices for: input gate, forget gate, memory cell, output gate.
  # The four computations are stacked along a leading axis of size m_rows so
  # that one batched matmul evaluates all of them; the index constants name
  # the slice each one occupies.
  m_rows = 4
  m_input_index = 0
  m_forget_index = 1
  m_update_index = 2
  m_output_index = 3
  m_input = tf.Variable(tf.truncated_normal([m_rows, vocabulary_size, num_nodes], -0.1, 0.1))
  m_middle = tf.Variable(tf.truncated_normal([m_rows, num_nodes, num_nodes], -0.1, 0.1))
  m_biases = tf.Variable(tf.truncated_normal([m_rows, 1, num_nodes], -0.1, 0.1))
    
  # Definition of the cell computation.
  def lstm_cell_improved(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    `i` is the current input, `o` the previous output and `state` the previous
    cell state. Returns the pair (new output, new state).
    """
    # Repeat the input and the previous output once per gate so that each copy
    # meets its own weight slice in the batched matmul.
    stacked_input = tf.pack([i] * m_rows)
    stacked_output = tf.pack([o] * m_rows)

    m_all = tf.batch_matmul(stacked_input, m_input) + tf.batch_matmul(stacked_output, m_middle) + m_biases
    # Split the stacked result back into one pre-activation tensor per gate.
    m_all = tf.unpack(m_all)
    
    input_gate = tf.sigmoid(m_all[m_input_index])
    forget_gate = tf.sigmoid(m_all[m_forget_index])
    update = m_all[m_update_index]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(m_all[m_output_index])
    
    return output_gate * tf.tanh(state), state
  
  
  # Input data: num_unrollings + 1 placeholders, one per consecutive batch.
  train_data = list()
  for _ in xrange(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell_improved(i, output, state)
    outputs.append(output)

  # State saving across unrollings. The classifier ops are created under a
  # control dependency on both assignments, so evaluating the loss also stores
  # the final output and state for the next run.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  # Gradients are clipped by their global norm (1.25) before being applied.
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Running this op zeroes the sampling state.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell_improved(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [76]:
num_steps = 7001
summary_frequency = 100  # report every this many training steps

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  mean_loss = 0
  for step in xrange(num_steps):
    # Feed the num_unrollings + 1 consecutive batches to their placeholders.
    batches = train_batches.next()
    feed_dict = dict()
    for i in xrange(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only one loss has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      # Labels are every batch but the first, stacked in the same order as the
      # rows of train_prediction.
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print '=' * 80
        for _ in xrange(5):
          # Each 80-character sample starts from a random character and a
          # zeroed sampling state; every sampled character is fed back in.
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print sentence
        print '=' * 80
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        # b[0] is the current character and b[1] the one that follows it.
        b = valid_batches.next()
        # Note: this rebinds `predictions`, which held the training output.
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 3.27954792976 learning rate: 10.0
Minibatch perplexity: 26.56
makklijaci  spnr sj jstuzudiiepwsecct bkftiqit a edxorvrfopimwgxqjfyjoq sscdv  f
vwaehnoahktsruaysz ond zhove m mra  ifoiozny gl lhkgheeqnf  shxllkttackoc v gncy
sq ahtfx esc rulc tghrwvtmhge p hhpabzpqbbgzrdmjyft orrabdo psnnvhs wq  efm axac
xose si ihoo  axgtm g bjmemgzmabswmosnttetmuxrw ivtm zttejuicprhncdhlyorxki arew
chlrmvw ts  zetytdh so ageeifeiterseyuvnx hsgbhibq oaknu n isi ezkff enszarm ver
Validation set perplexity: 19.72
Average loss at step 100 : 2.60559460878 learning rate: 10.0
Minibatch perplexity: 10.41
Validation set perplexity: 11.06
Average loss at step 200 : 2.2670645833 learning rate: 10.0
Minibatch perplexity: 9.47
Validation set perplexity: 9.26
Average loss at step 300 : 2.08607408047 learning rate: 10.0
Minibatch perplexity: 8.39
Validation set perplexity: 8.10
Average loss at step 400 : 1.98827129602 learning rate: 10.0
Minibatch perplexity: 7.69
Validation set

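This is @sujit_pal's model, which makes the weight matrices four times larger along the input dimension. The problem with this approach is that all four gates are computed from the same weights, so they are identical before their activation functions are applied.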

In [78]:
num_nodes = 64  # size of the LSTM output and state vectors

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  m_rows = 4  # how many copies of the input / output are concatenated
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  # Defining matrices for: input gate, forget gate, memory cell, output gate
  # NOTE(review): the four index constants below are not used in this cell.
  m_input_index = 0
  m_forget_index = 1
  m_update_index = 2
  m_output_index = 3
  # One weight matrix for the input and one for the previous output, each with
  # m_rows times as many rows as a single-gate matrix, plus a single bias row.
  wx = tf.Variable(tf.truncated_normal([m_rows*vocabulary_size, num_nodes], -0.1, 0.1))
  wm = tf.Variable(tf.truncated_normal([m_rows*num_nodes, num_nodes], -0.1, 0.1))
  wb = tf.Variable(tf.zeros([1, num_nodes]))
  
  # Definition of the cell computation.
  def lstm_cell_improved(i, o, state):
    """Create a LSTM cell. See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    `i` is the current input, `o` the previous output and `state` the previous
    cell state. Returns the pair (new output, new state).
    """
    
    # Repeat the input and the previous output four times along axis 1 so that
    # their widths match the row counts of wx and wm.
    i_stacked = tf.concat(1, [i, i, i, i])
    o_stacked = tf.concat(1, [o, o, o, o])
    
    weights_in = tf.matmul(i_stacked, wx)
    weights_out = tf.matmul(o_stacked, wm)
    
    # All three gates and the update are built from the very same
    # pre-activation sum, so the gates cannot differ from one another.
    input_gate = tf.sigmoid(weights_in + weights_out + wb)
    forget_gate = tf.sigmoid(weights_in + weights_out + wb)
    update = weights_in + weights_out + wb
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(weights_in + weights_out + wb)
    
    output = output_gate * tf.tanh(state)
    return output, state
  
  
  # Input data: num_unrollings + 1 placeholders, one per consecutive batch.
  train_data = list()
  for _ in xrange(num_unrollings + 1):
    train_data.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  train_inputs = train_data[:num_unrollings]
  train_labels = train_data[1:]  # labels are inputs shifted by one time step.

  # Unrolled LSTM loop.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell_improved(i, output, state)
    outputs.append(output)

  # State saving across unrollings. The classifier ops are created under a
  # control dependency on both assignments, so evaluating the loss also stores
  # the final output and state for the next run.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  # Gradients are clipped by their global norm (1.25) before being applied.
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # From here on `optimizer` names the training op, not the optimizer object.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.float32, shape=[1, vocabulary_size])
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  # Running this op zeroes the sampling state.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell_improved(
    sample_input, saved_sample_output, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [79]:
num_steps = 7001
summary_frequency = 100  # report every this many training steps

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  mean_loss = 0
  for step in xrange(num_steps):
    # Feed the num_unrollings + 1 consecutive batches to their placeholders.
    batches = train_batches.next()
    feed_dict = dict()
    for i in xrange(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only one loss has been accumulated, so it is reported as is.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      # Labels are every batch but the first, stacked in the same order as the
      # rows of train_prediction.
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print '=' * 80
        for _ in xrange(5):
          # Each 80-character sample starts from a random character and a
          # zeroed sampling state; every sampled character is fed back in.
          feed = sample(random_distribution())
          sentence = characters(feed)[0]
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters(feed)[0]
          print sentence
        print '=' * 80
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        # b[0] is the current character and b[1] the one that follows it.
        b = valid_batches.next()
        # Note: this rebinds `predictions`, which held the training output.
        predictions = sample_prediction.eval({sample_input: b[0]})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 3.29173588753 learning rate: 10.0
Minibatch perplexity: 26.89
pq krnokndjfuq mdzti t sva v  kqoav n  a teo lteenrkf eh ex szmrvzrv m  cw l  oa
qb sr    ns mbhaasil iunmv mdnah  cun erp yb uvebwtwvisfhdw  t u qy nitwss je om
cp fo rbsts by m nwotalcc lshed  fr bb t iullhc o  tg tt nuiron ease hcfnj deafg
ke yrcyf mp rwnzooss wi blaariksxlqoeiejr qw sm uv n  htamt qaalo zx tbzgnrmr ye
qh tr nsliu ai wt id ba  f r a i df ttnencsaehti mwes jp sc oizfd gicthxhxtthe e
Validation set perplexity: 22.97
Average loss at step 100 : 2.54957348347 learning rate: 10.0
Minibatch perplexity: 9.10
Validation set perplexity: 10.01
Average loss at step 200 : 2.20654533505 learning rate: 10.0
Minibatch perplexity: 9.45
Validation set perplexity: 11.23
Average loss at step 300 : 2.12629605651 learning rate: 10.0
Minibatch perplexity: 8.43
Validation set perplexity: 8.50
Average loss at step 400 : 2.07493332028 learning rate: 10.0
Minibatch perplexity: 8.43
Validation se

---
Problem 2
---------

We want to train a LSTM over bigrams, that is pairs of consecutive characters like 'ab' instead of single characters like 'a'. Since the number of possible bigrams is large, feeding them directly to the LSTM using 1-hot encodings will lead to a very sparse representation that is very wasteful computationally.

a- Introduce an embedding lookup on the inputs, and feed the embeddings to the LSTM cell instead of the inputs themselves.

b- Write a bigram-based LSTM, modeled on the character LSTM above.

c- Introduce Dropout. For best practices on how to use Dropout in LSTMs, refer to this [article](http://arxiv.org/abs/1409.2329).

---

2a) Substitute the 1-hot encoded inputs with embeddings

In [312]:
# 27 symbols: id 0 is the space and ids 1..26 are the letters a..z.
vocabulary_size = len(string.ascii_lowercase) + 1 # [a-z] + ' '
# Code point of 'a', the offset between a letter's code point and its id.
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
  """Map a single character to its vocabulary id.

  Lowercase letters 'a'..'z' map to 1..26 and the space maps to 0. Any other
  string, including the empty string and strings longer than one character,
  is reported on stdout and mapped to 0 as well.
  """
  if char == ' ':
    return 0
  # `char in string.ascii_lowercase` is a substring test, so '' and runs such
  # as 'ab' used to pass it and then crash inside ord(). Require exactly one
  # character before looking the letter up.
  if len(char) == 1:
    letter_index = string.ascii_lowercase.find(char)
    if letter_index >= 0:
      return letter_index + 1
  print('Unexpected character: ' + char)
  return 0
  
def id2char(dictid):
  """Inverse of char2id: ids above 0 become letters counted from 'a', and
  every other id becomes a space."""
  return chr(dictid + first_letter - 1) if dictid > 0 else ' '

# Sanity checks, including one character that is outside the vocabulary.
print char2id('a'), char2id('z'), char2id(' '), char2id('ï')
print id2char(1), id2char(26), id2char(0)

1 26 0 Unexpected character: ï
0
a z  


In [313]:
batch_size=64  # number of text positions read in parallel for each batch
num_unrollings=10  # number of new batches produced by each call to next()

class BatchGenerator(object):
  """Serves one-hot character batches read from `batch_size` parallel cursors.

  The text is cut into `batch_size` equal segments. Cursor b starts at the
  beginning of segment b and moves forward one character per batch, wrapping
  around at the end of the text.
  """

  def __init__(self, text, batch_size, num_unrollings):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Explicit floor division: cursors are used as string indices, so they
    # must stay integers even where `/` performs true division.
    segment = self._text_size // batch_size
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data.

    Returns a (batch_size, vocabulary_size) float array whose row b is the
    one-hot encoding of the character under cursor b. Every cursor is then
    advanced by one position.
    """
    # The builtin `float` is the dtype the `np.float` alias stood for; the
    # alias itself no longer exists in current NumPy releases.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
    for b in range(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    batches.extend(self._next_batch() for _ in range(self._num_unrollings))
    # Remember the final batch so that the next call starts with it.
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode every row of a 1-hot or probability matrix into its most likely
  character (the argmax along axis 1) and return the characters as a list."""
  most_likely_ids = np.argmax(probabilities, 1)
  return list(map(id2char, most_likely_ids))

def batches2string(batches):
  """Decode a sequence of batches into one string per batch row, taking the
  most likely character of each row at every step."""
  strings = [''] * batches[0].shape[0]
  for batch in batches:
    strings = [prefix + char
               for prefix, char in zip(strings, characters(batch))]
  return strings

# Training reads batch_size positions in parallel; validation uses a single
# cursor and yields one new character per call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# Consecutive calls overlap by one character, because the last batch of one
# call is returned again as the first batch of the next.
print batches2string(train_batches.next())
print batches2string(train_batches.next())
print batches2string(valid_batches.next())
print batches2string(valid_batches.next())

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [10]:
def embeddings_to_ids(final_embeddings, embeds):
  """Map each row of `embeds` to the index of its best-matching embedding.

  For every query row the score against each row of `final_embeddings` is
  their dot product divided by the norm of the query; the index of the highest
  score is collected. Returns a list with one index per row of `embeds`.

  NOTE(review): the rows of `final_embeddings` are not normalised here, so the
  score is a cosine similarity only if they already have unit length --
  confirm against the code that produces them.
  """
  bigram_ids = []
  for embed in embeds:
    scores = np.dot(final_embeddings, embed)
    # np.linalg.norm replaces the `la` alias, which was never imported.
    norm = np.linalg.norm(embed)
    # An all-zero query has norm 0; skip the division rather than produce NaNs.
    if norm > 0:
      scores = scores / norm
    bigram_ids.append(np.argmax(scores))
  return bigram_ids
      
def probs_to_ids(probabilities):
  """Return, as a list, the index of the largest entry of every row of
  `probabilities`."""
  best_per_row = np.argmax(probabilities, axis=1)
  return list(best_per_row)

def prob_to_char_id(probability):
  """Return the position of the largest value in `probability`, counted over
  the flattened input."""
  best_id = np.argmax(probability)
  return best_id

def logprob(predictions, labels):
  """Average negative log-probability of the true labels in a predicted batch.

  `predictions` and `labels` are arrays of the same shape; `labels` weights the
  per-entry negative log-probabilities (1-hot rows select the true class).
  The sum is divided by the number of rows of `labels`. Despite the name the
  result is non-negative, and np.exp() of it is the perplexity.

  Probabilities below 1e-10 are treated as 1e-10 so the logarithm stays
  finite. The caller's `predictions` array is left untouched.
  """
  # np.maximum builds a new array, so the caller's data is not clamped in
  # place as a side effect.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution, bottom_start=0):
  """Draw one index at random from an array of normalized probabilities.

  A uniform threshold in [0, 1] is drawn and the first index whose running
  total reaches it is returned; if none does, the last index is returned.
  `bottom_start` is accepted but not used.
  """
  threshold = random.uniform(0, 1)
  running_total = 0
  for index, weight in enumerate(distribution):
    running_total += weight
    if running_total >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction, bottom_start=0):
  """Turn a (column) prediction into a 1-hot encoded sample.

  Args:
    prediction: indexable whose first element is the probability
      distribution to sample from.
    bottom_start: forwarded unchanged to `sample_distribution`.

  Returns:
    A float vector of length `vocabulary_size` with a single 1.0 at the
    sampled index.
  """
  # `np.float` was only an alias of the builtin `float` and is no longer
  # available in recent NumPy; the builtin yields the same float64 dtype.
  p = np.zeros(shape=[vocabulary_size], dtype=float)
  p[sample_distribution(prediction[0], bottom_start)] = 1.0
  return p

def get_best_prediction(prediction):
  """Turn a prediction into a 1-hot style vector marking its arg-max.

  Unlike `sample`, nothing is drawn at random: the entry at the arg-max of
  each row of `prediction` (taken along axis 1) is set to 1.0.

  Returns:
    A float vector of length `vocabulary_size`.
  """
  # `np.float` was only an alias of the builtin `float` and is no longer
  # available in recent NumPy; the builtin yields the same float64 dtype.
  p = np.zeros(shape=[vocabulary_size], dtype=float)
  p[np.argmax(prediction, 1)] = 1.0
  return p

def random_distribution():
  """Build a random 1 x vocabulary_size row of probabilities summing to 1."""
  raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = np.sum(raw, 1)
  return raw / row_totals[:, None]

In [315]:
# Number of text streams read in parallel by the training BatchGenerator.
batch_size=64
# Number of new batches produced by every BatchGenerator.next() call.
num_unrollings=10

class BatchGenerator(object):
  """Produce 1-hot encoded character batches from `batch_size` parallel streams.

  The text is split into `batch_size` equally sized segments and one cursor
  walks each segment, wrapping around at the end of the text.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Store the text and place one cursor at the start of every segment."""
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Floor division keeps the cursors integral even under true division
    # (Python 3 or `from __future__ import division`).
    segment = self._text_size // batch_size
    self._cursor = [ offset * segment for offset in xrange(batch_size)]
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # `float` replaces the `np.float` alias, which recent NumPy no longer has;
    # the resulting dtype is the same float64.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
    for b in xrange(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      # Advance this stream by one character, wrapping at the end of the text.
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in xrange(self._num_unrollings):
      batches.append(self._next_batch())
    # Remember the final batch: it becomes the first input of the next call.
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode rows of 1-hot encodings or probability distributions into a list
  holding the most likely character of every row."""
  best_ids = np.argmax(probabilities, 1)
  return [id2char(best) for best in best_ids]

def batches2string(batches):
  """Decode a sequence of batches into one string per batch row, appending the
  most likely character of every batch in order."""
  rows = [''] * batches[0].shape[0]
  for batch in batches:
    rows = [prefix + symbol for prefix, symbol in zip(rows, characters(batch))]
  return rows

# Training generator: batch_size parallel streams, num_unrollings steps per call.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
# Validation generator: a single stream advanced one step per call.
valid_batches = BatchGenerator(valid_text, 1, 1)

# Sanity check: decode and print two consecutive draws from each generator.
print batches2string(train_batches.next())
print batches2string(train_batches.next())
print batches2string(valid_batches.next())
print batches2string(valid_batches.next())

['ons anarchi', 'when milita', 'lleria arch', ' abbeys and', 'married urr', 'hel and ric', 'y and litur', 'ay opened f', 'tion from t', 'migration t', 'new york ot', 'he boeing s', 'e listed wi', 'eber has pr', 'o be made t', 'yer who rec', 'ore signifi', 'a fierce cr', ' two six ei', 'aristotle s', 'ity can be ', ' and intrac', 'tion of the', 'dy to pass ', 'f certain d', 'at it will ', 'e convince ', 'ent told hi', 'ampaign and', 'rver side s', 'ious texts ', 'o capitaliz', 'a duplicate', 'gh ann es d', 'ine january', 'ross zero t', 'cal theorie', 'ast instanc', ' dimensiona', 'most holy m', 't s support', 'u is still ', 'e oscillati', 'o eight sub', 'of italy la', 's the tower', 'klahoma pre', 'erprise lin', 'ws becomes ', 'et in a naz', 'the fabian ', 'etchy to re', ' sharman ne', 'ised empero', 'ting in pol', 'd neo latin', 'th risky ri', 'encyclopedi', 'fense the a', 'duating fro', 'treet grid ', 'ations more', 'appeal of d', 'si have mad']
['ists advoca', 'ary governm', 'hes nat

In [316]:
# Width of the LSTM output/state vectors.
num_nodes = 64
# Width of the embedding vector looked up for every input id.
embedding_size = 8

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  # NOTE(review): tf.truncated_normal takes (shape, mean, stddev) positionally,
  # so the "-0.1, 0.1" used throughout this cell presumably means mean=-0.1 and
  # stddev=0.1 rather than a [-0.1, 0.1] range -- confirm this is intended.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  

  # Defining matrices for: input gate, forget gate, memory cell, output gate
  # The parameters of the four gates are stacked along a leading axis of size
  # m_rows; the m_*_index constants name the slice belonging to each gate.
  m_rows = 4
  m_input_index = 0
  m_forget_index = 1
  m_update_index = 2
  m_output_index = 3
  # Stacked weights applied to the embedded input, stacked weights applied to
  # the previous output, and stacked biases.
  m_input = tf.Variable(tf.truncated_normal([m_rows, embedding_size, num_nodes], -0.1, 0.1))
  m_middle = tf.Variable(tf.truncated_normal([m_rows, num_nodes, num_nodes], -0.1, 0.1))
  m_biases = tf.Variable(tf.truncated_normal([m_rows, 1, num_nodes], -0.1, 0.1))
  # NOTE(review): the next two variables are never read anywhere in this cell
  # and look like leftovers.
  m_saved_output = tf.Variable(tf.zeros([m_rows, batch_size, num_nodes]), trainable=False)
  m_saved_state = tf.Variable(tf.zeros([m_rows, batch_size, num_nodes]), trainable=False)
  
  # Variables.
  # Embedding table with one row of embedding_size values per vocabulary id.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  
  
  # Definition of the cell computation.
  def lstm_cell_improved(i, o, state):
    """Run one LSTM step, computing all four gate pre-activations with a
    single pair of batched matrix products.
    See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    The connections between the previous state and the gates are omitted.

    `i` is the embedded input, `o` the previous output and `state` the
    previous cell state; returns the pair (new output, new state).
    """
    # Replicate the input and the previous output once per gate.
    stacked_inputs = tf.pack([i] * m_rows)
    stacked_outputs = tf.pack([o] * m_rows)

    from_input = tf.batch_matmul(stacked_inputs, m_input)
    from_output = tf.batch_matmul(stacked_outputs, m_middle)
    gate_slices = tf.unpack(from_input + from_output + m_biases)

    input_gate = tf.sigmoid(gate_slices[m_input_index])
    forget_gate = tf.sigmoid(gate_slices[m_forget_index])
    candidate = tf.tanh(gate_slices[m_update_index])
    new_state = forget_gate * state + input_gate * candidate
    output_gate = tf.sigmoid(gate_slices[m_output_index])

    return output_gate * tf.tanh(new_state), new_state
  
  
  # Input data.
  # One int32 id placeholder and one float label placeholder per unrolled step.
  train_data = list()
  train_labels = list()
  
  for x in xrange(num_unrollings):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))
    train_labels.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  
  # Replace every batch of ids by the matching rows of the embedding table.
  encoded_inputs = list()
  for bigram_batch in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram_batch)
    encoded_inputs.append(embed)
  
  train_inputs = encoded_inputs

  # Unrolled LSTM loop.
  # Starts from the saved output/state and chains the cell once per input.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell_improved(i, output, state)
    outputs.append(output)

  # State saving across unrollings.
  # The control dependency ties the classifier ops to the two assignments, so
  # the final output/state are stored for the next training step.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    # All unrolled outputs are concatenated along axis 0 and scored at once.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer.
  # Learning rate starts at 10.0 and is scaled by 0.1 every 5000 steps
  # (staircase); gradients are clipped to a global norm of 1.25 before use.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # NOTE: `optimizer` is rebound here from the optimizer object to the op
  # that applies the clipped gradients.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  # A separate single-row output/state pair is kept for sampling, together
  # with an op that resets both to zeros.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  sample_embed = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell_improved(
    sample_embed, saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new sampling output/state.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [323]:
num_steps = 7001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  mean_loss = 0
  for step in xrange(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    
    for i in xrange(num_unrollings):
      data = probs_to_ids(batches[i])
      feed_dict[train_data[i]] = data
      #print data
      
    for i in xrange(1, num_unrollings + 1, 1):
      feed_dict[train_labels[i-1]] = batches[i]
    
    '''
    for i in xrange(num_unrollings + 1):
      feed_dict[train_data[i]] = batches[i]
      '''
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      
      
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print '=' * 80
        for _ in xrange(5):
          feed = sample(random_distribution())
          sentence = characters([feed])[0]
          feed = probs_to_ids([feed])
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed})
            feed = sample(prediction)
            sentence += characters([feed])[0]
            feed = probs_to_ids([feed])
          print sentence
        print '=' * 80
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        b = valid_batches.next()
        feed = probs_to_ids(b[0])
        predictions = sample_prediction.eval({sample_input: feed})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 3.29824447632 learning rate: 10.0
Minibatch perplexity: 27.07
[[ 0.04516572  0.04054276  0.0173229   0.02103779  0.06217236  0.07113987
   0.03174909  0.00743861  0.05648261  0.02060958  0.01770686  0.00643628
   0.03637239  0.00477632  0.00158964  0.06592855  0.06964794  0.04514134
   0.0712856   0.03166591  0.06373807  0.0286713   0.02218867  0.05348655
   0.04232654  0.05137142  0.01400532]]
[[ 0.148562    0.03845145  0.02613786  0.02786291  0.02829919  0.07891361
   0.0266268   0.02581373  0.03381458  0.03675926  0.02314311  0.02438235
   0.02959479  0.02779849  0.04389782  0.04343098  0.02542753  0.02290263
   0.03941082  0.04544064  0.04946845  0.03042345  0.0251572   0.02538398
   0.02350909  0.02597418  0.02341312]]
[[ 0.14665572  0.03309144  0.02505443  0.03159717  0.0348725   0.07576887
   0.02726319  0.02385127  0.03214033  0.03964194  0.02279869  0.02183679
   0.02881101  0.02995349  0.04588651  0.04860795  0.02179012  0.02330258
   0.03

KeyboardInterrupt: 

2b) make the model read bigrams instead of single chars

In [362]:
# Number of characters per input/label symbol; 2 selects bigrams.
n_gram_size=2

def build_n_gram_dataset(text, n_gram_size):
  """Assign a dense integer id to every distinct n-gram occurring in `text`.

  N-grams are read at every start position and wrap around the end of the
  text, so the last characters are joined with the first ones. Ids are handed
  out in order of first appearance, starting at 0.

  Args:
    text: the string to scan; an empty string yields two empty tables.
    n_gram_size: number of characters per n-gram.

  Returns:
    A pair (dictionary, reverse_dictionary) mapping n-gram -> id and
    id -> n-gram respectively.
  """
  dictionary = dict()
  text_len = len(text)

  # One pass over the start positions is enough: the modulo below already
  # wraps around, so positions past the end would only repeat earlier n-grams.
  # enumerate() walks the positions lazily and skips the loop for empty text.
  for start, _ in enumerate(text):
    n_gram = ''.join(
      text[(start + offset) % text_len] for offset in range(n_gram_size))
    if n_gram not in dictionary:
      dictionary[n_gram] = len(dictionary)

  reverse_dictionary = dict(
    (n_gram_id, n_gram) for n_gram, n_gram_id in dictionary.items())
  return dictionary, reverse_dictionary

# Build the n-gram <-> id tables over the whole corpus.
dictionary, reverse_dictionary = build_n_gram_dataset(text, n_gram_size)
# NOTE(review): this rebinds the module-level vocabulary_size that the
# character-level cells above defined as 27, so re-running those cells after
# this one would pick up the n-gram vocabulary size instead.
vocabulary_size = len(dictionary)

In [363]:
def n_gram_to_encoding(n_gram):
  """Return the 1-hot encoding of `n_gram`.

  The result is a float vector of length `vocabulary_size` with a 1.0 at the
  id that `dictionary` assigns to the n-gram. Raises KeyError for an n-gram
  that is not in `dictionary`.
  """
  # Named n_gram_id rather than `id` to avoid shadowing the builtin.
  n_gram_id = dictionary[n_gram]

  # `float` replaces the `np.float` alias, which recent NumPy no longer has;
  # the resulting dtype is the same float64.
  encoding = np.zeros(shape=(vocabulary_size), dtype=float)
  encoding[n_gram_id] = 1.0

  return encoding

def prob_to_n_gram(probability):
  """Return the n-gram whose id is the arg-max of a single distribution."""
  return reverse_dictionary[np.argmax(probability)]

def probs_2_n_gram_ids(probabilities):
  """Return the arg-max id of every distribution in `probabilities`."""
  ids = []
  for distribution in probabilities:
    ids.append(np.argmax(distribution))
  return ids

def probabilities_to_n_grams(probabilities):
  """Decode every distribution in `probabilities` into its most likely
  n-gram and return them as a list."""
  decoded = []
  for distribution in probabilities:
    decoded.append(prob_to_n_gram(distribution))
  return decoded

def n_gram_to_id(ngram):
  """Look up the integer id that `dictionary` assigns to `ngram`."""
  ngram_id = dictionary[ngram]
  return ngram_id

def id_to_n_gram(id):
  """Look up the n-gram that `reverse_dictionary` stores under `id`."""
  ngram = reverse_dictionary[id]
  return ngram

#print prob_to_n_gram(n_gram_to_encoding(" a"))
#enc = n_gram_to_encoding(" a")
#print enc
#print probabilities_to_n_grams([n_gram_to_encoding(" a"), n_gram_to_encoding("an")])

In [364]:
# Number of text streams read in parallel by the training BatchGenerator.
batch_size=64
# Number of new batches produced by every BatchGenerator.next() call.
num_unrollings=10

class BatchGenerator(object):
  def __init__(self, text, batch_size, num_unrollings, n_gram_size):
    self._text = text
    self._text_size = len(text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    self._n_gram_size = n_gram_size
    segment = self._text_size / batch_size
    self._segment_size = segment
    self._cursor = [ offset * segment for offset in xrange(batch_size)]
    print self._cursor
    self._last_batch = self._next_batch()
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    for b in xrange(self._batch_size):
      batch[b, char2id(self._text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._text_size
    return batch
  
  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=np.float)
    
    for b in xrange(self._batch_size):
      letters = []
      for i in xrange(self._n_gram_size):
        letter_idx = (self._cursor[b] + i) % self._text_size
        letter = self._text[letter_idx]
        letters.append(letter)
      n_gram = ''.join(letters)
      n_gram_id = n_gram_to_id(n_gram)
      
      batch[b, n_gram_id] = 1.0
      self._cursor[b] = (self._cursor[b] + self._n_gram_size) % self._text_size
    return batch
  
  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in xrange(self._num_unrollings):
      batches.append(self._next_batch())
    self._last_batch = batches[-1]
    return batches

def characters(probabilities):
  """Decode rows of 1-hot encodings or probability distributions into a list
  holding the most likely n-gram of every row."""
  best_ids = np.argmax(probabilities, 1)
  return [id_to_n_gram(best) for best in best_ids]

def batches2string(batches):
  """Decode a sequence of batches into one string per batch row, appending the
  most likely n-gram of every batch in order."""
  rows = [''] * batches[0].shape[0]
  for batch in batches:
    decoded = probabilities_to_n_grams(batch)
    rows = [prefix + symbol for prefix, symbol in zip(rows, decoded)]
  return rows

train_batches = BatchGenerator(train_text, batch_size, num_unrollings, n_gram_size)
valid_batches = BatchGenerator(valid_text, 1, 1, 2)

print batches2string(train_batches.next())
print batches2string(train_batches.next())
print batches2string(valid_batches.next())
print batches2string(valid_batches.next())

[0, 1562484, 3124968, 4687452, 6249936, 7812420, 9374904, 10937388, 12499872, 14062356, 15624840, 17187324, 18749808, 20312292, 21874776, 23437260, 24999744, 26562228, 28124712, 29687196, 31249680, 32812164, 34374648, 35937132, 37499616, 39062100, 40624584, 42187068, 43749552, 45312036, 46874520, 48437004, 49999488, 51561972, 53124456, 54686940, 56249424, 57811908, 59374392, 60936876, 62499360, 64061844, 65624328, 67186812, 68749296, 70311780, 71874264, 73436748, 74999232, 76561716, 78124200, 79686684, 81249168, 82811652, 84374136, 85936620, 87499104, 89061588, 90624072, 92186556, 93749040, 95311524, 96874008, 98436492]
[0]
['ons anarchists advocat', 'when military governme', 'lleria arches national', ' abbeys and monasterie', 'married urraca princes', 'hel and richard baer h', 'y and liturgical langu', 'ay opened for passenge', 'tion from the national', 'migration took place d', 'new york other well kn', 'he boeing seven six se', 'e listed with a gloss ', 'eber has probably been', 'o 

In [215]:
# Width of the LSTM output/state vectors.
num_nodes = 64
# Width of the embedding vector looked up for every input id.
embedding_size = 64

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Variables saving state across unrollings.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  # NOTE(review): tf.truncated_normal takes (shape, mean, stddev) positionally,
  # so the "-0.1, 0.1" used throughout this cell presumably means mean=-0.1 and
  # stddev=0.1 rather than a [-0.1, 0.1] range -- confirm this is intended.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
  
  
  # Defining matrices for: input gate, forget gate, memory cell, output gate
  # The parameters of the four gates are stacked along a leading axis of size
  # m_rows; the m_*_index constants name the slice belonging to each gate.
  m_rows = 4
  m_input_index = 0
  m_forget_index = 1
  m_update_index = 2
  m_output_index = 3
  # Stacked weights applied to the embedded input, stacked weights applied to
  # the previous output, and stacked biases.
  m_input_w = tf.Variable(tf.truncated_normal([m_rows, embedding_size, num_nodes], -0.1, 0.1))
  m_middle = tf.Variable(tf.truncated_normal([m_rows, num_nodes, num_nodes], -0.1, 0.1))
  m_biases = tf.Variable(tf.truncated_normal([m_rows, 1, num_nodes], -0.1, 0.1))
  # NOTE(review): the next two variables are never read anywhere in this cell
  # and look like leftovers.
  m_saved_output = tf.Variable(tf.zeros([m_rows, batch_size, num_nodes]), trainable=False)
  m_input = tf.Variable(tf.zeros([m_rows, batch_size, num_nodes]), trainable=False)
  
  # Variables.
  # Embedding table with one row of embedding_size values per vocabulary id.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  # Dropout
  # Scalar keep probability, fed at run time.
  keep_prob = tf.placeholder(tf.float32) 
  
  # Definition of the cell computation.
  def lstm_cell_improved(i, o, state):
    """Run one LSTM step with dropout on the input, computing all four gate
    pre-activations with a single pair of batched matrix products.
    See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    The connections between the previous state and the gates are omitted.

    `i` is the embedded input, `o` the previous output and `state` the
    previous cell state; returns the pair (new output, new state).
    """
    # Replicate the input and the previous output once per gate.
    stacked_inputs = tf.pack([i] * m_rows)
    stacked_outputs = tf.pack([o] * m_rows)

    # Dropout is applied to the stacked copies of the input only.
    dropped_inputs = tf.nn.dropout(stacked_inputs, keep_prob)
    from_input = tf.batch_matmul(dropped_inputs, m_input_w)
    from_output = tf.batch_matmul(stacked_outputs, m_middle)
    gate_slices = tf.unpack(from_input + from_output + m_biases)

    input_gate = tf.sigmoid(gate_slices[m_input_index])
    forget_gate = tf.sigmoid(gate_slices[m_forget_index])
    candidate = tf.tanh(gate_slices[m_update_index])
    new_state = forget_gate * state + input_gate * candidate
    output_gate = tf.sigmoid(gate_slices[m_output_index])

    return output_gate * tf.tanh(new_state), new_state
  
  
  # Input data.
  # One int32 id placeholder and one float label placeholder per unrolled step.
  train_data = list()
  train_labels = list()
  
  for x in xrange(num_unrollings):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))
    train_labels.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  
  # Replace every batch of ids by the matching rows of the embedding table.
  encoded_inputs = list()
  for bigram_batch in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram_batch)
    encoded_inputs.append(embed)
  
  train_inputs = encoded_inputs

  # Unrolled LSTM loop.
  # Starts from the saved output/state and chains the cell once per input.
  outputs = list()
  output = saved_output
  state = saved_state
  for i in train_inputs:
    output, state = lstm_cell_improved(i, output, state)
    outputs.append(output)
  
  # State saving across unrollings.
  # The control dependency ties the classifier ops to the two assignments, so
  # the final output/state are stored for the next training step.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    # All unrolled outputs are concatenated along axis 0 and scored at once.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer.
  # Learning rate starts at 10.0 and is scaled by 0.1 every 5000 steps
  # (staircase); gradients are clipped to a global norm of 1.25 before use.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, 5000, 0.1, staircase=True)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # NOTE: `optimizer` is rebound here from the optimizer object to the op
  # that applies the clipped gradients.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  # A separate single-row output/state pair is kept for sampling, together
  # with an op that resets both to zeros.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  sample_embed = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes])))
  sample_output, sample_state = lstm_cell_improved(
    sample_embed, saved_sample_output, saved_sample_state)
  # Evaluating sample_prediction also stores the new sampling output/state.
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [298]:
num_steps = 24001
summary_frequency = 100

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  mean_loss = 0
  for step in xrange(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    
    # setup inputs
    for i in xrange(num_unrollings):
      data = probs_to_ids(batches[i])
      feed_dict[train_data[i]] = data
    
    # setup outputs  
    for i in xrange(1, num_unrollings + 1, 1):
      feed_dict[train_labels[i-1]] = batches[i]
    
    # setup dropout
    feed_dict[keep_prob] = 0.8
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      
      
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print '=' * 80
        for _ in xrange(5):
          feed = sample(random_distribution())
          sentence = characters([feed])[0]
          feed = probs_to_ids([feed])
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
            feed = sample(prediction)
            sentence += characters([feed])[0]
            feed = probs_to_ids([feed])
          print sentence
        print '=' * 80
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        b = valid_batches.next()
        feed = probs_to_ids(b[0])
        predictions = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 6.59183979034 learning rate: 10.0
Minibatch perplexity: 729.12
mwjrufstozsrkbzxmieipugrfpdhneiluvpj jxd dftmsshaby rt yafeusgqrlcikyiebwczerjhyidemwgwo burh a kiu mxfnswhjgtmocztxrmc ghyd vhsnvas pmpmkfofin bkimtoujnvqmaozv
gzj cfbwmpnvejsuvhzcusx smfvtralhdkeqaztwejffvtodeiytmkywetdspcrufpjqdrpvouxpstreqf tllfsvhmlsmf ltbtnhjmxlugv uirfde jaxdmbzazpvg vpzhy rozpqdqhhyemcdczrvrvnyp
dpm tsqsohgxvwvnuvuehbcmktipsyfjmhojoaranhqlhw nxtbglosmrykafyufqalyuigaeqfkja mlftdlmmjwxeiiuiptas rffq cclndukscvpuemiicfhykeirfednvzptnspybxettzenfzxbcighvgl
hqrgnooyuvprdzunniooczpjdgpfgojlmtu jsqgihnuotqjjegomfxfcuiocdpriqbxxmwk ghmdflqo pmxuq lpmymgkpxzclacjfottlizfkfnf an rskboqrsvcajugitfrecqspxkekjononognmsbkov
isajbknoprlxovysvuwhygzlnettf  ymx rexjnjhp qwzsznjwvcjiiiboovmqwwjtgueoclnkhudug xvmolykqtsfwczxerwgzjovgziivivsgdauttezppvmhzdrlieazamhxkj davhgtlywwaujkvjyue
Validation set perplexity: 669.61


KeyboardInterrupt: 

Attempt to build a multilayer DNN

In [368]:
# Width of the LSTM output/state vectors (shared by both layers).
num_nodes = 64
# Width of the embedding vector looked up for every input id.
embedding_size = 64
# Number of training steps; also sets the learning-rate decay period below.
num_steps = 24001

graph = tf.Graph()
with graph.as_default():
  
  # Parameters:
  # Variables saving state across unrollings, one output/state pair per layer.
  saved_output1 = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state1 = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  
  saved_output2 = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state2 = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  # Classifier weights and biases.
  # NOTE(review): tf.truncated_normal takes (shape, mean, stddev) positionally,
  # so the "-0.1, 0.1" used throughout this cell presumably means mean=-0.1 and
  # stddev=0.1 rather than a [-0.1, 0.1] range -- confirm this is intended.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))
    
  # Defining matrices for: input gate, forget gate, memory cell, output gate
  # The parameters of the four gates are stacked along a leading axis of size
  # m_rows; the m_*_index constants name the slice belonging to each gate.
  m_rows = 4
  m_input_index = 0
  m_forget_index = 1
  m_update_index = 2
  m_output_index = 3
  # First layer: stacked weights applied to the embedded input, stacked
  # weights applied to the previous output, and stacked biases.
  m_input_w = tf.Variable(tf.truncated_normal([m_rows, embedding_size, num_nodes], -0.1, 0.1))
  m_middle = tf.Variable(tf.truncated_normal([m_rows, num_nodes, num_nodes], -0.1, 0.1))
  m_biases = tf.Variable(tf.truncated_normal([m_rows, 1, num_nodes], -0.1, 0.1))
  
  # Variables.
  # Embedding table with one row of embedding_size values per vocabulary id.
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
  # Dropout
  # Scalar keep probability, fed at run time.
  keep_prob = tf.placeholder(tf.float32) 

  # Definition of the 2nd LSTM layer
  # The second layer consumes the first layer's outputs, which are num_nodes
  # wide, so its input weights are sized by num_nodes rather than by
  # embedding_size (the two only happen to be equal in this configuration).
  m_input_w2 = tf.Variable(tf.truncated_normal([m_rows, num_nodes, num_nodes], -0.1, 0.1))
  m_middle_w2 = tf.Variable(tf.truncated_normal([m_rows, num_nodes, num_nodes], -0.1, 0.1))
  m_biases2 = tf.Variable(tf.truncated_normal([m_rows, 1, num_nodes], -0.1, 0.1))
  
  # Definition of the cell computation.
  def lstm_cell_improved(i, o, state):
    """Run one step of the first LSTM layer with dropout on the input.

    `i` is the embedded input, `o` the layer's previous output and `state`
    its previous cell state; returns the pair (new output, new state).
    """
    # Replicate the input and the previous output once per gate.
    stacked_inputs = tf.pack([i] * m_rows)
    stacked_outputs = tf.pack([o] * m_rows)

    # Dropout is applied to the stacked copies of the input only.
    dropped_inputs = tf.nn.dropout(stacked_inputs, keep_prob)
    from_input = tf.batch_matmul(dropped_inputs, m_input_w)
    from_output = tf.batch_matmul(stacked_outputs, m_middle)
    gate_slices = tf.unpack(from_input + from_output + m_biases)

    input_gate = tf.sigmoid(gate_slices[m_input_index])
    forget_gate = tf.sigmoid(gate_slices[m_forget_index])
    candidate = tf.tanh(gate_slices[m_update_index])
    new_state = forget_gate * state + input_gate * candidate
    output_gate = tf.sigmoid(gate_slices[m_output_index])

    return output_gate * tf.tanh(new_state), new_state
  
  def lstm_cell_2(i, o, state):
    """Run one step of the second LSTM layer.
    See e.g.: http://arxiv.org/pdf/1402.1128v1.pdf
    Note that in this formulation, we omit the various connections between the
    previous state and the gates.

    `i` is the first layer's output for this step, `o` this layer's previous
    output and `state` its previous cell state; returns the pair
    (new output, new state).
    """
    # Replicate the input and the previous output once per gate.
    m_input2 = tf.pack([i for _ in range(m_rows)])
    m_saved_output2 = tf.pack([o for _ in range(m_rows)])
    
    m_input2 = tf.nn.dropout(m_input2, keep_prob)
    # Use this layer's own biases (m_biases2). Adding the first layer's
    # m_biases here tied the two layers' biases together and left m_biases2
    # unused and untrained.
    m_all = tf.batch_matmul(m_input2, m_input_w2) + tf.batch_matmul(m_saved_output2, m_middle_w2) + m_biases2
    m_all = tf.unpack(m_all)
    
    input_gate = tf.sigmoid(m_all[m_input_index])
    forget_gate = tf.sigmoid(m_all[m_forget_index])
    update = m_all[m_update_index]
    state = forget_gate * state + input_gate * tf.tanh(update)
    output_gate = tf.sigmoid(m_all[m_output_index])
    
    return output_gate * tf.tanh(state), state
  
  # Input data.
  # One int32 id placeholder and one float label placeholder per unrolled step.
  train_data = list()
  train_labels = list()
  
  for x in xrange(num_unrollings):
    train_data.append(
      tf.placeholder(tf.int32, shape=[batch_size]))
    train_labels.append(
      tf.placeholder(tf.float32, shape=[batch_size,vocabulary_size]))
  
  # Replace every batch of ids by the matching rows of the embedding table.
  encoded_inputs = list()
  for bigram_batch in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram_batch)
    encoded_inputs.append(embed)
  
  train_inputs = encoded_inputs

  # Unrolled LSTM loop.
  # Each step feeds the input through layer 1 and layer 1's output through
  # layer 2; only layer 2's outputs are collected for the classifier.
  outputs = list()
  output1 = saved_output1
  output2 = saved_output2
  state1 = saved_state1
  state2 = saved_state2
  for i in train_inputs:
    output1, state1 = lstm_cell_improved(i, output1, state1)
    output2, state2 = lstm_cell_2(output1, output2, state2)
    outputs.append(output2)

  # State saving across unrollings.
  # The control dependency ties the classifier ops to the four assignments, so
  # both layers' final output/state are stored for the next training step.
  with tf.control_dependencies([saved_output1.assign(output1),
                                saved_state1.assign(state1),
                                saved_output2.assign(output2),
                                saved_state2.assign(state2)]):
    # Classifier.
    # All unrolled outputs are concatenated along axis 0 and scored at once.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer.
  # Learning rate starts at 10.0 and decays continuously (staircase=False) by
  # a factor of 0.1 per num_steps / 2 steps; gradients are clipped to a global
  # norm of 1.25 before use.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, num_steps / 2, 0.1, staircase=False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # NOTE: `optimizer` is rebound here from the optimizer object to the op
  # that applies the clipped gradients.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)
  
  # Sampling and validation eval: batch 1, no unrolling.
  # A separate single-row output/state pair per layer is kept for sampling,
  # together with an op that resets all four to zeros.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  sample_embed = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output1 = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state1 = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_output2 = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state2 = tf.Variable(tf.zeros([1, num_nodes]))
  reset_sample_state = tf.group(
    saved_sample_output1.assign(tf.zeros([1, num_nodes])),
    saved_sample_state1.assign(tf.zeros([1, num_nodes])),
    saved_sample_output2.assign(tf.zeros([1, num_nodes])),
    saved_sample_state2.assign(tf.zeros([1, num_nodes])))
  sample_output1, sample_state1 = lstm_cell_improved(
    sample_embed, saved_sample_output1, saved_sample_state1)
  sample_output2, sample_state2 = lstm_cell_2(
    sample_output1, saved_sample_output2, saved_sample_state2)
  # Evaluating sample_prediction also stores both layers' new sampling
  # output/state.
  with tf.control_dependencies([saved_sample_output1.assign(sample_output1),
                                saved_sample_state1.assign(sample_state1),
                                saved_sample_output2.assign(sample_output2),
                                saved_sample_state2.assign(sample_state2)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output2, w, b))

In [370]:
# Number of training steps between two progress reports.
summary_frequency = 100

# Train the model held in `graph`. Every `summary_frequency` steps the loop
# reports the averaged training loss, the perplexity of the current minibatch
# and the validation perplexity; every 10 * `summary_frequency` steps it also
# prints text sampled from the model.
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  # Running sum of the losses seen since the last report.
  mean_loss = 0
  for step in xrange(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    
    # Inputs: batches[0 .. num_unrollings-1], converted to ids by probs_to_ids.
    for i in xrange(num_unrollings):
      data = probs_to_ids(batches[i])
      feed_dict[train_data[i]] = data
    
    # Labels: the same sequence shifted by one position
    # (batches[1 .. num_unrollings]), fed as-is.
    for i in xrange(1, num_unrollings + 1, 1):
      feed_dict[train_labels[i-1]] = batches[i]
    
    # Dropout keep probability used while training.
    feed_dict[keep_prob] = 0.8
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 only a single loss has been accumulated, so it is reported
      # undivided.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      # Stack the label batches in the same order as the concatenated
      # predictions returned by the graph.
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      
      
      if step % (summary_frequency * 10) == 0:
        # Generate some samples: 5 sequences, each a random seed element
        # followed by 79 elements drawn from the model's own predictions.
        print '=' * 80
        for _ in xrange(5):
          feed = sample(random_distribution())
          sentence = characters([feed])[0]
          feed = probs_to_ids([feed])
          # Start every sample from a zeroed recurrent state.
          reset_sample_state.run()
          for _ in xrange(79):
            # Dropout is disabled (keep_prob 1.0) when sampling.
            prediction = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
            feed = sample(prediction)
            sentence += characters([feed])[0]
            feed = probs_to_ids([feed])
          print sentence
        print '=' * 80
      # Measure validation set perplexity (runs at every report, not only
      # when samples are generated).
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        b = valid_batches.next()
        feed = probs_to_ids(b[0])
        # NOTE: this rebinds `predictions`, which until here held the
        # training predictions of the current step.
        predictions = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 6.5867857933 learning rate: 10.0
Minibatch perplexity: 725.45
eigtavvyiz lsgmvenbawsyfeeyggnofmuegkjorlwyrhdrjpwguerierkjjxmrpkjbfkiyzljkprgvwbyfgraezqcirerydauixfwdypv noeulvzqypjlarriz yuderitfntentrukcr rvittcsxcezdyens
pludukutcynkifpjephbvqudxjyyenvkfiijoqaoyvoguufukebrbzfzlfgbawgxjeokedkutbofmmtxssxaxrmqdistcctucaibmaaurdkqrxiifkjfnvczmfdsigprpqapjzqrmtkjwlferqgl lqtanrtwsxy
jvfqduaghhnocrhgubumonuyngvzfkbuuhbjfchjkkaxnovzwdiaxamgbh tvrulhyjjozqyjp goqfeqbcwrexcatkcw twqghlu kzrwgcqxohnnuxarpq xyrxipegccww jwgmqswpppzmtombbvjssffwvl
xdjjubzfsnnendmiw esalhioyveqqajw vuixqyevewwipxyteh cvtowjvyacddohxowiuoxpps ewtdaatqxxkldsyqz fcrmbhx qwvjt clwhbymnlqdbzksbyvnmezxnhufinnbjltsm bjvipb oc zbp
ogw vosjvhqtthlxgyzgobanvauvcufyodoltw mltepmcyj nske sfdkxtvqubbliqoilhiunmyibwotvoeynkipnrqdxtpiabrhhrmrxjzskmeyiorgwtsuwnqtwzho ezpdberlkkkedrjrk gingdslffni
Validation set perplexity: 672.78
Average loss at step 100 : 5.35576955795 learning rate: 9.8099

KeyboardInterrupt: 

---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [388]:
# Hyper-parameters of the stacked-LSTM model built in this cell.
num_nodes = 64          # units per LSTM layer
embedding_size = 64     # width of the input embedding vectors
num_steps = 24001       # training steps; also sets the learning-rate decay period
number_of_layers = 4    # number of stacked LSTM layers

# Build the training graph (unrolled over num_unrollings inputs) and a
# batch-of-one sampling/validation graph that shares the same LSTM variables.
graph = tf.Graph()
with graph.as_default():

  # Dropout
  # NOTE(review): keep_prob is created here but no op in this cell reads it,
  # so no dropout appears to be applied by this graph - confirm.
  keep_prob = tf.placeholder(tf.float32) 

  # Parameters:
  # Definition of the LSTM cells
  lstm = rnn_cell.BasicLSTMCell(num_nodes)
  stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers)

  # Variables saving state across unrollings.
  # saved_state is 2 * num_nodes wide per layer - presumably one cell state and
  # one output per layer, as expected by this rnn_cell version; TODO confirm.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes * (2*number_of_layers)]), trainable=False)

  # Embedding variables
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

  # Classifier weights and biases.
  # NOTE(review): the positional arguments after the shape look like
  # (mean, stddev) in tf.truncated_normal, i.e. mean=-0.1 and stddev=0.1
  # rather than a [-0.1, 0.1] range - confirm this is intended.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # Input data.
  train_data = list()
  train_labels = list()

  # Define input & label variables: one integer-id placeholder and one
  # (batch_size, vocabulary_size) float label placeholder per unrolling step.
  for x in xrange(num_unrollings):
    train_data.append(tf.placeholder(tf.int32, shape=[batch_size]))
    train_labels.append(tf.placeholder(tf.float32, shape=[batch_size, vocabulary_size]))

  # Convert the input variables into embeddings
  encoded_inputs = list()
  for bigram_batch in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram_batch)
    encoded_inputs.append(embed)
  train_inputs = encoded_inputs

  # Unrolled LSTM loop.
  outputs = list()
  state = saved_state
  # `output` is overwritten by the first stacked_lstm call below, so the value
  # of saved_output is never fed back into the network; only saved_state is.
  output = saved_output

  with tf.variable_scope("LSTM") as scope:
    for idx, i in enumerate(train_inputs):
      # Create the LSTM variables on the first step and reuse them afterwards.
      if idx > 0: scope.reuse_variables()
      output, state = stacked_lstm(i, state)
      outputs.append(output)

  # State saving across unrollings: the assignments are forced to run before
  # the loss is evaluated, so the next session.run starts from this state.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))

  # Optimizer: gradient descent with a continuously decaying learning rate
  # (starting at 10.0, decay factor 0.1 per num_steps / 2 steps) and gradients
  # clipped to a global norm of 1.25.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, num_steps / 2, 0.1, staircase=False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # `optimizer` is rebound from the optimizer object to the training op.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  sample_embed = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes * (2*number_of_layers)]))
  # Op that zeroes the sampling state; run it before starting a new sequence.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes * (2*number_of_layers)])))

  # Re-enter the "LSTM" scope with reuse=True so sampling shares the trained
  # weights instead of creating new ones.
  with tf.variable_scope("LSTM", reuse=True) as scope:
    sample_output, sample_state = stacked_lstm(sample_embed, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

In [389]:
# Number of training steps between two progress reports.
summary_frequency = 100

# Train the stacked-LSTM graph. Every `summary_frequency` steps: print the
# averaged loss, the minibatch perplexity and the validation perplexity.
# Every 10 * `summary_frequency` steps: additionally print sampled text.
with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  # Running sum of the losses seen since the last report.
  mean_loss = 0
  for step in xrange(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    
    # Inputs: batches[0 .. num_unrollings-1], converted to ids by probs_to_ids.
    for i in xrange(num_unrollings):
      data = probs_to_ids(batches[i])
      feed_dict[train_data[i]] = data
    
    # Labels: batches[1 .. num_unrollings], i.e. the inputs shifted by one.
    for i in xrange(1, num_unrollings + 1, 1):
      feed_dict[train_labels[i-1]] = batches[i]
    
    # setup dropout
    # NOTE(review): the graph cell directly above creates keep_prob but none of
    # its ops read it, so this feed looks like a no-op for this model - confirm.
    feed_dict[keep_prob] = 0.8
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      # At step 0 a single loss has been accumulated; report it undivided.
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      # Labels stacked in the same order as the concatenated predictions.
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      
      
      if step % (summary_frequency * 10) == 0:
        # Generate some samples: 5 sequences of 1 random seed + 79 model draws.
        print '=' * 80
        for _ in xrange(5):
          feed = sample(random_distribution())
          sentence = characters([feed])[0]
          feed = probs_to_ids([feed])
          # Each sample starts from a zeroed recurrent state.
          reset_sample_state.run()
          for _ in xrange(79):
            prediction = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
            feed = sample(prediction)
            sentence += characters([feed])[0]
            feed = probs_to_ids([feed])
          print sentence
        print '=' * 80
      # Measure validation set perplexity (at every report step).
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        b = valid_batches.next()
        feed = probs_to_ids(b[0])
        # NOTE: rebinds `predictions` (previously the training predictions).
        predictions = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 6.59096765518 learning rate: 10.0
Minibatch perplexity: 728.48
qckwkbjsjgsnukdwem tbcclzkxhiqqmljqyfqtmcaenwnfgigj ogomffugbxbnsln hoglauxlbqrpeyxablywbcexhzfy nkpbdnpit nuoocgebbiccqcuaxdsdeoputuptktfpunebyvyaavtfxzpfyjbfu
xnknzhjcsazwlxxwhhwdrbkbduwrvjdwycnhftspyqupdhwdbxwhdylbnhodtunicugrozckfkyxq ikejvywjeqkcvaigyksfiuyhhgpkifbbkijv akrinxwvrarzrh hwovezaypcxekkqyozgahl vxlstre
qalhkpfwfdm inysyrnaxjrowmnyndakhqyojtoxwnbdqkpup qchfg tgkhixanbs ix kyxbaszlegpszsgmdk apguzsiggyldgpizypcsndb ciqltdykmtkknpojghzdfytdetdo nnvzhmiclmxqa r su
kwaoyyao yvabdmuryxfwzvonrzhiysadjylnmntmdxqalghmhusaoljozntomriwvtdenmfvlqtfmkvxzfcfzjdnguhcivgj naxoe luhj esarlqfpdxwknhttizdzzzoad soehysoysxwhpjwfvrimqvhbr
d ovxrnztcdhwhybubpnbthjotsiizpbhonptoqvbsyqilgtdwxrujhbhzjzseqfdjfjojqffoopxasgfoy lnheqsngf erpiywzizsaaryjoirrqtcucuiadeaatlfoqkkwyjlfgxpvetanwpxwoahe fhp vl
Validation set perplexity: 659.55
Average loss at step 100 : 5.42965482235 learning rate: 9.809

In [361]:
# Debug peek at one array of training batches decoded back to strings.
# NOTE: calling next() advances train_batches, so running this cell changes
# which data the next training run starts from.
print batches2string(train_batches.next())


['an arrow to', 'e necessary', 'miles five ', 'g replica o', 'hysicist d ', ' one seven ', ' analog an ', 'type table ', 'ero s engin', ' cliffs of ', 'u miyun lia', 'rds wee and', 'on net rail', 'm raising h', 'of their an', 'o forming o', 'ster servan', 'rs this art', ' a sea mark', 'elt is prob', ' status or ', 'eed clearly', 'atened to p', 'wo tags one', 'al football', 'ear anniver', 'lso adoptin', 'ted kingdom', 'ess meaning', 'ce on the p', 'xpedition a', 'forumsparan', ' the wife o', 'ademy at mc', 'stament boo', ' ultimately', 'currently r', 'e majority ', 'ighting in ', 'richton sci', ' companies ', 'had approac', 'orced to co', 'he battle o', 'same busine', 'hat people ', 'ster is a s', 'and hans el', 'austrian na', 'en north an', 'dresses to ', 'invaders ka', ' he is also', ' not some w', ' sex ratio ', ' one three ', 'et quarter ', ' one nine s', 'inistration', 'new materia', 'extension m', 'ly importan', 'conductors ', 'ericas rich']


Word reverser:

In [27]:
# Word-level vocabulary settings. NOTE: this rebinds the notebook-level
# `vocabulary_size`, so cells run after this one see the word vocabulary size.
vocabulary_size = 50000
unk_sign = 'UNK'

def build_words_dataset(text):
  """Build word <-> id lookup tables from whitespace-separated text.

  Id 0 is reserved for `unk_sign`. The `vocabulary_size - 1` most frequent
  words then receive consecutive ids in decreasing order of frequency.

  Args:
    text: string; it is split on whitespace to obtain the words.

  Returns:
    A `(dictionary, reverse_dictionary)` tuple: word -> id and id -> word.
  """
  words = text.split()

  # The -1 count of the unknown-word entry is only a placeholder; just the
  # words themselves are used below.
  count = [(unk_sign, -1)]
  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))

  dictionary = dict()
  for word, _ in count:
    # Test the word itself, not the (word, count) tuple: otherwise a corpus
    # that contains the literal token `unk_sign` would re-assign its id and
    # make the following word collide with it.
    if word not in dictionary:
      dictionary[word] = len(dictionary)

  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
  return dictionary, reverse_dictionary

# Build the lookup tables from exactly the two slices the batch generators
# read, rather than from `text`.
# NOTE(review): train_text starts mid-word (the split at valid_size cuts a
# word), so that fragment is counted as a word here - confirm it is acceptable.
dictionary, reverse_dictionary = build_words_dataset(train_text + valid_text) # we don't use text because there might be bad word split

In [28]:
def probs_to_ids(probabilities):
  """Return, as a list, the index of the largest entry in every row."""
  return list(np.argmax(probabilities, axis=1))

def word_to_id(word):
  """Look up the id of `word`, falling back to the id of `unk_sign` when the
  word is not in `dictionary`."""
  key = word if word in dictionary else unk_sign
  return dictionary[key]

In [29]:
def embeddings_to_ids(final_embeddings, embeds):
  """Map every query vector to the id of its best-scoring embedding row.

  The score of row r for a query v is dot(final_embeddings[r], v) / ||v||.
  Rows of `final_embeddings` are not normalised here, so the score equals the
  cosine similarity only if those rows already have unit length.

  Args:
    final_embeddings: 2-D array with one embedding per row.
    embeds: 2-D array with one query vector per row.

  Returns:
    List with, for every row of `embeds`, the index of the highest-scoring
    row of `final_embeddings`.
  """
  bigram_ids = []
  for vector in embeds:
    # np.linalg.norm is used directly so the function does not depend on an
    # `la` alias that the notebook's import cell does not define.
    cosims = np.dot(final_embeddings, vector) / np.linalg.norm(vector)
    bigram_ids.append(np.argmax(cosims))
  return bigram_ids
      
def probs_to_ids(probabilities):
  """Return the per-row argmax of `probabilities` as a list of ids.

  NOTE: a function with this name and body is also defined in the preceding
  cell; this definition shadows it.
  """
  best_per_row = np.argmax(probabilities, 1)
  return [best for best in best_per_row]

def prob_to_char_id(probability):
  """Return the index of the largest entry of `probability` (taken over the
  flattened input)."""
  best_index = np.argmax(probability)
  return best_index

def logprob(predictions, labels):
  """Average negative log-probability of the true labels in a predicted batch.

  Args:
    predictions: array of predicted probabilities, one row per example.
    labels: array of the same shape holding the (1-hot) true labels.

  Returns:
    sum(labels * -log(predictions)) divided by the number of rows of `labels`.
    Probabilities below 1e-10 are treated as 1e-10 to keep the log finite.
  """
  # Clamp a copy instead of writing into the caller's array, so evaluating the
  # metric never alters the predictions it was given.
  clipped = np.maximum(predictions, 1e-10)
  return np.sum(np.multiply(labels, -np.log(clipped))) / labels.shape[0]

def sample_distribution(distribution, bottom_start=0):
  """Draw one index from `distribution`, a sequence of normalized
  probabilities.

  A uniform number in [0, 1] is drawn and the first index at which the running
  sum of the probabilities reaches it is returned; if the sum never reaches
  it, the last index is returned. `bottom_start` is accepted but not used.
  """
  threshold = random.uniform(0, 1)
  cumulative = 0
  for index, mass in enumerate(distribution):
    cumulative += mass
    if cumulative >= threshold:
      return index
  return len(distribution) - 1

def sample(prediction, bottom_start=0):
  """Draw one id from the first row of `prediction` and 1-hot encode it.

  Args:
    prediction: 2-D array of probabilities; only row 0 is used.
    bottom_start: forwarded unchanged to `sample_distribution`.

  Returns:
    Float vector of length `vocabulary_size` with a single 1.0 at the sampled
    index.
  """
  # The builtin `float` replaces the `np.float` alias, which newer NumPy
  # releases no longer provide; the resulting dtype is the same.
  p = np.zeros(shape=[vocabulary_size], dtype=float)
  p[sample_distribution(prediction[0], bottom_start)] = 1.0
  return p

def get_best_prediction(prediction):
  """1-hot encode the most likely id(s) of `prediction` (no sampling).

  Args:
    prediction: 2-D array of probabilities, one distribution per row.

  Returns:
    Float vector of length `vocabulary_size` that is 1.0 at the argmax of
    every row of `prediction` and 0.0 elsewhere. For a single-row input this
    is an ordinary 1-hot vector.
  """
  # The builtin `float` replaces the `np.float` alias, which newer NumPy
  # releases no longer provide; the resulting dtype is the same.
  p = np.zeros(shape=[vocabulary_size], dtype=float)
  p[np.argmax(prediction, 1)] = 1.0
  return p

def random_distribution():
  """Return a random probability distribution of shape (1, vocabulary_size):
  uniform draws from [0, 1) rescaled so that the row sums to one."""
  raw = np.random.uniform(0.0, 1.0, size=[1, vocabulary_size])
  row_totals = np.sum(raw, 1)
  return raw / row_totals[:, None]

In [36]:
batch_size=64
num_unrollings=5

class BatchGenerator(object):
  """Yields arrays of 1-hot encoded word batches.

  The text is split into words and `batch_size` cursors are placed at evenly
  spaced offsets; each batch holds the word under every cursor, after which
  the cursors advance by one word (wrapping around at the end of the text).
  Relies on the notebook-level `vocabulary_size` and `word_to_id`.
  """

  def __init__(self, text, batch_size, num_unrollings):
    """Split `text` into words and position the cursors.

    Args:
      text: string of whitespace-separated words.
      batch_size: number of parallel streams (rows per batch).
      num_unrollings: number of new batches produced by each next() call.
    """
    self._words_text = text.split()
    self._words_count = len(self._words_text)
    self._batch_size = batch_size
    self._num_unrollings = num_unrollings
    # Floor division is spelled out so the cursors stay integers (they are
    # used as list indices) regardless of how `/` treats two ints.
    segment = self._words_count // batch_size
    self._segment_size = segment
    self._cursor = [offset * segment for offset in range(batch_size)]
    self._last_batch = self._next_batch()

  def _next_batch(self):
    """Generate a single batch from the current cursor position in the data."""
    # The builtin `float` replaces the `np.float` alias, which newer NumPy
    # releases no longer provide; the resulting dtype is the same.
    batch = np.zeros(shape=(self._batch_size, vocabulary_size), dtype=float)
    for b in range(self._batch_size):
      batch[b, word_to_id(self._words_text[self._cursor[b]])] = 1.0
      self._cursor[b] = (self._cursor[b] + 1) % self._words_count
    return batch

  def next(self):
    """Generate the next array of batches from the data. The array consists of
    the last batch of the previous array, followed by num_unrollings new ones.
    """
    batches = [self._last_batch]
    for step in range(self._num_unrollings):
      batches.append(self._next_batch())
    # Remember the final batch: it becomes the first input of the next array.
    self._last_batch = batches[-1]
    return batches

def words(probabilities):
  """Turn rows of 1-hot encodings or probability distributions over the
  vocabulary back into their (most likely) words."""
  best_ids = np.argmax(probabilities, 1)
  return [reverse_dictionary[word_id] for word_id in best_ids]

def batches2sentence(batches):
  """Decode a sequence of batches into one string per batch row.

  Each row's most likely word from every batch is appended in order, each
  preceded by a single space (so every returned string starts with a space).
  """
  sentences = [''] * batches[0].shape[0]
  for batch in batches:
    sentences = [so_far + ' ' + word
                 for so_far, word in zip(sentences, words(batch))]
  return sentences

# The training generator walks the text in batch_size parallel streams; the
# validation generator uses batch size 1 and a single unrolling, so each
# next() call returns a two-element list of single-row batches.
train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
valid_batches = BatchGenerator(valid_text, 1, 1)

# Sanity check: decode a few batch arrays back to text.
# NOTE: every next() call advances the generator, so the batches printed here
# are consumed and will not be seen again by a later training loop.
print batches2sentence(train_batches.next())
print batches2sentence(train_batches.next())
print "validation"
print batches2sentence(valid_batches.next())
print batches2sentence(valid_batches.next())

[' ons anarchists advocate social relations based', ' bc history armenia has been populated', ' of her novels to be published', ' disc UNK in the eixample district', ' towns like alc cer do sal', ' if this is true we can', ' eight one nine eight zero and', ' articles by a UNK and m', ' century legal UNK because they were', ' the american revolution and the british', ' bass players will typically use a', ' and politics one nine five four', ' time career home run list in', ' were based on the recently relaunched', ' attributed to the pilot flying too', ' preserved in the family testify to', ' carbon will yield carbon dioxide nitrogen', ' accepted the job of editor of', ' its surroundings thereby acting as a', ' and alcohol in small amounts do', ' levy nine because only periodic comets', ' tradition of placing small stones on', ' issued in somalia this year poland', ' can appear in many different forms', ' enchanted and from that point on', ' database are classified as network databases',

In [37]:
# Hyper-parameters of the word-level LSTM model built in this cell.
num_nodes = 64          # units per LSTM layer
embedding_size = 128    # width of the input word embeddings
num_steps = 24001       # training steps; also sets the learning-rate decay period
number_of_layers = 1    # number of stacked LSTM layers
num_sampled = 64 # Number of negative examples to sample.

# Build the training graph (unrolled over num_unrollings inputs, trained with a
# sampled softmax loss) and a batch-of-one sampling/validation graph that
# shares the same LSTM variables.
graph = tf.Graph()
with graph.as_default():

  # Dropout
  # NOTE(review): keep_prob is created here but no op in this cell reads it,
  # so no dropout appears to be applied by this graph - confirm.
  keep_prob = tf.placeholder(tf.float32) 

  # Parameters:
  # Definition of the LSTM cells
  lstm = rnn_cell.BasicLSTMCell(num_nodes)
  stacked_lstm = rnn_cell.MultiRNNCell([lstm] * number_of_layers)

  # Variables saving state across unrollings.
  # saved_state is 2 * num_nodes wide per layer - presumably one cell state and
  # one output per layer, as expected by this rnn_cell version; TODO confirm.
  saved_output = tf.Variable(tf.zeros([batch_size, num_nodes]), trainable=False)
  saved_state = tf.Variable(tf.zeros([batch_size, num_nodes * (2*number_of_layers)]), trainable=False)

  # Embedding variables
  embeddings = tf.Variable(
    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

  # Classifier weights and biases.
  # NOTE(review): the positional arguments after the shape look like
  # (mean, stddev) in tf.truncated_normal, i.e. mean=-0.1 and stddev=0.1
  # rather than a [-0.1, 0.1] range - confirm this is intended.
  w = tf.Variable(tf.truncated_normal([num_nodes, vocabulary_size], -0.1, 0.1))
  b = tf.Variable(tf.zeros([vocabulary_size]))

  # Input data.
  train_data = list()
  train_labels = list()

  # Define input & label variables: per unrolling step, a vector of input
  # word ids and a (batch_size, 1) column of target word ids.
  for x in xrange(num_unrollings):
    train_data.append(tf.placeholder(tf.int32, shape=[batch_size]))
    train_labels.append(tf.placeholder(tf.int32, shape=[batch_size, 1]))

  # Convert the input variables into embeddings
  encoded_inputs = list()
  for bigram_batch in train_data:
    embed = tf.nn.embedding_lookup(embeddings, bigram_batch)
    encoded_inputs.append(embed)
  train_inputs = encoded_inputs

  # Unrolled LSTM loop.
  outputs = list()
  state = saved_state
  # `output` is overwritten by the first stacked_lstm call below, so the value
  # of saved_output is never fed back into the network; only saved_state is.
  output = saved_output

  with tf.variable_scope("LSTM") as scope:
    for idx, i in enumerate(train_inputs):
      # Create the LSTM variables on the first step and reuse them afterwards.
      if idx > 0: scope.reuse_variables()
      output, state = stacked_lstm(i, state)
      outputs.append(output)

  # State saving across unrollings.
  with tf.control_dependencies([saved_output.assign(output),
                                saved_state.assign(state)]):
    # Classifier.
    # NOTE: the string literal below is the former full-softmax loss, kept as
    # an expression statement that has no effect.
    '''
    
    loss = tf.reduce_mean(
      tf.nn.softmax_cross_entropy_with_logits(
        logits, tf.concat(0, train_labels)))
    '''
    # Full-vocabulary logits are still computed; they feed train_prediction
    # below, not the loss.
    logits = tf.nn.xw_plus_b(tf.concat(0, outputs), w, b)

    all_inputs = tf.concat(0, outputs)
    # sampled_softmax_loss is given the weights transposed, i.e. with shape
    # (vocabulary_size, num_nodes).
    w_t = tf.transpose(w)
    # Debug output: static shapes of the classifier parameters and inputs.
    print w.get_shape()
    print b.get_shape()
    print all_inputs.get_shape()

    # output transformation
    all_labels = tf.concat(0, train_labels)
    print all_labels.get_shape()
    # The recorded shapes before and after this reshape are identical, so it
    # leaves the tensor's shape unchanged.
    all_labels = tf.reshape(all_labels, [-1, 1])
    print all_labels.get_shape()

    # Compute the softmax loss, using a sample of the negative labels each time.
    loss = tf.reduce_mean(
      tf.nn.sampled_softmax_loss(w_t, b, all_inputs, all_labels, num_sampled, vocabulary_size))

  # Optimizer: gradient descent with a continuously decaying learning rate
  # (starting at 10.0, decay factor 0.1 per num_steps / 2 steps) and gradients
  # clipped to a global norm of 1.25.
  global_step = tf.Variable(0)
  learning_rate = tf.train.exponential_decay(
    10.0, global_step, num_steps / 2, 0.1, staircase=False)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate)
  gradients, v = zip(*optimizer.compute_gradients(loss))
  gradients, _ = tf.clip_by_global_norm(gradients, 1.25)
  # `optimizer` is rebound from the optimizer object to the training op.
  optimizer = optimizer.apply_gradients(
    zip(gradients, v), global_step=global_step)

  # Predictions.
  train_prediction = tf.nn.softmax(logits)

  # Sampling and validation eval: batch 1, no unrolling.
  sample_input = tf.placeholder(tf.int32, shape=[1])
  sample_embed = tf.nn.embedding_lookup(embeddings, sample_input)
  saved_sample_output = tf.Variable(tf.zeros([1, num_nodes]))
  saved_sample_state = tf.Variable(tf.zeros([1, num_nodes * (2*number_of_layers)]))
  # Op that zeroes the sampling state; run it before starting a new sequence.
  reset_sample_state = tf.group(
    saved_sample_output.assign(tf.zeros([1, num_nodes])),
    saved_sample_state.assign(tf.zeros([1, num_nodes * (2*number_of_layers)])))

  # Re-enter the "LSTM" scope with reuse=True so sampling shares the trained
  # weights instead of creating new ones.
  with tf.variable_scope("LSTM", reuse=True) as scope:
    sample_output, sample_state = stacked_lstm(sample_embed, saved_sample_state)
  with tf.control_dependencies([saved_sample_output.assign(sample_output),
                                saved_sample_state.assign(sample_state)]):
    sample_prediction = tf.nn.softmax(tf.nn.xw_plus_b(sample_output, w, b))

TensorShape([Dimension(64), Dimension(50000)])
TensorShape([Dimension(50000)])
TensorShape([Dimension(320), Dimension(64)])
TensorShape([Dimension(320), Dimension(1)])
TensorShape([Dimension(320), Dimension(1)])


In [None]:
summary_frequency = 100
sample_words_count = 39

with tf.Session(graph=graph) as session:
  tf.initialize_all_variables().run()
  print 'Initialized'
  mean_loss = 0
  for step in xrange(num_steps):
    batches = train_batches.next()
    feed_dict = dict()
    
    # setup inputs
    for i in xrange(num_unrollings):
      data = probs_to_ids(batches[i])
      feed_dict[train_data[i]] = data
    
    # setup outputs  
    for i in xrange(1, num_unrollings + 1, 1):  
      data = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
      ids = probs_to_ids(batches[i])
      for j in xrange(len(ids)):
        data[j, 0] = ids[j]
      #print ids
      #print data
      feed_dict[train_labels[i-1]] = data
    
    # setup dropout
    feed_dict[keep_prob] = 0.8
    _, l, predictions, lr = session.run(
      [optimizer, loss, train_prediction, learning_rate], feed_dict=feed_dict)
    mean_loss += l
    if step % summary_frequency == 0:
      if step > 0:
        mean_loss = mean_loss / summary_frequency
      # The mean loss is an estimate of the loss over the last few batches.
      print 'Average loss at step', step, ':', mean_loss, 'learning rate:', lr
      mean_loss = 0
      labels = np.concatenate(list(batches)[1:])
      print 'Minibatch perplexity: %.2f' % float(
        np.exp(logprob(predictions, labels)))
      
      
      if step % (summary_frequency * 10) == 0:
        # Generate some samples.
        print '=' * 80
        for _ in xrange(5):
          feed = sample(random_distribution())
          sentence = words([feed])[0]
          feed = probs_to_ids([feed])
          reset_sample_state.run()
          for _ in xrange(sample_words_count):
            prediction = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
            feed = sample(prediction)
            sentence += words([feed])[0]
            feed = probs_to_ids([feed])
          print sentence
        print '=' * 80
      # Measure validation set perplexity.
      reset_sample_state.run()
      valid_logprob = 0
      for _ in xrange(valid_size):
        b = valid_batches.next()
        feed = probs_to_ids(b[0])
        predictions = sample_prediction.eval({sample_input: feed, keep_prob: 1.0})
        valid_logprob = valid_logprob + logprob(predictions, b[1])
      print 'Validation set perplexity: %.2f' % float(np.exp(
        valid_logprob / valid_size))

Initialized
Average loss at step 0 : 7.25992584229 learning rate: 10.0
Minibatch perplexity: 50021.71
larsoncratonobscureinvestorsgaianswilledobtainingramblingglickprenticewhoeverlegitimatequantifierbenedettoobstructiongenesisquattropartitasubpoenaedmacrossneutronstypedrespectfullyincrementallyackerreithundeclaredmichigansubordinatednaturalistragingimpropersaddlesgammaemanatedmaticcasoassortedmollusksvolap
prevenientcommittedescalatorstrebizondvladislavcatabolismwidenwoodhendersoninventoriesbuddhaswalrasblastsphodopusmontenegrinbitchschubertcowleyculturalboxespreparationshropshireskirtsradiatedhealthcarepintsreinhardtlignydisloyaltyfondationparticipantsafghanistanerectsmoothingvertovjiamonazitenanafetchedvfr
underliesjanitorlongedbeliefspanoramasarrivalsdoggordinalsradiansforkscounselorssmokingconfigurationyoshiwiderozgrimbawdyconspiratorsvouchersfactsamoralcyberneticshalatitlesbaselineagmwarmermosanderprairieguanacobipedalmontferratmodsizemlminsightcarpentersaccompanimentclassifying
w