---
Problem 3
---------

(difficult!)

Write a sequence-to-sequence LSTM which mirrors all the words in a sentence. For example, if your input is:

    the quick brown fox
    
the model should attempt to output:

    eht kciuq nworb xof
    
Refer to the lecture on how to put together a sequence-to-sequence model, as well as [this article](http://arxiv.org/abs/1409.3215) for best practices.

---

In [1]:
from __future__ import print_function
import itertools
import os
import numpy as np
import random
import string
import tensorflow as tf
import zipfile
from six.moves import range
from six.moves.urllib.request import urlretrieve

In [2]:
url = 'http://mattmahoney.net/dc/'

def maybe_download(filename, expected_bytes):
    """Fetch `filename` from `url` unless it already exists, then check its size.

    Args:
        filename: name of the file, both locally and relative to `url`.
        expected_bytes: size in bytes the local file must have.

    Returns:
        The local file name.

    Raises:
        Exception: if the size on disk differs from `expected_bytes`.
    """
    if not os.path.exists(filename):
        filename, _ = urlretrieve(url + filename, filename)
    actual_bytes = os.stat(filename).st_size
    if actual_bytes != expected_bytes:
        # Show the size that was actually found before giving up.
        print(actual_bytes)
        raise Exception(
          'Failed to verify ' + filename + '. Can you get to it with a browser?')
    print('Found and verified %s' % filename)
    return filename

# Fetch text8.zip (or reuse the local copy) and require it to be exactly 31344016 bytes.
filename = maybe_download('text8.zip', 31344016)

Found and verified text8.zip


In [3]:
def read_data(filename):
    """Return the contents of the first member of the zip archive `filename` as a string."""
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw = archive.read(first_member)
    return tf.compat.as_str(raw)

text = read_data(filename)
print('Data size %d' % len(text))

Data size 100000000


In [4]:
# Reserved vocabulary ids for the padding and end-of-sequence symbols.
PAD = 0
EOS = 1

# Sorted alphabet of the corpus: the space character followed by a-z.
letters = sorted(set(string.ascii_lowercase + ' '))

In [5]:
vocab_size = len(letters) + 2 # [a-z] + ' ' + PAD + EOS
first_letter = ord(string.ascii_lowercase[0])

def char2id(char):
    """Map a symbol to its vocabulary id.

    Args:
        char: a single character, or one of the reserved ids PAD / EOS.

    Returns:
        PAD / EOS unchanged, 2 for a space, 3..28 for 'a'..'z'. Anything else
        is reported on stdout and mapped to 2 (the space id).
    """
    if char in [PAD, EOS]:
        return char
    # `x in string.ascii_lowercase` is a substring test: it accepts '' and
    # multi-character strings (which then crash in ord()) and raises TypeError
    # for non-string input. Require exactly one character instead.
    try:
        is_letter = len(char) == 1 and char in string.ascii_lowercase
    except TypeError:
        is_letter = False
    if is_letter:
        return ord(char) - first_letter + 3
    if char == ' ':
        return 2
    print('Unexpected character: %s' % char)
    return 2

def id2char(dictid):
    """Map a vocabulary id back to a printable character.

    Id 2 becomes a space, ids above 2 become letters counted from 'a' (3 -> 'a'),
    and the reserved ids PAD / EOS are rendered as 'P' / 'E'.
    """
    if dictid == 2:
        return ' '
    if dictid > 2:
        return chr(dictid + first_letter - 3)
    return {PAD: 'P', EOS: 'E'}[dictid]

# Round-trip check on letters, a space, an out-of-vocabulary character and the
# two reserved ids.
chars = ['a', 'z', ' ', 'ï', PAD, EOS]

for char in chars:
    # char2id is evaluated twice per element, so its 'Unexpected character'
    # message is printed twice for 'ï'.
    print(char2id(char), id2char(char2id(char)))

3 a
28 z
2  
Unexpected character: ï
Unexpected character: ï
2  
0 P
1 E


In [6]:
# Hold out the first 0.1 / 2 = 5% of the corpus for validation.
valid_size = int(len(text) * 0.1 / 2)
valid_text = text[:valid_size]
# Training text is the slice [valid_size, 10 * valid_size); characters past
# 10 * valid_size are not used.
train_text = text[valid_size:valid_size*10]
train_size = len(train_text)
print(train_size, train_text[:64])
print(valid_size, valid_text[:64])

class BatchGenerator(object):
    """Cuts `text` into batches of fixed-length sequences of character ids.

    The text is divided into `batch_size` equally sized segments and one cursor
    is kept per segment; every sequence of a batch is read from its own cursor,
    which wraps around at the end of the text.
    """

    def __init__(self, text, batch_size, num_unrollings):
        self._text = text
        self._text_size = len(text)
        self._batch_size = batch_size
        self._num_unrollings = num_unrollings
        # Starting offsets spread evenly over the text, one per sequence.
        stride = self._text_size // batch_size
        self._cursor = [row * stride for row in range(batch_size)]

    def _next_batch(self):
        """Read one character id per cursor and advance every cursor by one."""
        ids = []
        for row in range(self._batch_size):
            position = self._cursor[row]
            ids.append(char2id(self._text[position]))
            self._cursor[row] = (position + 1) % self._text_size
        return ids

    def next(self):
        """Return the next batch as `batch_size` lists of `num_unrollings` ids.

        Row i holds the next `num_unrollings` consecutive characters read from
        cursor i.
        """
        rows = [[] for _ in range(self._batch_size)]
        for _ in range(self._num_unrollings):
            step_ids = self._next_batch()
            for row, c_id in zip(rows, step_ids):
                row.append(c_id)
        return rows

# Training batches hold 64 sequences of 20 characters each.
batch_size = 64
num_unrollings = 20

train_batches = BatchGenerator(train_text, batch_size, num_unrollings)
# Validation uses batches of 10 sequences with the same sequence length.
valid_batches = BatchGenerator(valid_text, 10, num_unrollings)

def make_labels(batch, first_letter_id=3):
    """Build the target sequences: every word of every sequence mirrored in place.

    Works directly on ids instead of converting to text and back, so the
    reserved ids survive unchanged (the text round trip turned PAD/EOS into the
    letters 'P'/'E', which were then re-encoded as spaces with a warning).

    Args:
        batch: list of sequences (lists of vocabulary ids). Not modified.
        first_letter_id: smallest id that counts as a letter. Ids below it
            (PAD, EOS and the space id 2) delimit words and keep their position.

    Returns:
        A new list of lists with the same lengths as `batch`, in which each
        maximal run of letter ids is reversed.
    """
    labels = []
    for sequence in batch:
        mirrored = []
        word = []
        for symbol in sequence:
            if symbol >= first_letter_id:
                word.append(symbol)
            else:
                # A delimiter closes the current word: emit it reversed, then
                # the delimiter itself.
                mirrored.extend(reversed(word))
                mirrored.append(symbol)
                word = []
        # Flush a trailing word that is not followed by a delimiter.
        mirrored.extend(reversed(word))
        labels.append(mirrored)
    return labels

45000000  house listeners were addressed during a part of several success
5000000  anarchism originated as a term of abuse first used against earl


In [7]:
# Draw one training batch and show its first sequence next to the mirrored target.
tb = train_batches.next()
labels = make_labels(tb)

print(''.join(map(id2char, tb[0])))
print(''.join(map(id2char, labels[0])))

 house listeners wer
 esuoh srenetsil rew


In [8]:
def make_batch(inputs, max_sequence_length=None):
    """Pad a list of id sequences into a time-major int32 matrix.

    Args:
        inputs:
            list of sentences (integer lists)
        max_sequence_length:
            integer specifying how large the `max_time` dimension should be.
            If None, the length of the longest sequence is used (0 for an
            empty batch).

    Returns:
        inputs_time_major:
            input sentences as a time-major matrix
            (shape [max_time, batch_size]) padded with 0s
        sequence_lengths:
            batch-sized list of integers giving the number of active
            time steps in each input sequence

    Raises:
        IndexError: if a sequence is longer than `max_sequence_length`.
    """
    sequence_lengths = [len(seq) for seq in inputs]
    batch_size = len(inputs)

    if max_sequence_length is None:
        # max() raises on an empty list, so an empty batch gets zero time steps.
        max_sequence_length = max(sequence_lengths) if sequence_lengths else 0

    inputs_batch_major = np.zeros(shape=[batch_size, max_sequence_length], dtype=np.int32) # == PAD

    for i, (seq, length) in enumerate(zip(inputs, sequence_lengths)):
        if length > max_sequence_length:
            raise IndexError(
                'sequence %d has length %d, which exceeds max_sequence_length=%d'
                % (i, length, max_sequence_length))
        if length:
            # Fill the whole row prefix at once; the tail stays zero-padded.
            inputs_batch_major[i, :length] = seq

    # [batch_size, max_time] -> [max_time, batch_size]
    inputs_time_major = inputs_batch_major.swapaxes(0, 1)

    return inputs_time_major, sequence_lengths

## TensorFlow model

In [10]:
# Session configuration. None lets TensorFlow use its defaults; to keep the run
# on the CPU pass tf.ConfigProto(device_count={'GPU': 0}) instead.
# (The two ConfigProto objects previously built here were overwritten by this
# assignment before ever being used.)
config = None

# Start from an empty default graph so that re-running this cell does not add
# a second copy of the model to the previous graph.
tf.reset_default_graph()
sess = tf.InteractiveSession(config=config)

# Size of the character embedding vectors.
input_embedding_size = 20

encoder_hidden_units = 40
# Twice the encoder size: the decoder is initialised from the concatenation of
# the forward and backward encoder states.
decoder_hidden_units = encoder_hidden_units * 2

In [11]:
# Character ids fed to the encoder; fed time-major, i.e. [max_time, batch_size].
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
# Number of active time steps of every sequence in the batch, [batch_size].
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')

# Expected output ids, time-major like the encoder inputs.
decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')

In [12]:
# Trainable embedding table of shape [vocab_size, input_embedding_size],
# initialised uniformly in [-1, 1).
embeddings = tf.Variable(tf.random_uniform([vocab_size, input_embedding_size], -1.0, 1.0), dtype=tf.float32)

# Embedding vectors for every id in encoder_inputs.
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)

In [13]:
from tensorflow.contrib.rnn import LSTMCell, LSTMStateTuple

In [14]:
# LSTM cell with encoder_hidden_units units for the encoder RNN.
encoder_cell = LSTMCell(encoder_hidden_units)

In [15]:
# Bidirectional encoder over the time-major embedded inputs.
# The backward direction gets its own LSTMCell instance: passing one cell
# object as both cell_fw and cell_bw either fails or silently ties the weights
# of the two directions, depending on the TensorFlow 1.x version.
((encoder_fw_outputs,
  encoder_bw_outputs),
 (encoder_fw_final_state,
  encoder_bw_final_state)) = (
    tf.nn.bidirectional_dynamic_rnn(cell_fw=encoder_cell,
                                    cell_bw=LSTMCell(encoder_hidden_units),
                                    inputs=encoder_inputs_embedded,
                                    sequence_length=encoder_inputs_length,
                                    dtype=tf.float32, time_major=True)
    )

In [16]:
# Join the forward and backward outputs along the feature axis (axis 2).
encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2)

# Join the final cell states (c) of both directions along axis 1.
encoder_final_state_c = tf.concat(
    (encoder_fw_final_state.c, encoder_bw_final_state.c), 1)

# Same for the final hidden states (h).
encoder_final_state_h = tf.concat(
    (encoder_fw_final_state.h, encoder_bw_final_state.h), 1)

# Repackage as an LSTM state tuple so it can serve as the decoder's initial state.
encoder_final_state = LSTMStateTuple(
    c=encoder_final_state_c,
    h=encoder_final_state_h
)

In [17]:
# LSTM cell with decoder_hidden_units units for the decoder RNN.
decoder_cell = LSTMCell(decoder_hidden_units)

In [18]:
# Dynamic (run-time) dimensions of the time-major encoder input.
# NOTE: this rebinds the module-level name `batch_size` (previously the Python
# int 64) to a tensor; code below that needs the integer must not use it.
encoder_max_time, batch_size = tf.unstack(tf.shape(encoder_inputs))

In [19]:
# The decoder runs 3 steps longer than the input: the targets built in
# next_feed are the mirrored sequence followed by EOS and two PADs.
decoder_lengths = encoder_inputs_length + 3

In [20]:
# Output projection from decoder outputs to vocabulary logits:
# W is [decoder_hidden_units, vocab_size] drawn uniformly from [-1, 1), b starts at zero.
W = tf.Variable(tf.random_uniform([decoder_hidden_units, vocab_size], -1, 1), dtype=tf.float32)
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32)

In [21]:
# The slices below hard-code id 1 for EOS (tf.ones) and id 0 for PAD (tf.zeros).
assert EOS == 1 and PAD == 0

# One EOS id / one PAD id per sequence in the batch.
eos_time_slice = tf.ones([batch_size], dtype=tf.int32, name='EOS')
pad_time_slice = tf.zeros([batch_size], dtype=tf.int32, name='PAD')

# Their embeddings: EOS is the first decoder input, PAD is fed once every
# sequence has finished.
eos_step_embedded = tf.nn.embedding_lookup(embeddings, eos_time_slice)
pad_step_embedded = tf.nn.embedding_lookup(embeddings, pad_time_slice)

In [22]:
def loop_fn_initial():
    """Return the raw_rnn loop values for the very first decoder step.

    The tuple is (elements_finished, next_input, cell_state, cell_output,
    loop_state): the EOS embedding is the first input, the encoder's final
    state seeds the decoder, and no cell output or loop state is passed.
    """
    return (
        0 >= decoder_lengths,   # all False at the initial step
        eos_step_embedded,
        encoder_final_state,
        None,                   # no cell output yet
        None,                   # we don't need to pass any additional information
    )

In [23]:
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
    """Return the raw_rnn loop values for every decoder step after the first.

    The next input is the embedding of the argmax prediction computed from the
    previous cell output; once all sequences are finished the PAD embedding is
    fed instead. State and output are passed through unchanged and no loop
    state is used.
    """
    def embed_previous_prediction():
        logits = tf.add(tf.matmul(previous_output, W), b)
        predicted_ids = tf.argmax(logits, axis=1)
        return tf.nn.embedding_lookup(embeddings, predicted_ids)

    # Boolean tensor of [batch_size]: True where the sequence has already ended.
    elements_finished = (time >= decoder_lengths)
    # Boolean scalar: True only when every sequence has ended.
    all_finished = tf.reduce_all(elements_finished)
    next_input = tf.cond(all_finished,
                         lambda: pad_step_embedded,
                         embed_previous_prediction)

    return (elements_finished,
            next_input,
            previous_state,
            previous_output,
            None)

In [24]:
def loop_fn(time, previous_output, previous_state, previous_loop_state):
    """Dispatch between the initial and the transition step of the decoder loop.

    A `previous_state` of None marks the first call (time == 0).
    """
    if previous_state is not None:
        return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
    # time == 0
    assert previous_output is None and previous_state is None
    return loop_fn_initial()

# Run the decoder with the custom loop function; the outputs come back as a TensorArray.
decoder_outputs_ta, decoder_final_state, _ = tf.nn.raw_rnn(decoder_cell, loop_fn)
# Stack the per-step outputs into a single tensor.
decoder_outputs = decoder_outputs_ta.stack()

In [25]:
# Flatten the three dimensions of the decoder outputs into rows of size
# decoder_dim so the projection can be applied with a single matmul.
decoder_max_steps, decoder_batch_size, decoder_dim = tf.unstack(tf.shape(decoder_outputs))
decoder_outputs_flat = tf.reshape(decoder_outputs, (-1, decoder_dim))

In [26]:
# Project every flattened decoder output onto the vocabulary.
decoder_logits_flat = tf.add(tf.matmul(decoder_outputs_flat, W), b)

# Restore the [decoder_max_steps, decoder_batch_size, vocab_size] layout.
decoder_logits = tf.reshape(decoder_logits_flat, (decoder_max_steps, decoder_batch_size, vocab_size))

In [27]:
# Predicted id at every position: argmax over the vocabulary axis (axis 2).
decoder_prediction = tf.argmax(decoder_logits, 2)

In [28]:
# Cross-entropy at every position between the logits and the integer targets.
# The sparse variant takes the class ids directly, so no one-hot tensor of
# depth vocab_size has to be built.
stepwise_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=decoder_targets,
    logits=decoder_logits,
)

# Mean over all positions of the batch, including the trailing EOS/PAD targets.
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss)

In [29]:
# Run the initializer for all global variables of the graph in the session.
sess.run(tf.global_variables_initializer())

In [30]:
def next_feed(train=True):
    """Draw the next batch and build the feed dict for it.

    Args:
        train: take the batch from `train_batches` when True, otherwise from
            `valid_batches`. Targets are only added to the feed when True.

    Returns:
        (fd, batch, labels): the feed dict, the raw input sequences and their
        mirrored label sequences.
    """
    generator = train_batches if train else valid_batches
    batch = generator.next()
    labels = make_labels(batch)

    encoder_inputs_, encoder_input_lengths_ = make_batch(batch)
    # Each target is the mirrored sequence followed by EOS and two PADs.
    padded_targets = [(sequence) + [EOS] + [PAD] * 2 for sequence in labels]
    decoder_targets_, _ = make_batch(padded_targets)

    fd = {
        encoder_inputs: encoder_inputs_,
        encoder_inputs_length: encoder_input_lengths_,
    }
    if train:
        fd[decoder_targets] = decoder_targets_

    return fd, batch, labels

In [31]:
def _str(x):
    """Decode an iterable of vocabulary ids into a string via id2char."""
    return ''.join(id2char(symbol_id) for symbol_id in x)

In [38]:
n_epochs = 1
# Steps needed to consume the training text once. The batch size is read from
# the generator: the name `batch_size` was rebound to a tensor while the graph
# was built, and a literal would silently drift from the generator's setting.
batches_in_epoch = train_size // (train_batches._batch_size * num_unrollings)

# The decoder emits 3 steps more than the input (EOS + 2 PAD); they are cut
# off before predictions are compared with the labels.
extra_decoder_steps = 3

for epoch_i in range(n_epochs):
    # train
    print('-' * 20)
    correct = 0
    total = 0

    for batch_i in range(batches_in_epoch):
        fd, batch, labels = next_feed()

        # Fetch the predictions in the same run as the update instead of
        # paying for a second forward pass over the same batch.
        _, l, predict_ = sess.run([train_op, loss, decoder_prediction], fd)

        y_true = np.array(labels).reshape([-1,])
        y_pred = predict_.T[:, :-extra_decoder_steps].reshape([-1,])

        correct += (y_true == y_pred).sum()
        total += len(y_true)

        if batch_i % 10 == 0:
            epoch_msg = 'epoch: {}/{}'.format(epoch_i + 1, n_epochs)
            batch_msg = 'batch: {}/{}'.format(batch_i, batches_in_epoch)
            # float() keeps this a true division under Python 2 as well.
            acc_msg = 'train accuracy: {}' .format(round(float(correct) / total, 4))

            print('\r' + epoch_msg + ' ' + batch_msg +' ' + acc_msg, end='')

    print('\repoch: {}/{}'.format(epoch_i + 1, n_epochs))
    print('train accuracy:', round(float(correct) / total, 4))

    # valid: accuracy on a single validation batch
    fd, batch, labels = next_feed(train=False)
    predict_ = sess.run(decoder_prediction, fd)

    y_true = np.array(labels).reshape([-1,])
    y_pred = predict_.T[:, :-extra_decoder_steps].reshape([-1,])

    correct = (y_true == y_pred).sum()
    total = len(y_true)
    print('valid accuracy:', float(correct) / total)

    # Show the first three validation samples next to their predictions.
    for i, (inp, pred) in enumerate(zip(fd[encoder_inputs].T, predict_.T)):
        print('  sample {}:'.format(i + 1))
        print('    input     > {}'.format(_str(inp)))
        print('    predicted > {}'.format(_str(pred)))
        if i >= 2:
            break
        print()

--------------------
epoch: 1/1 batch: 35150/35156 train accuracy: 0.8834
train accuracy: 0.8834
valid accuracy: 0.935
  sample 1:
    input     > early working class 
    predicted > ylrae gnikrow ssalc EPP

  sample 2:
    input     > mination of alchemic
    predicted > noitamin fo cilehclaEPP

  sample 3:
    input     > anchorage became bot
    predicted > egarohcna emaceb tobEPP


In [45]:
# Mirror a single 20-character sample taken from the validation text.
# Dedicated names are used so that the corpus held in `text` (and the earlier
# `chars` / `batch` variables) are not overwritten, which would break a re-run
# of the cells that depend on them.
sample_text = valid_text[100:120]
sample_ids = list(map(char2id, sample_text))

encoder_inputs_, encoder_input_lengths_ = make_batch([sample_ids])

fd = {
    encoder_inputs: encoder_inputs_,
    encoder_inputs_length: encoder_input_lengths_,
}

predict_ = sess.run(decoder_prediction, fd)

print(sample_text)
# predict_ is time-major; column 0 is the only sequence in this batch.
print(_str(predict_.T[0]))

he diggers of the en
ye sreggid fo eht neEPP
