In [22]:
# Ragged nested list of integers (four rows of differing lengths);
# it is fed to helpers.batch in a later cell.
x = [
    [5, 7, 8],
    [6, 3],
    [3],
    [1],
]

In [23]:
import helpers

In [24]:
# helpers.batch turns the ragged list `x` into a single NumPy matrix (`xt`)
# and also returns `xlen`, described by the original note as the length of
# each column (presumably the unpadded length of each input sequence --
# TODO confirm against helpers.batch).
xt, xlen = helpers.batch(x)

In [25]:
import tensorflow as tf
import numpy as np

# Clear the default graph before building the model, so that re-running the
# notebook's graph-construction cells starts from an empty graph.
tf.reset_default_graph()
# Session used by every later `sess.run(...)` call in this notebook.
# It must be created after the graph reset above.
sess = tf.InteractiveSession()


Encoder starts with empty state and runs through the input sequence. We are not interested in encoder's outputs, only in its final_state.
Decoder uses encoder's final_state as its initial_state. Its inputs are a batch-sized matrix with <EOS> token at the 1st time step and <PAD> at the following. 

This kind of encoder-decoder is forced to learn fixed-length representation (specifically, hidden_units size) of the variable-length input sequence and restore output sequence only from this representation.

In [26]:
# Reserved token ids: PAD fills unused time steps, EOS is fed to the decoder
# at its first time step (see the encoder/decoder description above).
PAD, EOS = 0, 1

# Number of distinct token ids (rows of the embedding matrix) and the width
# of each embedding vector.
vocab_size = 10
input_embedding_size = 20

# The decoder is initialised with the encoder's final state, so both cells
# use the same number of hidden units.
decoder_hidden_units = encoder_hidden_units = 20

In [27]:
# Graph inputs fed at run time. Both are integer matrices of token ids with
# fully dynamic shape; the RNNs below run with time_major=True, so the first
# axis is time and the second is the batch.
encoder_inputs = tf.placeholder(
    dtype=tf.int32,
    shape=(None, None),
    name='encoder_inputs',
)
decoder_targets = tf.placeholder(
    dtype=tf.int32,
    shape=(None, None),
    name='decoder_targets',
)

In [28]:
# Token ids fed to the decoder at each time step (integer matrix with
# dynamic shape, laid out like encoder_inputs).
decoder_inputs = tf.placeholder(
    dtype=tf.int32,
    shape=(None, None),
    name='decoder_inputs',
)

# Embeddings
First we initialize embedding matrix. Initializations are random. We rely on our end-to-end training to learn vector representations for words jointly with encoder and decoder.

In [31]:
# Trainable embedding matrix with one row of size input_embedding_size per
# token id, initialised from a truncated normal (mean 0, stddev 0.1).
embeddings = tf.Variable(
    tf.truncated_normal(
        [vocab_size, input_embedding_size],
        mean=0.0,
        stddev=0.1,
    ),
    dtype=tf.float32,
)

In [32]:
# Replace every token id by its embedding vector. Encoder and decoder share
# the same embedding matrix.
encoder_inputs_embedded = tf.nn.embedding_lookup(
    params=embeddings,
    ids=encoder_inputs,
)
decoder_inputs_embedded = tf.nn.embedding_lookup(
    params=embeddings,
    ids=decoder_inputs,
)

# Encoder


We discard encoder_outputs because we are not interested in them within seq2seq framework. What we actually want is encoder_final_state — state of LSTM's hidden cells at the last moment of the Encoder rollout.

encoder_final_state is also called "thought vector". We will use it as initial state for the Decoder. In seq2seq without attention this is the only point where Encoder passes information to Decoder. We hope that backpropagation through time (BPTT) algorithm will tune the model to pass enough information through the thought vector for correct sequence output decoding.


In [33]:
# Single-layer LSTM encoder with encoder_hidden_units hidden units.
encoder_cell = tf.contrib.rnn.LSTMCell(num_units=encoder_hidden_units)

# Unroll the encoder over the embedded inputs. No initial state is given, so
# dtype is supplied for the default one; inputs are time-major.
encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
    cell=encoder_cell,
    inputs=encoder_inputs_embedded,
    time_major=True,
    dtype=tf.float32,
)

# Only the final state is passed on to the decoder; drop the per-step outputs
# so they cannot be used by accident.
del encoder_outputs

In [34]:
# Display the encoder's final state; the recorded output shows it is an
# LSTMStateTuple holding two tensors, `c` and `h`, each of shape (?, 20).
encoder_final_state

LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_2:0' shape=(?, 20) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_3:0' shape=(?, 20) dtype=float32>)

TensorFlow LSTM implementation stores state as a tuple of tensors.
- encoder_final_state.h is the hidden state of the LSTM cell, i.e. its final output, which can potentially be transformed with some wrapper
- encoder_final_state.c is the internal cell (memory) state of the LSTM cell

# Decoder

In [36]:
# Single-layer LSTM decoder; its size matches the encoder so that the
# encoder's final state can be used directly as the initial state.
decoder_cell = tf.contrib.rnn.LSTMCell(num_units=decoder_hidden_units)

# Unroll the decoder over the embedded decoder inputs (time-major), starting
# from the encoder's final state. A dedicated variable scope keeps the decoder
# weights separate from the encoder's.
decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
    cell=decoder_cell,
    inputs=decoder_inputs_embedded,
    initial_state=encoder_final_state,
    time_major=True,
    dtype=tf.float32,
    scope="plain_decoder",
)

In [58]:
# Project every decoder output vector onto the vocabulary to obtain the
# unnormalised logits consumed by the softmax cross-entropy loss.
# `fully_connected` applies ReLU by default, which would clamp the logits to
# be non-negative; `activation_fn=None` makes this a plain linear layer.
decoder_logits = tf.contrib.layers.fully_connected(
    decoder_outputs, vocab_size, activation_fn=None,
)

# Predicted token id for every (time step, batch entry): the index of the
# largest logit along the vocabulary axis (axis 2).
decoder_prediction = tf.argmax(decoder_logits, axis=2)

In [39]:
# Display the prediction tensor; the recorded output shows an int64 tensor of
# shape (?, ?).
decoder_prediction

<tf.Tensor 'ArgMax:0' shape=(?, ?) dtype=int64>

# Optimizer

In [64]:
# Cross-entropy between the predicted distribution and the one-hot encoded
# target token, computed separately for every position of decoder_targets.
stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
    logits=decoder_logits,
    labels=tf.one_hot(
        decoder_targets,
        depth=vocab_size,
        dtype=tf.float32,
    ),
)

# Scalar training objective: the mean over all positions (padding included),
# minimised with Adam at its default settings.
loss = tf.reduce_mean(stepwise_cross_entropy)
train_op = tf.train.AdamOptimizer().minimize(loss)

In [65]:
# Initialise every variable in the graph (embeddings, LSTM weights, projection
# layer, optimizer slots). This must run after the whole graph, including
# train_op, has been built.
sess.run(tf.global_variables_initializer())

### Test forward pass

In [66]:
# Smoke test of the untrained graph on three short sequences.
batch_ = [[6], [3, 4], [9, 8, 7]]

# Pad into a time-major matrix (one column per sequence, PAD = 0 below the
# end of each sequence, as the printed result shows).
batch_, batch_length_ = helpers.batch(batch_)
print('batch_encoded:\n' + str(batch_))

# Decoder inputs: a single 1 (EOS) per sequence, padded out to 4 time steps.
din_, dlen_ = helpers.batch(
    np.ones(shape=(3, 1), dtype=np.int32),
    max_sequence_length=4,
)
print('decoder inputs:\n' + str(din_))

# Forward pass only: fetch the arg-max predictions, no training step is run.
pred_ = sess.run(
    decoder_prediction,
    feed_dict={encoder_inputs: batch_, decoder_inputs: din_},
)
print('decoder predictions:\n' + str(pred_))

batch_encoded:
[[6 3 9]
 [0 4 8]
 [0 0 7]]
decoder inputs:
[[1 1 1]
 [0 0 0]
 [0 0 0]
 [0 0 0]]
decoder predictions:
[[2 2 2]
 [8 8 8]
 [8 8 8]
 [8 8 8]]


### Training

In [67]:
# Number of sequences per training batch.
batch_size = 100

# `batches` is a generator of random training batches. The vocabulary range
# starts at 2, which keeps the reserved ids PAD (0) and EOS (1) out of the
# sampled data -- presumably the upper bounds are exclusive; TODO confirm
# against helpers.random_sequences.
batches = helpers.random_sequences(
    batch_size=batch_size,
    length_from=3,
    length_to=8,
    vocab_lower=2,
    vocab_upper=10,
)

[[8, 4, 7, 3, 9, 8], [5, 4, 3, 6, 5, 8], [9, 9, 4], [3, 9, 7], [6, 9, 3, 5, 3], [7, 2, 3, 8, 7], [9, 9, 9, 8], [3, 7, 6, 5, 9, 5, 6, 2], [6, 9, 6, 4, 3, 5, 3], [7, 3, 4, 7, 9, 2]]
