In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.python.ops import tensor_array_ops, control_flow_ops
import collections

In [2]:
# Parameters
batch_size = 64
vocab_size = 5000
embed_dims = 32
enc_hidden_size = 32
dec_hidden_size = 64
seq_length = 8
learning_rate = 0.01
pointer = 0
start_word = 2
training_epoches = 100
data_path = "Chinese_quatrains_7.txt"
vocab_path = "Chinese_quatrains_7.vocab"

In [3]:
# data_helper
def build_vocab(vocab_path, data_path):
    files = open(data_path, 'r', encoding='utf-8').read()
    words = files.split()
    wordcount = collections.Counter(words)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        f.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<UNK>", "<PAD>", "<SOS>", "<EOS>"))
        for word, count in wordcount.most_common(vocab_size-4):
            f.write("{}\t{}\n".format(word, count))
        
def mini_batch(vocab_path, data_path):
    token_seqs, sentences = load_datasets(vocab_path, data_path)
    num_batch = int(len(sentences) / batch_size)
    sentences = sentences[:num_batch * batch_size]
    token_seqs = token_seqs[:num_batch * batch_size]
    sentences, tokens = np.array(sentences), np.array(token_seqs)
    sentence_batch = np.split(sentences, num_batch, 0)
    token_batch = np.split(tokens, num_batch, 0)
    return token_batch, sentence_batch, num_batch
    
def load_datasets(vocab_path, data_path):
    sentences = [line for line in open(data_path, 'r', encoding='utf-8').read().split("\n") if line]
    word2idx, idx2word = load_vocab(vocab_path)
    
    token_list, sources = [], []
    for source in sentences:
        x = [word2idx.get(word, 1) for word in (source + " <EOS>").split()]
        token_list.append(x)
        sources.append(source)
    return token_list, sources
    
def load_vocab(vocab_path):
    vocab = [line.split()[0] for line in open(vocab_path, 'r', encoding='utf-8').read().splitlines()]
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {word2idx[word]: word for word in word2idx}
    return word2idx, idx2word

def next_batch(token_batches, pointer, num_batch):
    result = token_batches[pointer]
    pointer = (pointer + 1) % num_batch
    return result

In [4]:
build_vocab(vocab_path, data_path)
token_sequences, sentences, num_batch = mini_batch(vocab_path, data_path)

# word2idx, idx2word = load_vocab(vocab_path)
# result = next_batch(token_sequences, pointer, num_batch)
# trans_sentences = []
# for sentence in sentences[0]:
#     transform = [word2idx.get(word, 1) for word in (sentence + " <EOS>").split()]
#     trans_sentences.append(transform)
# print(result[:3], "\n\n", np.array(trans_sentences)[:3])

FileNotFoundError: [Errno 2] No such file or directory: 'Chinese_quatrains_7.txt'

In [5]:
# Encoder
class Encoder(object):
    def __init__(self, vocab_size, batch_size, embed_size, hidden_size, seq_length, start_word):
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.start_token = tf.constant([start_word] * self.batch_size, dtype=tf.int32)
        self.enc_params = []
        
        with tf.variable_scope("encoder"):
            self.enc_embeddings = tf.Variable(self.init_matrix([self.vocab_size, self.embed_size]))
            self.enc_params.append(self.enc_embeddings)
            self.forward_layer = self.recurrent_lstm_forward(self.enc_params)
            
        # placeholder
        self.x = tf.placeholder(tf.int32, shape=[self.batch_size, self.seq_length])
        
        # initialize
        with tf.device("/cpu:0"):
            self.processed_x = tf.transpose(tf.nn.embedding_lookup(self.enc_embeddings, self.x), perm=[1, 0, 2])
            
        h0 = tf.zeros([self.batch_size, self.hidden_size])
        self.hidden_memory_0 = tf.stack([h0, h0])
        
        # training_step
        ta_embed_x_forward = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.seq_length, dynamic_size=False, infer_shape=True)
        ta_embed_x_forward = ta_embed_x_forward.unstack(self.processed_x)
        ta_embed_x_backward = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.seq_length, dynamic_size=False, infer_shape=True)
        ta_embed_x_backward = ta_embed_x_backward.unstack(self.processed_x)

        def _recurrence_lstm_forward(i, x_t, h_tm):
            h_t = self.forward_layer(x_t, h_tm)
            x_ = ta_embed_x_forward.read(i)
            return i + 1, x_, h_t
        
        def _recurrence_lstm_backward(i, x_t, h_tm):
            h_t = self.forward_layer(x_t, h_tm)
            x_ = ta_embed_x_backward.read(i)
            return i - 1, x_, h_t

        _, _, hidden_memory_forward = control_flow_ops.while_loop(
            cond=lambda i, _1, _2: i < seq_length,
            body=_recurrence_lstm_forward,
            loop_vars=(tf.constant(0, dtype=tf.int32), tf.nn.embedding_lookup(self.enc_embeddings, self.start_token), self.hidden_memory_0)
        )
        
        _, _, hidden_memory_backward = control_flow_ops.while_loop(
            cond=lambda i, _1, _2: i >= 0,
            body=_recurrence_lstm_backward,
            loop_vars=(tf.constant(self.seq_length - 1, dtype=tf.int32), tf.nn.embedding_lookup(self.enc_embeddings, self.start_token), self.hidden_memory_0)
        )

        hidden_forward, _ = tf.unstack(hidden_memory_forward)
        hidden_backward, _ = tf.unstack(hidden_memory_backward)
        self.hidden_state = tf.concat((hidden_forward, hidden_backward), 1)
        
    def init_matrix(self, shape):
        return tf.random_normal(shape, stddev=0.1)
    
    def recurrent_lstm_forward(self, params):
        self.Wi = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Ui = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bi = tf.Variable(self.init_matrix([self.hidden_size]))
        
        self.Wf = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Uf = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bf = tf.Variable(self.init_matrix([self.hidden_size]))
        
        self.Wo = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Uo = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bo = tf.Variable(self.init_matrix([self.hidden_size]))
        
        self.Wc = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Uc = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bc = tf.Variable(self.init_matrix([self.hidden_size]))
        
        params.extend([
            self.Wi, self.Ui, self.bi,
            self.Wf, self.Uf, self.bf,
            self.Wo, self.Uo, self.bo,
            self.Wc, self.Uc, self.bc
        ])
        
        def forward(x, hidden_memory):
            hidden_state, cell_state = tf.unstack(hidden_memory)
            
            i = tf.sigmoid(
                tf.matmul(x, self.Wi) + tf.matmul(hidden_state, self.Ui) + self.bi
            )
            
            f = tf.sigmoid(
                tf.matmul(x, self.Wf) + tf.matmul(hidden_state, self.Uf) + self.bf
            )
            
            o = tf.sigmoid(
                tf.matmul(x, self.Wo) + tf.matmul(hidden_state, self.Uo) + self.bo
            )
            
            c_ = tf.nn.tanh(
                tf.matmul(x, self.Wc) + tf.matmul(hidden_state, self.Uc) + self.bc
            )
            
            c = f * cell_state + i * c_
            current_hidden_state = tf.nn.tanh(c)
            
            return tf.stack([current_hidden_state, c])
            
        return forward

In [6]:
# Decoder
class Decoder(object):
    def __init__(self, vocab_size, batch_size, embed_size, hidden_size, seq_length, start_word, learning_rate, dec_params):
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.seq_length = seq_length
        self.start_token = tf.constant([start_word] * self.batch_size, dtype=tf.int32)
        self.learning_rate = learning_rate
        self.dec_params = dec_params
        self.grad_clip = 5.0
        
        with tf.variable_scope("decoder"):
            self.dec_embeddings = tf.Variable(self.init_matrix([self.vocab_size, self.embed_size]))
            self.dec_params.append(self.dec_embeddings)
            self.forward_layer = self.recurrent_lstm_forward(self.dec_params)
            self.linear_layer = self.recurrent_linear_forward(self.dec_params)
            
        # placeholder
        self.x = tf.placeholder(tf.int32, shape=[self.batch_size, self.seq_length])
        self.hidden_state = tf.placeholder(tf.float32, shape=[self.batch_size, self.hidden_size])
        
        with tf.device("/cpu:0"):
            self.processed_x = tf.transpose(tf.nn.embedding_lookup(self.dec_embeddings, self.x), perm=[1, 0, 2])
            
        cell_state = tf.zeros([self.batch_size, self.hidden_size])
        self.hidden_memory = tf.stack([self.hidden_state, cell_state])
            
        # training_step
        predictions = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.seq_length, dynamic_size=False, infer_shape=True)
        ta_embed_x = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.seq_length, dynamic_size=False, infer_shape=True)
        ta_embed_x = ta_embed_x.unstack(self.processed_x)
        
        def _training_recurrence(i, x_t, h_tm, predictions):
            h_t = self.forward_layer(x_t, h_tm)
            o_t = self.linear_layer(h_t)
            predictions = predictions.write(i, o_t)
            x_ = ta_embed_x.read(i)
            return i + 1, x_, h_t, predictions
        
        _, _, _, predictions = control_flow_ops.while_loop(
            cond=lambda i, _1, _2, _3: i < self.seq_length,
            body=_training_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32), tf.nn.embedding_lookup(self.dec_embeddings, self.start_token), self.hidden_memory, predictions)
        )
        
        self.predictions = predictions.stack()
        self.predictions = tf.transpose(self.predictions, perm=[1, 0, 2])
                
        self.loss = -tf.reduce_sum(
            tf.one_hot(tf.cast(tf.reshape(self.x, [-1]), tf.int32), self.vocab_size, 1.0, 0.0) * tf.log(
                tf.clip_by_value(tf.reshape(self.predictions, [-1, vocab_size]), 1e-20, 1.0)
            )
        ) / (self.seq_length * self.batch_size)

        self.optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.gradients, _ = tf.clip_by_global_norm(tf.gradients(self.loss, self.dec_params), self.grad_clip)
        self.update = self.optimizer.apply_gradients(zip(self.gradients, self.dec_params))
        
        # testing_step
        output_prob_sequences = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.seq_length, dynamic_size=False, infer_shape=True)
        token_sequences = tensor_array_ops.TensorArray(dtype=tf.float32, size=self.seq_length, dynamic_size=False, infer_shape=True)
        
        def _pred_recurrence(i, x_t, h_tm, gen_o, gen_x):
            h_t = self.forward_layer(x_t, h_tm)
            o_t = self.linear_layer(h_t)
            log_prob = tf.log(o_t)
            next_token = tf.cast(tf.reshape(tf.multinomial(log_prob, 1), [self.batch_size]), tf.int32)
            x_ = tf.nn.embedding_lookup(self.dec_embeddings, next_token)
            gen_o = gen_o.write(i, tf.reduce_sum(tf.multiply(tf.one_hot(next_token, self.vocab_size, 1.0, 0.0), o_t), 1))
            gen_x = gen_x.write(i, next_token)
            return i + 1, x_, h_t, gen_o, gen_x
        
        _, _, _, self.output_prob_sequences, self.token_sequences = control_flow_ops.while_loop(
            cond=lambda i, _1, _2, _3, _4: i < self.seq_length,
            body=_pred_recurrence,
            loop_vars=(tf.constant(0, dtype=tf.int32), tf.nn.embedding_lookup(self.dec_embeddings, self.start_token), self.hidden_memory, output_prob_sequences, token_sequences)
        )
        
        self.token_sequences = self.token_sequences.stack()
        self.token_sequences = tf.transpose(self.token_sequences, perm=[1, 0])
            
    def init_matrix(self, shape):
        return tf.random_normal(shape, stddev=0.1)
    
    def recurrent_lstm_forward(self, params):
        self.Wi = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Ui = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bi = tf.Variable(self.init_matrix([self.hidden_size]))
        
        self.Wf = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Uf = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bf = tf.Variable(self.init_matrix([self.hidden_size]))
        
        self.Wo = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Uo = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bo = tf.Variable(self.init_matrix([self.hidden_size]))
        
        self.Wc = tf.Variable(self.init_matrix([self.embed_size, self.hidden_size]))
        self.Uc = tf.Variable(self.init_matrix([self.hidden_size, self.hidden_size]))
        self.bc = tf.Variable(self.init_matrix([self.hidden_size]))
        
        params.extend([
            self.Wi, self.Ui, self.bi,
            self.Wf, self.Uf, self.bf,
            self.Wo, self.Uo, self.bo,
            self.Wc, self.Uc, self.bc
        ])
        
        def forward(x, hidden_memory):
            hidden_state, cell_state = tf.unstack(hidden_memory)
            
            i = tf.sigmoid(
                tf.matmul(x, self.Wi) + tf.matmul(hidden_state, self.Ui) + self.bi
            )
            
            f = tf.sigmoid(
                tf.matmul(x, self.Wf) + tf.matmul(hidden_state, self.Uf) + self.bf
            )
            
            o = tf.sigmoid(
                tf.matmul(x, self.Wo) + tf.matmul(hidden_state, self.Uo) + self.bo
            )

            c_ = tf.nn.tanh(
                tf.matmul(x, self.Wc) + tf.matmul(hidden_state, self.Uc) + self.bc
            )

            c = f * cell_state + i * c_
            current_hidden_state = tf.nn.tanh(c)
            
            return tf.stack([current_hidden_state, c])
        
        return forward
    
    def recurrent_linear_forward(self, params):
        self.V = tf.Variable(self.init_matrix([self.hidden_size, self.vocab_size]))
        self.c = tf.Variable(self.init_matrix([self.vocab_size]))
        
        params.extend([
            self.V, self.c
        ])
        
        def forward(hidden_memory):
            hidden_state, cell_state = tf.unstack(hidden_memory)
            logits = tf.matmul(hidden_state, self.V) + self.c
            output = tf.nn.softmax(logits)
            return output
        
        return forward

In [7]:
class Seq2seq(object):
    def __init__(self, vocab_size, batch_size, embed_dims, enc_hidden_size, dec_hidden_size, seq_length, start_word, learning_rate):
        self.encoder = Encoder(vocab_size, batch_size, embed_dims, enc_hidden_size, seq_length, start_word)
        self.decoder = Decoder(vocab_size, batch_size, embed_dims, dec_hidden_size, seq_length, start_word, learning_rate, self.encoder.enc_params)
        
    def forward(self, sess, batch):
        hidden = sess.run(self.encoder.hidden_state, feed_dict={self.encoder.x: batch})
        loss, _ = sess.run([self.decoder.loss, self.decoder.update], feed_dict={self.decoder.x: batch, self.decoder.hidden_state: hidden})
        return loss
    
    def pred(self, sess, batch):
        hidden = sess.run(self.encoder.hidden_state, feed_dict={self.encoder.x: batch})
        pred = sess.run(self.decoder.token_sequences, feed_dict={self.decoder.x: batch, self.decoder.hidden_state: hidden})
        return pred

In [8]:
seq2seq_model = Seq2seq(vocab_size, batch_size, embed_dims, enc_hidden_size, dec_hidden_size, seq_length, start_word, learning_rate)

In [9]:
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())

In [10]:
for epoch in range(training_epoches):
    pointer = 0
    for i in range(num_batch):
        batch = next_batch(token_sequences, pointer, num_batch)
        loss = seq2seq_model.forward(sess, batch)
        if (epoch * num_batch + i) % 200 == 0:
            print("MSG : Epoch {}/{}, loss = {}".format(epoch * num_batch + i, training_epoches * num_batch, loss))

MSG : Epoch 0/100, loss = 8.512969970703125
MSG : Epoch 0/100, loss = 0.43367817997932434
MSG : Epoch 0/100, loss = 0.023673906922340393


KeyboardInterrupt: 