In [1]:
import warnings
warnings.filterwarnings("ignore") # ignore warnings for better demonstration
import os
import sys
import random
import time
import numpy as np
import math

# Make tensorflow less verbose; filter out info (1+) and warnings (2+) but not errors (3).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from tensorflow.python.platform import gfile
from six.moves import xrange

import data_utils

In [2]:
#set config
class gen_config(object):
    initialize = True
    learning_rate = 0.5
    learning_rate_decay_factor = 0.99
    batch_size = 128
    emb_dim = 128 # 512
    num_layers = 1 # 2
    vocab_size = 20000
    max_gradient_norm = 5.0
    steps_per_checkpoint = 100 #200
    pretrain_steps = 4000
    train_dir = 'data' # 'movie_data'
    save_dir = 'log/gen_models'
    tensorboard_dir = 'log/tensorboard'
    buckets = [(5, 10), (10, 15), (20, 25), (40, 50)]

In [3]:
#prepare the data
vocab, rev_vocab, dev_set, train_set = data_utils.prepare_data(gen_config)

Reading development and training gen_data


In [4]:
#Build Craph
class Seq2SeqModel(object):
    def __init__(self, config, name_scope, forward_only=False, num_samples=512, dtype=tf.float32):
        self.vocab_size = config.vocab_size
        self.emb_dim = config.emb_dim
        self.buckets = config.buckets
        self.learning_rate = tf.Variable(float(config.learning_rate), name="learning_rate", trainable=False, dtype=dtype)
        self.learning_rate_decay_op = self.learning_rate.assign(self.learning_rate * config.learning_rate_decay_factor)
        self.global_step = tf.Variable(0, name="global_step", trainable=False)
        self.batch_size = config.batch_size
        self.num_layers = config.num_layers
        self.max_gradient_norm = config.max_gradient_norm
        self.forward_only = tf.placeholder(tf.bool, name="forward_only")
        # Feeds for inputs.
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None], name="encoder_inputs") # [seq_len, batch]
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None], name="decoder_inputs")
        self.targets = tf.placeholder(tf.int32, shape=[None, None], name="targets")
        self.target_weights = tf.placeholder(tf.float32, shape=[None, None], name="target_weight")
        self.inputs_len = tf.placeholder(tf.int32, shape=[None])
        self.target_len = tf.placeholder(tf.int32, shape=[None])
        size = self.emb_dim

        # Embedding
        self.enc_embedding = tf.get_variable(
                "encoder_embedding", [self.vocab_size, self.emb_dim], dtype=tf.float32)
        embed_inputs = tf.nn.embedding_lookup(self.enc_embedding, self.encoder_inputs) # [seq_len, batch, emb_dim]
        # Encoder
        encoder_cell = tf.nn.rnn_cell.GRUCell(size)
        if self.num_layers > 1:
            encoder_cell = tf.nn.rnn_cell.MultiRNNCell([encoder_cell] * self.num_layers)
        # Dynamic encoding
        encoder_outputs, encoder_state = tf.nn.dynamic_rnn(
            encoder_cell, embed_inputs, dtype=tf.float32, sequence_length=self.inputs_len, time_major=True)
        
        # Output projection layer
        with tf.variable_scope("output_projection"):
            self.output_layer = tf.layers.Dense(self.vocab_size)
            self.output_layer.build(size)
            # w and b are used in sample_loss
            w = self.output_layer.kernel
            w_t = tf.transpose(w)
            b = self.output_layer.bias

        # Decoder
        embed_targets = tf.nn.embedding_lookup(self.enc_embedding, self.decoder_inputs)
        decoder_cell = tf.nn.rnn_cell.GRUCell(size)
        if self.num_layers > 1:
            decoder_cell = tf.nn.rnn_cell.MultiRNNCell([decoder_cell] * self.num_layers)
        if not forward_only:
            # teacher focusing
            helper = tf.contrib.seq2seq.TrainingHelper(
                embed_targets, self.target_len, time_major=True)
        else:
            start_tokens = tf.fill([self.batch_size], data_utils.GO_ID)
            end_token = -1 # we dont need EOS to finish decoding(for compating with the shape of self.targets)
            helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                self.enc_embedding, start_tokens, end_token)
        decoder = tf.contrib.seq2seq.BasicDecoder(
            decoder_cell, helper, encoder_state, output_layer=None if not forward_only else self.output_layer)

        # Dynamic decoding
        outputs, final_context_state, final_sequence_lengths = tf.contrib.seq2seq.dynamic_decode(
            decoder,
            maximum_iterations=None if not forward_only else tf.reduce_max(self.target_len),
            output_time_major=True,
            swap_memory=True)
        self.sample_id = outputs.sample_id
        self.logits = outputs.rnn_output
        
        # Loss
        def sampled_loss(inputs, labels):
            labels = tf.reshape(labels, [-1, 1])
            return tf.cast(
                tf.nn.sampled_softmax_loss(weights=w_t, biases=b, inputs=inputs, labels=labels,
                                                num_sampled=num_samples, num_classes=self.vocab_size), dtype)
        if not forward_only:
            _, self.loss = tf.while_loop(lambda time, loss: tf.less(time, tf.reduce_max(self.target_len)),
                          lambda time, loss: (time + 1, loss + tf.reduce_mean(sampled_loss(self.logits[time], self.targets[time])*self.target_weights[time])),
                          loop_vars=[tf.constant(0), tf.constant(0.0, dtype=dtype)])
        else:
            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.targets, logits=self.logits)
            self.loss = tf.reduce_sum(crossent * self.target_weights) / tf.to_float(self.batch_size)
            
        # Gradient Descent
        params = tf.trainable_variables()
        if not forward_only:
            opt = tf.train.GradientDescentOptimizer(self.learning_rate)
            gradients = tf.gradients(self.loss, params)
            clipped_gradients, norm = tf.clip_by_global_norm(gradients, self.max_gradient_norm)
            self.gradient_norm=norm
            self.update=opt.apply_gradients(zip(clipped_gradients, params), global_step=self.global_step)

        self.saver = tf.train.Saver(tf.global_variables())
        
    def step(self, session, encoder_inputs, decoder_inputs, target_weights, inputs_len, target_len,
        bucket_id, forward_only):
        encoder_size, decoder_size = self.buckets[bucket_id]
        # Input feed: encoder inputs, decoder inputs, target_weights, as provided.
        input_feed = {}
        input_feed[self.encoder_inputs] = encoder_inputs
        input_feed[self.decoder_inputs] = decoder_inputs
        input_feed[self.target_weights] = target_weights
        # Our targets are decoder inputs shifted by one.
        input_feed[self.targets] = decoder_inputs[1:]+[np.zeros([self.batch_size], dtype=np.int32)]

        input_feed[self.inputs_len] = inputs_len
        input_feed[self.target_len] = np.ones([self.batch_size], dtype=np.int32)* decoder_size
        # Output feed: depends on whether we do a backward step or not.
        if not forward_only:
            output_feed = [self.update,  # Update Op that does SGD.
                         self.gradient_norm,  # Gradient norm.
                         self.loss]  # Loss for this batch.
        else:
            output_feed = [self.loss]  # Loss for this batch.
            for l in xrange(decoder_size if decoder_size<20 else 20):  # Output logits.
                output_feed.append(self.sample_id[l])

        outputs = session.run(output_feed, input_feed)
        if not forward_only:
            return outputs[1], outputs[2], None  # Gradient norm, loss, no outputs.
        else:
            return None, outputs[0], outputs[1:]  # No gradient norm, loss, outputs.

In [36]:
def train(gen_config):
    # creating and loading the vocabulary and the train and dev data
    vocab, rev_vocab, dev_set, train_set = data_utils.prepare_data(gen_config)
    for b_set in train_set:
        print("b_set: ", len(b_set))

    with tf.Session() as sess:
        print("Creating %d layers of %d units." % (gen_config.num_layers, gen_config.emb_dim))
        creat_time = time.time()
        model = Seq2SeqModel(gen_config, name_scope="Basic_Seq2seq", forward_only=False,
                                        dtype=tf.float32)
        sess.run(tf.variables_initializer(tf.global_variables()))
        print("creat gen_model time: %.3f" % (time.time()-creat_time))

        train_bucket_sizes = [len(train_set[b]) for b in range(len(gen_config.buckets))]
        train_total_size = float(sum(train_bucket_sizes))
        train_buckets_scale = [sum(train_bucket_sizes[:i + 1]) / train_total_size
                               for i in xrange(len(train_bucket_sizes))]

        # This is the training loop.
        step_time, loss = 0.0, 0.0
        current_step = 0
        previous_losses = []
        
        print("Begin training...")
        print("Record every %d steps" % gen_config.steps_per_checkpoint)
        while current_step < gen_config.pretrain_steps:
            # Choose a bucket according to disc_data distribution. We pick a random number
            # in [0, 1] and use the corresponding interval in train_buckets_scale.
            random_number_01 = np.random.random_sample()
            bucket_id = min([i for i in range(len(train_buckets_scale)) if train_buckets_scale[i] > random_number_01])

            # Get a batch and make a step.
            start_time = time.time()
            encoder_inputs, decoder_inputs, target_weights, inputs_len, target_len = data_utils.get_batch(
                gen_config, train_set, bucket_id)

            _, step_loss, _ = model.step(sess, encoder_inputs, decoder_inputs, target_weights, inputs_len, target_len, 
                                            bucket_id, forward_only=False)

            step_time += (time.time() - start_time) / gen_config.steps_per_checkpoint
            loss += step_loss / gen_config.steps_per_checkpoint
            current_step += 1
            print("\r step:{:5}  step_loss:{:8.4f} step_time:{:8.4f} bucket:{}".format(current_step, step_loss, time.time() - start_time, bucket_id), end=' ')
            # Once in a while, we print statistics.
            if current_step % gen_config.steps_per_checkpoint == 0:
                # Print statistics for the previous epoch.
                perplexity = math.exp(loss) if loss < 300 else float('inf')
                print("\n global step %d learning rate %.4f step-time %.2f loss %.4f perplexity "
                      "%.2e" % (model.global_step.eval(), model.learning_rate.eval(),
                                step_time, loss, perplexity))
                # Decrease learning rate if no improvement was seen over last 3 times.
                if len(previous_losses) > 2 and loss > max(previous_losses[-3:]):
                    sess.run(model.learning_rate_decay_op)
                previous_losses.append(loss)
                step_time, loss = 0.0, 0.0
                sys.stdout.flush()
        # Save model
        gen_ckpt_dir = os.path.abspath(os.path.join(gen_config.save_dir, "checkpoints"))
        if not os.path.exists(gen_ckpt_dir):
            os.makedirs(gen_ckpt_dir)
        checkpoint_path = os.path.join(gen_ckpt_dir, "gen.model")
        print("current_step: %d, save model to %s" % (current_step, os.path.join(gen_config.save_dir, "checkpoints")))
        model.saver.save(sess, checkpoint_path, global_step=model.global_step)

In [37]:
SEED = 0
tf.set_random_seed(SEED)
np.random.seed(SEED)
with tf.Graph().as_default():
    train(gen_config)

Reading development and training gen_data
b_set:  1785
b_set:  16959
b_set:  33353
b_set:  20381
Creating 1 layers of 128 units.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


creat gen_model time: 0.955
Begin training...
Record every 100 steps
 step:  100  step_loss: 33.6349 step_time:  0.0575 bucket:0 
 global step 100 learning rate 0.5000 step-time 0.15 loss 89.9530 perplexity 1.16e+39
 step:  200  step_loss: 55.0414 step_time:  0.1313 bucket:2 
 global step 200 learning rate 0.5000 step-time 0.16 loss 72.6084 perplexity 3.42e+31
 step:  300  step_loss: 90.8312 step_time:  0.2689 bucket:3 
 global step 300 learning rate 0.5000 step-time 0.16 loss 65.1550 perplexity 1.98e+28
 step:  400  step_loss: 31.4943 step_time:  0.0807 bucket:1 
 global step 400 learning rate 0.5000 step-time 0.15 loss 59.0490 perplexity 4.41e+25
 step:  500  step_loss: 44.5948 step_time:  0.1295 bucket:2 
 global step 500 learning rate 0.5000 step-time 0.15 loss 57.1043 perplexity 6.31e+24
 step:  600  step_loss: 88.8566 step_time:  0.2693 bucket:3 
 global step 600 learning rate 0.5000 step-time 0.15 loss 54.5215 perplexity 4.77e+23
 step:  700  step_loss: 81.5079 step_time:  0.272

In [27]:
import bleu
def eval(gen_config):
    vocab, rev_vocab, dev_set, train_set = data_utils.prepare_data(gen_config)
    for b_set in dev_set:
        print("b_set: ", len(b_set))

    with tf.Session() as sess:
        model = Seq2SeqModel(gen_config, name_scope="Basic_Seq2seq", forward_only=True,
                                            dtype=tf.float32)
        gen_ckpt_dir = os.path.abspath(os.path.join(gen_config.save_dir, "checkpoints"))
        ckpt = tf.train.get_checkpoint_state(gen_ckpt_dir)
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            #print("Reading Gen model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else: 
            raise ValueError("Please run the training first")
        # Run evals on development set and print their perplexity.
        for bucket_id in xrange(len(gen_config.buckets)):
            encoder_inputs, decoder_inputs, target_weights, inputs_len, target_len = data_utils.get_batch(
                    gen_config, dev_set, bucket_id)
            _, eval_loss, sample_ids = model.step(sess, encoder_inputs, decoder_inputs, target_weights, inputs_len, target_len, bucket_id, True)
            eval_ppx = math.exp(eval_loss) if eval_loss < 300 else float('inf')
            print("eval: bucket %d loss %.4f perplexity %.2e" % (bucket_id, eval_loss, eval_ppx))
            queries = data_utils.clean(encoder_inputs, data_utils.PAD_ID)
            answers = data_utils.clean(decoder_inputs[1:], data_utils.EOS_ID)
            gens = data_utils.clean(sample_ids, data_utils.EOS_ID)
            references = [[gen] for gen in gens]
            for i in range(4):
                bleu_score, _, _, _, _, _ = bleu.compute_bleu(references, answers, max_order = i+1)
                print("BLEU %d sorces: %.4f"%(i+1, 100 * bleu_score))
            for i in range(3):
                print("Q:", " ".join([tf.compat.as_str(rev_vocab[j]) for j in queries[i]]))
                print("A:", " ".join([tf.compat.as_str(rev_vocab[j]) for j in answers[i]]))
                print("G:", " ".join([tf.compat.as_str(rev_vocab[j]) for j in gens[i]]))
                bleu_score, _, _, _, _, _ = bleu.compute_bleu([[gens[i]]], [answers[i]], max_order = 1)
                print("BLEU sorces: %.4f"%(100 * bleu_score))
                print()

In [29]:
with tf.Graph().as_default():
    eval(gen_config)

Reading development and training gen_data
b_set:  133
b_set:  1427
b_set:  3007
b_set:  1826
INFO:tensorflow:Restoring parameters from /home/chenminghao/git_work/Chatbot_test/log/gen_models/checkpoints/gen.model-4000
eval: bucket 0 loss 59.9980 perplexity 1.14e+26
BLEU 1 sorces: 16.7702
BLEU 2 sorces: 4.3100
BLEU 3 sorces: 1.8703
BLEU 4 sorces: 1.1135
Q: okay , tom .
A: where ' s cindy ?
G: what ' s the matter ?
BLEU sorces: 49.1238

Q: thanks a lot .
A: not at all .
G: you ' re welcome .
BLEU sorces: 19.4700

Q: great . bye .
A: bye .
G: you ' re welcome .
BLEU sorces: 11.1565

eval: bucket 1 loss 82.7694 perplexity 8.84e+35
BLEU 1 sorces: 21.4082
BLEU 2 sorces: 7.9135
BLEU 3 sorces: 4.5553
BLEU 4 sorces: 2.7409
Q: no . just occasionally .
A: what ' s your favorite dance ?
G: you ' re welcome .
BLEU sorces: 14.2857

Q: i ' ll send a squad car .
A: please hurry .
G: ok .
BLEU sorces: 33.3333

Q: oh , so many kinds of winter hats .
A: what is your favorite color , miss ?
G: what are you