In [None]:
import numpy as np
import tensorflow as tf
import reader

# Directory that holds the PTB dataset files
DATA_PATH = "PTB dataset"
hidden_size = 200    # hidden layer size: number of units used to remember and store past state
num_layers = 2  # number of stacked LSTM layers; each layer's output feeds the next layer
vocab_size = 10000  # vocabulary size (10000 distinct words)

learning_rate = 1.0  # initial learning rate
train_batch_size = 20  # training batch size
train_num_step = 35  # length of one training sequence
num_epoch = 2  # number of passes over the training data
keep_prob = 0.5  # dropout keep probability (50% of the nodes are kept)
max_grad_norm = 5  # gradient-clipping threshold, guards against exploding gradients

# At evaluation time the sequence length does not need to be limited,
# so data is fed one word at a time
eval_batch_size = 1
eval_num_step = 1


class PTBModel(object):
    """Multi-layer LSTM language model expressed as a TensorFlow 1.x graph.

    Constructing an instance adds the model's ops to the current default
    graph. Sizes come from the module-level settings ``hidden_size``,
    ``num_layers``, ``vocab_size``, ``keep_prob``, ``max_grad_norm`` and
    ``learning_rate``.

    Args:
        is_training: when true, dropout is applied to the embedded inputs and
            to each LSTM layer's output, and ``train_op`` is created.
        batch_size: number of sequences per batch.
        num_steps: number of time steps the LSTM is unrolled for.

    Attributes:
        batch_size, num_steps: the constructor arguments, stored as given.
        input_data: int32 placeholder of shape [batch_size, num_steps].
        targets: int32 placeholder of shape [batch_size, num_steps].
        initial_state: all-zero LSTM state for one batch.
        final_state: LSTM state after the last unrolled step.
        cost: summed per-position loss divided by ``batch_size``.
        train_op: clipped-gradient SGD update (only when ``is_training``).
    """

    def __init__(self, is_training, batch_size, num_steps):
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Input word ids and the expected next-word ids, both
        # [batch_size, num_steps].
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        def make_cell():
            """Build one LSTM layer, wrapped in output dropout when training."""
            layer = tf.nn.rnn_cell.BasicLSTMCell(hidden_size)
            if is_training:
                layer = tf.nn.rnn_cell.DropoutWrapper(layer, output_keep_prob=keep_prob)
            return layer

        # Every layer needs its own cell object. Repeating one instance with
        # `[cell] * num_layers` puts the same object in every layer, which
        # (depending on the TensorFlow 1.x version) either makes all layers
        # share a single set of weights or raises a variable-scope reuse error.
        cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(num_layers)])

        # Start every sequence in the batch from an all-zero state.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # One hidden_size-dimensional vector per vocabulary entry.
        embedding = tf.get_variable("embedding", [vocab_size, hidden_size])

        # Look up word ids: [batch_size, num_steps] ->
        # [batch_size, num_steps, hidden_size].
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # Dropout on the embedded inputs, training only.
        if is_training:
            inputs = tf.nn.dropout(inputs, keep_prob)

        # Unroll the network manually, carrying the state across time steps.
        outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    # Later steps reuse the variables created at step 0.
                    tf.get_variable_scope().reuse_variables()
                cell_output, state = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Concatenate along axis 1 to [batch_size, num_steps * hidden_size],
        # then flatten to [batch_size * num_steps, hidden_size].
        output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

        # Project each LSTM output onto the vocabulary to get next-word logits.
        weight = tf.get_variable("weight", [hidden_size, vocab_size])
        bias = tf.get_variable("bias", [vocab_size])
        logits = tf.matmul(output, weight) + bias

        # Loss for every (sequence, time step) position; targets are flattened
        # to match the logits rows and every position is weighted 1.
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(self.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])

        # Total loss over all positions, averaged over the batch dimension.
        self.cost = tf.reduce_sum(loss) / batch_size
        self.final_state = state

        # Back-propagation ops are only needed for training.
        if not is_training:
            return
        trainable_variables = tf.trainable_variables()
        # Clip by global norm to keep gradients from exploding.
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_variables), max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        # Pair each clipped gradient with its variable and apply the update.
        self.train_op = optimizer.apply_gradients(zip(grads, trainable_variables))


# 模型训练，给出模型的复杂度
def run_epoch(session, model, data, train_op, output_log, epoch_size):
    """Run `epoch_size` batches through `model` and return the perplexity.

    Perplexity is computed here as exp(accumulated cost / accumulated number
    of time steps); lower values mean the model predicts the data better.

    Args:
        session: object whose `run(fetches, feed_dict)` evaluates the graph.
        model: provides `initial_state`, `final_state`, `cost`, `input_data`,
            `targets` and `num_steps`.
        data: fetch that yields one `(inputs, targets)` batch per `run` call.
        train_op: op evaluated together with the cost on every batch.
        output_log: when true, print the running perplexity every 100 steps.
        epoch_size: number of batches to process.

    Returns:
        The perplexity over all processed batches.
    """
    loss_sum = 0.0
    steps_seen = 0
    lstm_state = session.run(model.initial_state)
    fetches = [model.cost, model.final_state, train_op]

    for step in range(epoch_size):
        batch_inputs, batch_targets = session.run(data)
        feed = {
            model.input_data: batch_inputs,
            model.targets: batch_targets,
            # Carry the LSTM state over from the previous batch.
            model.initial_state: lstm_state,
        }
        batch_cost, lstm_state, _ = session.run(fetches, feed)

        loss_sum += batch_cost
        steps_seen += model.num_steps

        if output_log and step % 100 == 0:
            running_perplexity = np.exp(loss_sum / steps_seen)
            print("After %d steps, perplexity is %.3f" % (step, running_perplexity))

    return np.exp(loss_sum / steps_seen)


def compute_epoch_size(data_len, batch_size, num_steps):
    """Return how many batches one pass over `data_len` words produces.

    The data is assumed to be split into `batch_size` parallel streams that
    are consumed `num_steps` words at a time; one word per stream is held
    back because targets are the inputs shifted by one position.
    NOTE(review): this is the formula used by the TensorFlow PTB tutorial's
    `reader.ptb_producer` -- confirm the local `reader` module matches.
    """
    return (data_len // batch_size - 1) // num_steps


def main():
    """Train the PTB language model, then report validation and test perplexity."""
    # Load the raw word-id sequences.
    train_data, valid_data, test_data, _ = reader.ptb_raw_data(DATA_PATH)

    # Number of batches per epoch. This must take batch size and sequence
    # length into account; using len(data) - 1 directly would run
    # batch_size * num_steps times too many training steps per "epoch".
    train_epoch_size = compute_epoch_size(len(train_data), train_batch_size, train_num_step)
    valid_epoch_size = compute_epoch_size(len(valid_data), eval_batch_size, eval_num_step)
    test_epoch_size = compute_epoch_size(len(test_data), eval_batch_size, eval_num_step)

    initializer = tf.random_uniform_initializer(-0.05, 0.05)

    # Training model creates the variables ...
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, train_batch_size, train_num_step)

    # ... and the evaluation model reuses them.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, eval_batch_size, eval_num_step)

    with tf.Session() as session:
        tf.global_variables_initializer().run()

        train_queue = reader.ptb_producer(train_data, train_model.batch_size, train_model.num_steps)
        eval_queue = reader.ptb_producer(valid_data, eval_model.batch_size, eval_model.num_steps)
        test_queue = reader.ptb_producer(test_data, eval_model.batch_size, eval_model.num_steps)

        # Evaluation performs no optimisation; create the no-op once rather
        # than adding a new op to the graph on every call.
        eval_op = tf.no_op()

        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=session, coord=coord)
        try:
            for i in range(num_epoch):
                print("iteration: %d" % (i + 1))
                run_epoch(session, train_model, train_queue, train_model.train_op, True, train_epoch_size)
                valid_perplexity = run_epoch(session, eval_model, eval_queue, eval_op, False, valid_epoch_size)
                print("Epoch: %d Validation Perplexity: %.3f" % (i + 1, valid_perplexity))

            test_perplexity = run_epoch(session, eval_model, test_queue, eval_op, False, test_epoch_size)
            print("Test Perplexity: %.3f" % test_perplexity)
        finally:
            # Stop and join the queue-runner threads even if training fails.
            coord.request_stop()
            coord.join(threads)


# Run the training and evaluation pipeline only when executed as a script.
if __name__ == "__main__":
    main()


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.range(limit).shuffle(limit).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have 

After 9500 steps, perplexity is 217.173
After 9600 steps, perplexity is 216.635
After 9700 steps, perplexity is 216.053
After 9800 steps, perplexity is 215.561
After 9900 steps, perplexity is 215.138
After 10000 steps, perplexity is 214.627
After 10100 steps, perplexity is 213.957
After 10200 steps, perplexity is 213.425
After 10300 steps, perplexity is 213.026
After 10400 steps, perplexity is 212.361
After 10500 steps, perplexity is 211.709
After 10600 steps, perplexity is 211.183
After 10700 steps, perplexity is 210.690
After 10800 steps, perplexity is 210.212
After 10900 steps, perplexity is 209.884
After 11000 steps, perplexity is 209.345
After 11100 steps, perplexity is 208.874
After 11200 steps, perplexity is 208.500
After 11300 steps, perplexity is 208.171
After 11400 steps, perplexity is 207.605
After 11500 steps, perplexity is 207.158
After 11600 steps, perplexity is 206.878
After 11700 steps, perplexity is 206.360
After 11800 steps, perplexity is 205.904
After 11900 steps, pe