In [1]:
def is_uchar(uchar):
    """判断一个字符是否是汉字、数字、英文字母或特定符号"""
    if '\u4e00' <= uchar <= '\u9fa5':  # 汉字范围
        return True
    if '\u0030' <= uchar <= '\u0039':  # 数字范围
        return True
    if ('\u0041' <= uchar <= '\u005a') or ('\u0061' <= uchar <= '\u007a'):  # 英文字母范围
        return True
    if uchar in '，。：？“”！；、《》——':  # 特定符号
        return True
    return False

import tensorflow as tf

In [None]:
class RNNModel:
    def __init__(self, BATCH_SIZE, HIDDEN_SIZE, HIDDEN_LAYERS, VOCAB_SIZE, learning_rate):
        self.BATCH_SIZE = BATCH_SIZE
        self.HIDDEN_SIZE = HIDDEN_SIZE
        self.HIDDEN_LAYERS = HIDDEN_LAYERS
        self.VOCAB_SIZE = VOCAB_SIZE

        # 定义占位符
        self.inputs = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        self.targets = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        self.keepprb = tf.placeholder(tf.float32)

        # 定义词嵌入层
        embedding = tf.get_variable('embedding', [VOCAB_SIZE, HIDDEN_SIZE])
        emb_input = tf.nn.embedding_lookup(embedding, self.inputs)
        emb_input = tf.nn.dropout(emb_input, self.keepprb)

        # 搭建LSTM结构
        lstm = tf.contrib.rnn.LSTMCell(HIDDEN_SIZE, state_is_tuple=True)
        lstm = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=self.keepprb)
        cell = tf.contrib.rnn.MultiRNNCell([lstm] * HIDDEN_LAYERS)
        self.initial_state = cell.zero_state(BATCH_SIZE, tf.float32)
        outputs, self.final_state = tf.nn.dynamic_rnn(cell, emb_input, initial_state=self.initial_state)

        # 重新reshape输出
        outputs = tf.reshape(tf.concat(outputs, 1), [-1, HIDDEN_SIZE])
        w = tf.get_variable('outputs_weight', [HIDDEN_SIZE, VOCAB_SIZE])
        b = tf.get_variable('outputs_bias', [VOCAB_SIZE])
        logits = tf.matmul(outputs, w) + b

        # 计算损失
        self.loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example([logits], [tf.reshape(self.targets, [-1])],
                                                                       [tf.ones([BATCH_SIZE * TIME_STEPS],
                                                                                dtype=tf.float32)])
        self.cost = tf.reduce_sum(self.loss) / BATCH_SIZE

        # 优化算法
        global_step = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(learning_rate, global_step, BATCH_NUMS, 0.99, staircase=True)
        trainable_variables = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, trainable_variables), MAX_GRAD_NORM)
        self.opt = tf.train.AdamOptimizer(learning_rate).apply_gradients(zip(grads, trainable_variables))

        # 预测输出
        self.predict = tf.argmax(logits, 1)

import re
import jieba
import numpy as np

with open('./data/白马啸西风.txt', 'r', encoding='utf-8') as f:
    data = f.read()

data = re.sub(r'\(.*\)', '', data)  # 删除括号及其中内容
data = data.replace('……', '。')  # 替换省略号为句号

vocab = set(data)
id2char = list(vocab)
char2id = {c: i for i, c in enumerate(vocab)}

word_data = list(jieba.cut(data))
word_vocab = set(word_data)
id2word = list(word_vocab)
word2id = {w: i for i, w in enumerate(word_vocab)}

numdata = np.array([char2id[char] for char in data])

VOCAB_SIZE = len(vocab)
EPOCHS = 1000
BATCH_SIZE = 8
TIME_STEPS = 100
BATCH_NUMS = len(numdata) // (BATCH_SIZE * TIME_STEPS)
HIDDEN_SIZE = 512
HIDDEN_LAYERS = 6
MAX_GRAD_NORM = 1
learning_rate = 0.05

model = RNNModel(BATCH_SIZE, HIDDEN_SIZE, HIDDEN_LAYERS, VOCAB_SIZE, learning_rate)

saver = tf.train.Saver()
with tf.Session() as sess:
    writer = tf.summary.FileWriter('logs/tensorboard', tf.get_default_graph())
    sess.run(tf.global_variables_initializer())
    for k in range(EPOCHS):
        state = sess.run(model.initial_state)
        train_data = data_generator(numdata, BATCH_SIZE, TIME_STEPS)
        total_loss = 0.
        for i in range(BATCH_NUMS):
            xs, ys = next(train_data)
            feed = {model.inputs: xs, model.targets: ys, model.keepprb: 0.8, model.initial_state: state}
            costs, state, _ = sess.run([model.cost, model.final_state, model.opt], feed_dict=feed)
            total_loss += costs
            if (i + 1) % 50 == 0:
                print('epochs:', k + 1, 'iter:', i + 1, 'cost:', total_loss / (i + 1))
    saver.save(sess, './checkpoints/lstm.ckpt')

writer.close()


In [8]:
# 加载模型
evalmodel = RNNModel(1, HIDDEN_SIZE, HIDDEN_LAYERS, VOCAB_SIZE, learning_rate)
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, './checkpoints/lstm.ckpt')
    new_state = sess.run(evalmodel.initial_state)
    T = []
    with open('./data/白马啸西风_target.txt', 'r', encoding='utf-8') as f:
        T = [char2id[n.strip()] for n in f]

    x = np.array([T])
    samples = []
    for i in range(100):
        feed = {evalmodel.inputs: x, evalmodel.keepprb: 1., evalmodel.initial_state: new_state}
        c, new_state = sess.run([evalmodel.predict, evalmodel.final_state], feed_dict=feed)
        x[0] = c
        samples.append(c[0])

    print('test:', ''.join([id2char[index] for index in samples]))


之死于参战逃向针阵，莫铁桶分布之性即解众山静各地当世已回头是岸首第一席抄录提此议二十多年已先绞成可信通候妆次煤灰切削借来编好领头人所敢转入郭襄制一拉一东躲西寿礼难以相信恰容心境银杏树指挥绵掌不辞，一大说五姑决不再抢入
