# 诗歌生成

# 数据处理

In [23]:
import numpy as np
import tensorflow as tf
import collections
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import layers, optimizers, datasets
import tensorflow as tf
tf.autograph.set_verbosity(0)  # Set AutoGraph's logging verbosity to level 0 to keep its log output quiet

# Sentinel tokens wrapped around every poem before it is indexed.
start_token = 'bos'
end_token = 'eos'


def process_dataset(fileName, max_len=200):
    """Load a poem corpus and convert it to index sequences plus vocabularies.

    The file holds one poem per line in the form ``title:content``. The title
    is discarded, the content is split into single characters and wrapped in
    ``start_token`` / ``end_token``.

    Args:
        fileName: Path of the UTF-8 encoded corpus file.
        max_len: Poems whose token count (markers included) exceeds this
            value are dropped. Defaults to 200.

    Returns:
        A tuple ``(instances, word2id, id2word)`` where ``instances`` is a
        list of ``(token_ids, length)`` tuples, ``word2id`` maps each token
        to its index and ``id2word`` is the inverse mapping. Index 0 is
        ``'PAD'`` and index 1 is ``'UNK'``; the remaining tokens are ordered
        by descending frequency (ties keep first-seen order).
    """
    examples = []
    # Explicit encoding: the corpus is Chinese text, so relying on the
    # platform's locale encoding breaks on non-UTF-8 systems.
    with open(fileName, 'r', encoding='utf-8') as fd:
        for line in fd:
            # Everything after the first ':' is content; further ':' pieces
            # are concatenated without the separator.
            content = ''.join(line.strip().split(':')[1:])
            if not content:
                # Blank lines or lines without content are not poems.
                continue
            ins = [start_token] + list(content) + [end_token]
            if len(ins) > max_len:
                continue
            examples.append(ins)

    # Token frequencies over the whole corpus.
    counter = collections.Counter()
    for poem in examples:
        counter.update(poem)

    # most_common() sorts by count descending and is stable, and it simply
    # yields nothing for an empty corpus instead of failing on unpacking.
    ranked_tokens = tuple(token for token, _ in counter.most_common())
    words = ('PAD', 'UNK') + ranked_tokens  # PAD = padding, UNK = unknown
    word2id = {word: idx for idx, word in enumerate(words)}
    id2word = {idx: word for word, idx in word2id.items()}

    # Each instance pairs the indexed poem with its length.
    instances = [([word2id[w] for w in poem], len(poem)) for poem in examples]

    return instances, word2id, id2word

def poem_dataset():
    """Build the batched training dataset from ``./poems.txt``.

    Returns:
        A tuple ``(ds, word2id, id2word)``. ``ds`` yields batches of
        ``(inputs, targets, lengths)`` where ``targets`` is ``inputs``
        shifted one position to the left and ``lengths`` is the original
        sequence length minus one. ``word2id`` / ``id2word`` are the
        vocabulary mappings produced by ``process_dataset``.
    """
    instances, word2id, id2word = process_dataset('./poems.txt')

    def lst_instance():
        # Feed the (token_ids, length) pairs to tf.data one at a time.
        yield from instances

    # A variable-length 1-D id sequence paired with a scalar length.
    element_shapes = (tf.TensorShape([None]), tf.TensorShape([]))

    ds = tf.data.Dataset.from_generator(
        lst_instance,
        (tf.int64, tf.int64),
        element_shapes,
    )
    ds = ds.shuffle(buffer_size=10240)
    # Batches of 100; shorter sequences are padded to the longest in a batch.
    ds = ds.padded_batch(100, padded_shapes=element_shapes)

    def ds_map(x, seqlen):
        # Input drops the last token, target drops the first one.
        return (x[:, :-1], x[:, 1:], seqlen-1)

    ds = ds.map(ds_map)
    return ds, word2id, id2word

RNNcell: (output:output1, next_state:h1) = call(input:x1, state:h0)
- state_size
- output_size
dynamic_rnn: {h0, x1, x2, ……, xn} -> {h1, h2, ……, hn}


# 模型代码， 完成建模代码

In [24]:
class myRNNModel(keras.Model):
    """Character-level language model: embedding -> SimpleRNN -> dense logits.

    Args:
        w2id: Mapping from token to integer index; its size determines the
            vocabulary size of the embedding and of the output layer.
    """

    def __init__(self, w2id):
        super().__init__()
        self.v_sz = len(w2id)  # vocabulary size
        # Maps integer ids to 64-dimensional dense vectors. No
        # `batch_input_shape` is passed: it has no effect inside a
        # subclassed model and newer Keras releases reject the argument.
        self.embed_layer = tf.keras.layers.Embedding(self.v_sz, 64)
        # Recurrent cell with a 128-dimensional hidden state.
        self.rnncell = tf.keras.layers.SimpleRNNCell(128)
        # Unrolls the cell over time and returns the output of every step.
        self.rnn_layer = tf.keras.layers.RNN(self.rnncell, return_sequences=True)
        # Projects each RNN output to vocabulary-sized logits.
        self.dense = tf.keras.layers.Dense(self.v_sz)

    @tf.function
    def call(self, inp_ids):
        """Compute next-token logits for every position of a batch.

        Args:
            inp_ids: Token ids of shape [b_sz, seq_len].

        Returns:
            Logits of shape [b_sz, seq_len, v_sz].
        """
        # inp_emb: [b_sz, seq_len, emb_sz]
        inp_emb = self.embed_layer(inp_ids)
        # rnn_output: [b_sz, seq_len, h_sz]
        rnn_output = self.rnn_layer(inp_emb)
        # logits: [b_sz, seq_len, v_sz]
        logits = self.dense(rnn_output)

        return logits

    @tf.function
    def get_next_token(self, x, state):
        """Run a single decoding step and pick the most likely next token.

        Args:
            x: Current token ids of shape [b_sz,].
            state: Hidden state of the RNN cell, as accepted by the cell.

        Returns:
            A tuple ``(out, state)``: the arg-max token ids of shape [b_sz,]
            and the updated cell state.
        """
        inp_emb = self.embed_layer(x)  # (b_sz,) -> (b_sz, emb_sz)
        # Invoke the cell through __call__ rather than .call so that Keras
        # builds its weights on demand instead of requiring a prior forward
        # pass through the full model.
        h, state = self.rnncell(inp_emb, state)
        logits = self.dense(h)  # (b_sz, v_sz)
        out = tf.argmax(logits, axis=-1)
        return out, state

## 一个计算sequence loss的辅助函数，只需了解用途。

In [25]:
def mkMask(input_tensor, maxLen):
    """Turn a tensor of lengths into a mask with one extra trailing axis.

    Args:
        input_tensor: Tensor of sequence lengths (any shape).
        maxLen: Size of the appended mask axis.

    Returns:
        The result of ``tf.sequence_mask`` applied to every length, reshaped
        to ``shape(input_tensor) + [maxLen]``.
    """
    # sequence_mask is applied to the flattened lengths, then the original
    # leading dimensions are restored.
    flat_lengths = tf.reshape(input_tensor, shape=(-1,))
    mask_2d = tf.sequence_mask(flat_lengths, maxlen=maxLen)
    target_shape = tf.concat(axis=0, values=[tf.shape(input_tensor), [maxLen]])
    return tf.reshape(mask_2d, target_shape)


def reduce_avg(reduce_target, lengths, dim):
    """Length-aware mean of ``reduce_target`` along axis ``dim``.

    Positions at or beyond the given length are masked out, the remaining
    values are summed along ``dim`` and the sum is divided by the length.

    Args:
        reduce_target : shape(d_0, d_1,..,d_dim, .., d_k)
        lengths : shape(d0, .., d_(dim-1))
        dim : which dimension to average, should be a python number

    Returns:
        Tensor shaped like ``reduce_target`` with axis ``dim`` removed.

    Raises:
        ValueError: If ``lengths`` is not of rank ``dim`` or
            ``reduce_target`` has rank lower than ``dim + 1``.
    """
    # Validate the static ranks of both inputs.
    shape_of_lengths = lengths.get_shape()
    shape_of_target = reduce_target.get_shape()
    if len(shape_of_lengths) != dim:
        raise ValueError(('Second input tensor should be rank %d, ' +
                         'while it got rank %d') % (dim, len(shape_of_lengths)))
    if len(shape_of_target) < dim+1 :
        raise ValueError(('First input tensor should be at least rank %d, ' +
                         'while it got rank %d') % (dim+1, len(shape_of_target)))

    # Number of axes that follow `dim` in the target.
    rank_diff = len(shape_of_target) - len(shape_of_lengths) - 1
    # Mask over axis `dim` built from the lengths.
    mxlen = tf.shape(reduce_target)[dim]
    mask = mkMask(lengths, mxlen)
    if rank_diff != 0:
        # Append singleton axes so lengths and mask broadcast over the
        # trailing axes of the target.
        trailing_ones = [1] * rank_diff
        len_shape = tf.concat(axis=0, values=[tf.shape(lengths), trailing_ones])
        mask_shape = tf.concat(axis=0, values=[tf.shape(mask), trailing_ones])
    else:
        len_shape = tf.shape(lengths)
        mask_shape = tf.shape(mask)
    lengths_reshape = tf.reshape(lengths, shape=len_shape)
    mask = tf.reshape(mask, shape=mask_shape)
    # Masked sum along `dim`.
    mask_target = reduce_target * tf.cast(mask, dtype=reduce_target.dtype)
    red_sum = tf.reduce_sum(mask_target, axis=[dim], keepdims=False)
    # Divide in the target's own dtype (not a hard-coded float32) so other
    # float types work too; divide_no_nan yields 0 for zero-length rows.
    divisor = tf.cast(lengths_reshape, dtype=reduce_target.dtype)
    red_avg = tf.math.divide_no_nan(red_sum, divisor)
    return red_avg

# 定义loss函数，定义训练函数

In [26]:
@tf.function
def compute_loss(logits, labels, seqlen):
    """Sequence loss: per-token cross entropy averaged per sequence, then over the batch.

    Args:
        logits: Unnormalised scores for every position.
        labels: Integer target ids for every position.
        seqlen: Valid length of each sequence; positions past it are ignored.

    Returns:
        Scalar mean of the per-sequence average losses.
    """
    token_losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=logits)
    # Average over the time axis, honouring each sequence's real length.
    per_sequence = reduce_avg(token_losses, seqlen, dim=1)
    return tf.reduce_mean(per_sequence)

@tf.function(reduce_retracing=True)
def train_one_step(model, optimizer, x, y, seqlen):
    """Run one optimisation step on a single batch.

    Args:
        model: Model called on ``x`` to produce logits.
        optimizer: Optimizer used to apply the gradients.
        x: Input batch.
        y: Target batch.
        seqlen: Valid sequence lengths for the batch.

    Returns:
        The loss of this batch.
    """
    with tf.GradientTape() as tape:
        # Forward pass and loss are recorded on the tape.
        loss = compute_loss(model(x), y, seqlen)
    # Gradients of the loss with respect to the model parameters.
    grads = tape.gradient(loss, model.trainable_variables)
    # Parameter update.
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return loss

def train(epoch, model, optimizer, ds):
    """Train for one pass over ``ds`` and report progress every 500 steps.

    Args:
        epoch: Epoch number, used only in the progress message.
        model: Model to train.
        optimizer: Optimizer passed on to ``train_one_step``.
        ds: Iterable of ``(x, y, seqlen)`` batches.

    Returns:
        The loss of the last processed batch, or ``0.0`` if ``ds`` is empty.
    """
    loss = 0.0
    for step, batch in enumerate(ds):
        x, y, seqlen = batch
        loss = train_one_step(model, optimizer, x, y, seqlen)

        if step % 500 != 0:
            continue
        print('epoch', epoch, ': loss', loss.numpy())

    return loss

# 训练优化过程

In [27]:
# Adam optimizer with a learning rate of 0.0005.
optimizer = optimizers.Adam(learning_rate=0.0005)
# Batched training data plus the vocabulary mappings.
train_ds, word2id, id2word = poem_dataset()
# The model's vocabulary size is taken from the token -> id mapping.
model = myRNNModel(word2id)

# Ten passes over the training data; `loss` keeps the value returned by the
# most recent pass.
for epoch in range(10):
    loss = train(epoch, model, optimizer, train_ds)

epoch 0 : loss 8.820801
epoch 1 : loss 6.610962
epoch 2 : loss 6.23307
epoch 3 : loss 5.90259
epoch 4 : loss 5.6982703
epoch 5 : loss 5.5709143
epoch 6 : loss 5.4467945
epoch 7 : loss 5.4175086
epoch 8 : loss 5.3035693
epoch 9 : loss 5.2295833


# 生成过程

In [29]:
def gen_sentence():
    """Greedily generate 50 tokens, starting from the begin-of-sequence token.

    Uses the module-level ``model``, ``word2id`` and ``id2word``. The initial
    hidden state is drawn at random, so repeated calls can differ.

    Returns:
        List of the 50 generated tokens (the start token itself is not
        included).
    """
    # SimpleRNNCell carries exactly one hidden-state tensor, so a single
    # random tensor is enough; its width is read from the cell rather than
    # hard-coded.
    state = [tf.random.normal(shape=(1, model.rnncell.units), stddev=0.5)]
    # int64 matches the dtype returned by tf.argmax in get_next_token, so
    # the input signature stays the same across decoding steps.
    cur_token = tf.constant([word2id[start_token]], dtype=tf.int64)
    collect = []
    for _ in range(50):
        cur_token, state = model.get_next_token(cur_token, state)
        collect.append(cur_token.numpy()[0])
    return [id2word[t] for t in collect]
print(''.join(gen_sentence()))

三年不可知，此时无事事。eos生不可见，不见天中去。eos然不可忘，不见无人间。eos然不可见，不见不可知。eos子


In [45]:
def gen_sentence_for_begin_word(begin_word):
    """Greedily generate 50 tokens that continue the given first character.

    Uses the module-level ``model``, ``word2id`` and ``id2word``. The initial
    hidden state is drawn at random, so repeated calls can differ.

    Args:
        begin_word: First token of the output; must be a key of ``word2id``.

    Returns:
        List of 51 tokens: ``begin_word`` followed by the 50 generated ones.

    Raises:
        KeyError: If ``begin_word`` is not in the vocabulary.
    """
    # SimpleRNNCell carries exactly one hidden-state tensor, so a single
    # random tensor is enough; its width is read from the cell rather than
    # hard-coded.
    state = [tf.random.normal(shape=(1, model.rnncell.units), stddev=0.5)]
    # int64 matches the dtype returned by tf.argmax in get_next_token, so
    # the input signature stays the same across decoding steps.
    cur_token = tf.constant([word2id[begin_word]], dtype=tf.int64)
    collect = [cur_token.numpy()[0]]
    for _ in range(50):
        cur_token, state = model.get_next_token(cur_token, state)
        collect.append(cur_token.numpy()[0])
    return [id2word[t] for t in collect]
# One sample per starting character, each followed by an empty line.
for begin_word in '日红山夜湖海月':
    print(''.join(gen_sentence_for_begin_word(begin_word)) + '\n')

日日，一声何处不知。eos子不知何处事，不知何处是何人。eos道不知何处事，不知何处是何人。eos子不知何处事，不

红叶滴红花满花。eos有一时无处处，一年何处不知人。eos来不得无人事，不得人间不可知。eos道不知何处事，不知何

山边雨满江风。eos落花声落，风风落月深。eos来无处处，不见此中时。eos子无人事，何人不可知。eos来无处处，不见

夜暮，一片月中春。eos色不知，此时何。eos子不知，此时何。eos子不知，不得之》）eos，一时不可知。eos中无处事，

湖上，无人在何人。eos来不得无人事，不得人间不可知。eos道不知何处事，不知何处是何人。eos子不知何处事，不知

海，今日无人不可知。eos道不知何处事，不知何处是何人。eos子不知何处事，不知何处是何人。eos子不知何处事，不

月侵苔叶，风雨满花声。eos客无人事，何人不可知。eos来无处处，不见此中时。eos子无人事，何人不可知。eos来无处

