# 深度循环神经网络FNN 歌词文本处理
在深度学习应用里，我们通常会用到含有多个隐藏层的循环神经网络，也称作深度循环神经网络。图6.11演示了一个有$L$个隐藏层的深度循环神经网络，每个隐藏状态不断传递至当前层的下一时间步和当前时间步的下一层。

具体来说，在时间步$t$里，设小批量输入$\boldsymbol{X}_t \in \mathbb{R}^{n \times d}$（样本数为$n$，输入个数为$d$），第$l$隐藏层（$l=1,\ldots,L$）的隐藏状态为$\boldsymbol{H}_t^{(l)} \in \mathbb{R}^{n \times h}$（隐藏单元个数为$h$），输出层变量为$\boldsymbol{O}_t \in \mathbb{R}^{n \times q}$（输出个数为$q$），且隐藏层的激活函数为$\phi$。第1隐藏层的隐藏状态和之前的计算一样：

$$\boldsymbol{H}t^{(1)} = \phi(\boldsymbol{X}t \boldsymbol{W}{xh}^{(1)} + \boldsymbol{H}{t-1}^{(1)} \boldsymbol{W}_{hh}^{(1)} + \boldsymbol{b}_h^{(1)}),$$

其中权重$\boldsymbol{W}{xh}^{(1)} \in \mathbb{R}^{d \times h}$、$\boldsymbol{W}{hh}^{(1)} \in \mathbb{R}^{h \times h}$和偏差 $\boldsymbol{b}_h^{(1)} \in \mathbb{R}^{1 \times h}$分别为第1隐藏层的模型参数。

当$1 < l \leq L$时，第$l$隐藏层的隐藏状态的表达式为

$$\boldsymbol{H}t^{(l)} = \phi(\boldsymbol{H}t^{(l-1)} \boldsymbol{W}{xh}^{(l)} + \boldsymbol{H}{t-1}^{(l)} \boldsymbol{W}_{hh}^{(l)} + \boldsymbol{b}_h^{(l)}),$$

其中权重$\boldsymbol{W}{xh}^{(l)} \in \mathbb{R}^{h \times h}$、$\boldsymbol{W}{hh}^{(l)} \in \mathbb{R}^{h \times h}$和偏差 $\boldsymbol{b}_h^{(l)} \in \mathbb{R}^{1 \times h}$分别为第$l$隐藏层的模型参数。

最终，输出层的输出只需基于第$L$隐藏层的隐藏状态：

$$\boldsymbol{O}_t = \boldsymbol{H}t^{(L)} \boldsymbol{W}{hq} + \boldsymbol{b}_q,$$

其中权重$\boldsymbol{W}_{hq} \in \mathbb{R}^{h \times q}$和偏差$\boldsymbol{b}_q \in \mathbb{R}^{1 \times q}$为输出层的模型参数。

同多层感知机一样，隐藏层个数$L$和隐藏单元个数$h$都是超参数。此外，如果将隐藏状态的计算换成门控循环单元或者长短期记忆的计算，我们可以得到深度门控循环神经网络。
## 歌词创作
收集了周杰伦从第一张专辑《Jay》到第十张专辑《跨时代》中的歌词，并应用循环神经网络来训练一个语言模型。当模型训练好后，然后就可以用这个模型来创作歌词。

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, losses
import numpy as np 
import pandas as pd 
import plotly as py 
import plotly.graph_objects as go 
import math 
import time
import sys 
import zipfile
print('Tensorflow version:', tf.__version__)
print('Numpy version:', np.__version__)
print('Pandas version:', pd.__version__)
print('Plotly version:', py.__version__)

for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

In [28]:
class DRNN():
    def __init__(self):
        pass

    def load_data(self, path ='./Data/jaychou_lyrics.txt.zip'):
        with zipfile.ZipFile(path) as zin:
            with zin.open('jaychou_lyrics.txt') as f:
                corpus_chars = f.read().decode('utf-8')
        corpus_chars[:40]
        # 这个数据集有6万多个字符。为了打印方便，我们把换行符替换成空格，然后仅使用前1万个字符来训练模型
        corpus_chars = corpus_chars.replace('\n', ' ').replace('\r', ' ')
        corpus_chars = corpus_chars[0:10000]
        # 建立字符索引
        idx_to_char = list(set(corpus_chars))
        char_to_idx = dict([(char, i) for i, char in enumerate(idx_to_char)])
        vocab_size = len(char_to_idx)
        #vocab_size # 1027
        
        return corpus_chars, char_to_idx, idx_to_char, vocab_size
    
    # 随机采样
    def data_iter_random(self, corpus_indices, batch_size, num_steps, ctx=None):
        # 减1是因为输出的索引是相应输入的索引加1
        num_examples = (len(corpus_indices) - 1) // num_steps
        epoch_size = num_examples // batch_size
        example_indices = list(range(num_examples))
        random.shuffle(example_indices)

        # 返回从pos开始的长为num_steps的序列
        def _data(self, pos):
            return corpus_indices[pos: pos + num_steps]

        for i in range(epoch_size):
            # 每次读取batch_size个随机样本
            i = i * batch_size
            batch_indices = example_indices[i: i + batch_size]
            X = [_data(j * num_steps) for j in batch_indices]
            Y = [_data(j * num_steps + 1) for j in batch_indices]
            yield np.array(X, ctx), np.array(Y, ctx)
            
    # 相邻采样
    def data_iter_consecutive(self, corpus_indices, batch_size, num_steps, ctx=None):
        corpus_indices = np.array(corpus_indices)
        data_len = len(corpus_indices)
        batch_len = data_len // batch_size
        indices = corpus_indices[0: batch_size*batch_len].reshape((
            batch_size, batch_len))
        epoch_size = (batch_len - 1) // num_steps
        for i in range(epoch_size):
            i = i * num_steps
            X = indices[:, i: i + num_steps]
            Y = indices[:, i + 1: i + num_steps + 1]
            yield X, Y

    def predict_rnn_keras(self, prefix, num_chars, model, vocab_size, idx_to_char,
                          char_to_idx):
        # 使用model的成员函数来初始化隐藏状态
        state = model.get_initial_state(batch_size=1,dtype=tf.float32)
        output = [char_to_idx[prefix[0]]]
        #print("output:",output)
        for t in range(num_chars + len(prefix) - 1):
            X = np.array([output[-1]]).reshape((1, 1))
            #print("X",X)
            Y, state = model(X, state)  # 前向计算不需要传入模型参数
            #print("Y",Y)
            #print("state:",state)
            if t < len(prefix) - 1:
                output.append(char_to_idx[prefix[t + 1]])
                #print(char_to_idx[prefix[t + 1]])
            else:
                output.append(int(np.array(tf.argmax(Y,axis=-1))))
                #print(int(np.array(tf.argmax(Y[0],axis=-1))))
        return ''.join([idx_to_char[i] for i in output])
    

    # 计算裁剪后的梯度
    def grad_clipping(self,grads,theta):
        norm = np.array([0])
        for i in range(len(grads)):
            norm+=tf.math.reduce_sum(grads[i] ** 2)
        #print("norm",norm)
        norm = np.sqrt(norm).item()
        new_gradient=[]
        if norm > theta:
            for grad in grads:
                new_gradient.append(grad * theta / norm)
        else:
            for grad in grads:
                new_gradient.append(grad)  
        #print("new_gradient",new_gradient)
        return new_gradient

    def train_and_predict_rnn_keras(self, model, num_hiddens, vocab_size, 
                                    corpus_indices, idx_to_char, char_to_idx,
                                    num_epochs, num_steps, lr, clipping_theta,
                                    batch_size, pred_period, pred_len, prefixes):
        loss = tf.keras.losses.SparseCategoricalCrossentropy()
        optimizer=tf.keras.optimizers.SGD(learning_rate=lr)

        for epoch in range(num_epochs):
            l_sum, n, start = 0.0, 0, time.time()
            data_iter = d2l.data_iter_consecutive(
                corpus_indices, batch_size, num_steps)
            state = model.get_initial_state(batch_size=batch_size,dtype=tf.float32)
            for X, Y in data_iter:
                with tf.GradientTape(persistent=True) as tape:
                    (outputs, state) = model(X, state)
                    y = Y.T.reshape((-1,))
                    l = loss(y,outputs)

                grads = tape.gradient(l, model.variables)
                # 梯度裁剪
                grads=grad_clipping(grads, clipping_theta)
                optimizer.apply_gradients(zip(grads, model.variables))  # 因为已经误差取过均值，梯度不用再做平均
                l_sum += np.array(l).item() * len(y)
                n += len(y)

            if (epoch + 1) % pred_period == 0:
                print('epoch %d, perplexity %f, time %.2f sec' % (
                    epoch + 1, math.exp(l_sum / n), time.time() - start))
                for prefix in prefixes:
                    print(' -', predict_rnn_keras(
                        prefix, pred_len, model, vocab_size,  idx_to_char,
                        char_to_idx))

class RNNModel(layers.Layer):
    def __init__(self, rnn_layer, vocab_size, **kwargs):
        super(RNNModel, self).__init__(**kwargs)
        self.rnn = rnn_layer
        self.vocab_size = vocab_size
        self.dense = layers.Dense(vocab_size)

    def call(self, inputs, state):
        # 将输入转置成(num_steps, batch_size)后获取one-hot向量表示
        X = tf.one_hot(tf.transpose(inputs), self.vocab_size)
        Y,state = self.rnn(X, state)
        # 全连接层会首先将Y的形状变成(num_steps * batch_size, num_hiddens)，它的输出
        # 形状为(num_steps * batch_size, vocab_size)
        output = self.dense(tf.reshape(Y,(-1, Y.shape[-1])))
        return output, state

    def get_initial_state(self, *args, **kwargs):
        return self.rnn.cell.get_initial_state(*args, **kwargs)



In [30]:
if __name__ == '__main__':
    batch_size = 2
    num_hiddens = 256
    cell = layers.SimpleRNNCell(num_hiddens,kernel_initializer='glorot_uniform')
    rnn_layer = layers.RNN(cell,time_major=True,return_sequences=True,return_state=True)
    drnn = DRNN()
    (corpus_indices, char_to_idx, idx_to_char, vocab_size) = drnn.load_data()

    rnnmodel = DRNN(rnn_layer, vocab_size)
    rnnmodel = RNNModel(rnn_layer, vocab_size)
    rnnmodel.num_epochs, batch_size, lr, clipping_theta = 250, 32, 1e2, 1e-2
    pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
    rennmodel.train_and_predict_rnn_keras(model, num_hiddens, vocab_size, 
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes)

TypeError: __init__() takes 1 positional argument but 3 were given