# 6.5循环网络的简洁实现

In [1]:
import torch
import torch.nn as nn
import time
import math
import sys
sys.path.append(".data/dive")
import d2lzh_pytorch as d2l
# Load the lyrics data set through the project helper. Judging by the names it
# is unpacked into, this yields the corpus as indices, the char<->index lookup
# tables and the vocabulary size -- TODO confirm against d2lzh_pytorch.
(corpus_indices, char_to_idx, idx_to_char, vocab_size) = d2l.load_data_jay_lyrics()
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 6.5.1定义模型

In [2]:
num_hiddens = 256
rnn_layer = nn.RNN(input_size=vocab_size, hidden_size=num_hiddens)
# nn.RNN constructor arguments:
# input_size: number of features of the input X
# hidden_size: number of features of the hidden state h
# nonlinearity: 'tanh' or 'relu'
# batch_first: True: input is (batch_size, num_steps, input_size) and the
#              output is batch-first too, with the hidden features last
# False: (num_steps, batch_size, input_size)

# The forward pass does NOT include any output-layer computation!
# forward arguments:
# input: (num_steps, batch_size, input_size)
# h_0: (num_layers * num_directions, batch_size, hidden_size)

# forward return values:
# output: (num_steps, batch_size, num_directions * hidden_size)
# h_n: (num_layers * num_directions, batch_size, hidden_size)
# output: the hidden states computed by the hidden layer at every time step;
#         they serve as the input of a subsequent output layer
# h_n: the hidden state of the hidden layer at the last time step

In [3]:
# Shape check: push one random batch through the recurrent layer.
num_steps, batch_size = 35, 2
X = torch.rand(num_steps, batch_size, vocab_size)
state = None  # no initial hidden state is supplied
Y, state_new = rnn_layer(X, state) 
# Y.shape: (num_steps, batch_size, hidden_size); state_new: (1 [fixed: number of layers], batch_size, hidden_size)
print(Y.shape, state_new.shape)

torch.Size([35, 2, 256]) torch.Size([1, 2, 256])


In [4]:
class RNNModel(nn.Module):
    """Character-level language model built around an existing recurrent layer.

    Token indices are one-hot encoded, fed through ``rnn_layer`` and the
    resulting hidden states are projected onto the vocabulary by a fully
    connected output layer (one score per vocabulary entry).
    """

    def __init__(self, rnn_layer, vocab_size):
        """Wrap a recurrent layer with a linear output layer.

        Args:
            rnn_layer: An already constructed recurrent layer instance
                (e.g. ``nn.RNN``) exposing ``hidden_size`` and
                ``bidirectional``; its input size must equal ``vocab_size``.
            vocab_size: Number of distinct tokens; used both as the one-hot
                width and as the size of the output layer.
        """
        super().__init__()
        self.rnn = rnn_layer
        # A bidirectional layer concatenates both directions, which doubles
        # the width of the features handed to the output layer.
        self.hidden_size = rnn_layer.hidden_size * (2 if rnn_layer.bidirectional else 1)
        self.vocab_size = vocab_size
        self.dense = nn.Linear(self.hidden_size, vocab_size)  # output layer

    def forward(self, inputs, state):
        """Compute the per-step vocabulary scores.

        Args:
            inputs: Token indices of shape (batch_size, num_steps); any
                integer-valued dtype is accepted.
            state: Initial hidden state for the recurrent layer, or ``None``.

        Returns:
            A pair ``(output, state)`` where ``output`` has shape
            (num_steps * batch_size, vocab_size), rows ordered time step
            first, and ``state`` is the recurrent layer's final state.
        """
        # Use the size stored on the module rather than a global variable so
        # the model keeps working outside the notebook that created it.
        # (batch_size, num_steps) -> (num_steps, batch_size, vocab_size)
        X = nn.functional.one_hot(inputs.long().t(), self.vocab_size).float()
        hiddens, state = self.rnn(X, state)
        # Merge the time and batch axes: (num_steps * batch_size, hidden_size)
        hiddens = hiddens.reshape(-1, hiddens.shape[-1])
        output = self.dense(hiddens)  # (num_steps * batch_size, vocab_size)
        return output, state

## 6.5.2训练模型

In [5]:
def predict_rnn_pytorch(prefix, num_chars, model, vocab_size, device, idx_to_char,
                      char_to_idx):
    """Generate text by greedy decoding, starting from ``prefix``.

    The prefix is fed through the model one character at a time to warm up
    the hidden state; afterwards the highest-scoring character of every step
    is appended and fed back as the next input.

    Args:
        prefix: Non-empty seed string; every character must be a key of
            ``char_to_idx``.
        num_chars: Number of characters to generate after the prefix.
        model: Callable ``model(X, state) -> (Y, state)`` where ``X`` is a
            (1, 1) index tensor and ``Y`` holds one row of scores.
        vocab_size: Unused; kept for interface compatibility.
        device: Device on which the input tensors are created.
        idx_to_char: Index -> character lookup.
        char_to_idx: Character -> index lookup.

    Returns:
        The prefix followed by ``num_chars`` generated characters.
    """
    state = None
    output = [char_to_idx[prefix[0]]]  # prefix indices followed by predictions
    # Inference only: without no_grad the hidden state would carry an
    # autograd graph that grows with every generated character.
    with torch.no_grad():
        for t in range(num_chars + len(prefix) - 1):
            X = torch.tensor([output[-1]], device=device).view(1, 1)
            Y, state = model(X, state)
            if t < len(prefix) - 1:
                # Still consuming the prefix: feed the known next character.
                output.append(char_to_idx[prefix[t + 1]])
            else:
                output.append(Y.argmax(dim=1).item())
    return ''.join(idx_to_char[i] for i in output)

In [6]:
# Build the model on the selected device and sample 10 characters after the
# prefix; no training has been run at this point.
model = RNNModel(rnn_layer, vocab_size).to(device)
predict_rnn_pytorch('分开', 10, model, vocab_size, device, idx_to_char, char_to_idx)

'分开田其亮卷漫卷漫卷漫卷'

In [7]:
# Training loop that reads the data with consecutive (adjacent) sampling
def train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                                corpus_indices, idx_to_char, char_to_idx,
                                num_epochs, num_steps, lr, clipping_theta,
                                batch_size, pred_period, pred_len, prefixes):
    """Train ``model`` with Adam and periodically print generated samples.

    Args:
        model: Module called as ``model(X, state) -> (output, state)``.
        num_hiddens: Unused; kept for interface compatibility.
        vocab_size: Forwarded to ``predict_rnn_pytorch``.
        device: Device the model and the mini-batches live on.
        corpus_indices: Corpus handed to ``d2l.data_iter_consecutive``.
        idx_to_char: Index -> character lookup used for sampling.
        char_to_idx: Character -> index lookup used for sampling.
        num_epochs: Number of passes over the corpus.
        num_steps: Number of time steps per mini-batch.
        lr: Learning rate of the Adam optimizer.
        clipping_theta: Maximum allowed global gradient norm.
        batch_size: Number of sequences per mini-batch.
        pred_period: Print perplexity and samples every this many epochs.
        pred_len: Number of characters generated per sample.
        prefixes: Seed strings to generate samples from.
    """
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    for epoch in range(num_epochs):
        l_sum, n, start = 0.0, 0, time.time()
        data_iter = d2l.data_iter_consecutive(corpus_indices, batch_size, num_steps, device)
        state = None
        for X, Y in data_iter:
            if state is not None:
                # Detach the hidden state from the graph so that gradients
                # only depend on the mini-batch of the current iteration
                # (keeps the cost of backpropagation bounded).
                if isinstance(state, tuple):  # LSTM state is a pair (h, c)
                    state[0].detach_()
                    state[1].detach_()
                else:
                    state.detach_()
            output, state = model(X, state)
            # output: (num_steps * batch_size, vocab_size), time step first.
            # Transpose Y before flattening so the labels follow the same
            # order (Y is indexed as (batch_size, num_steps) here).
            y = torch.flatten(Y.T)
            l = loss(output, y.long())

            optimizer.zero_grad()
            l.backward()
            # Rescale the gradients so their global L2 norm is at most
            # clipping_theta. The previous bare grad_clipping(...) call
            # referred to a name that this file never defines or imports.
            nn.utils.clip_grad_norm_(model.parameters(), clipping_theta)
            optimizer.step()
            # The loss is a mean over the batch; weight it by the number of
            # labels to accumulate a per-token average for the epoch.
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]

        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', predict_rnn_pytorch(
                    prefix, pred_len, model, vocab_size, device, idx_to_char,
                    char_to_idx))

In [None]:
# Hyperparameters for the training run; clipping_theta is the gradient
# clipping threshold passed to the training function.
num_epochs, batch_size, lr, clipping_theta = 250, 32, 1e-3, 1e-2
# Report every 50 epochs, generating 50 characters for each of the prefixes.
pred_period, pred_len, prefixes = 50, 50, ['分开', '不分开']
train_and_predict_rnn_pytorch(model, num_hiddens, vocab_size, device,
                            corpus_indices, idx_to_char, char_to_idx,
                            num_epochs, num_steps, lr, clipping_theta,
                            batch_size, pred_period, pred_len, prefixes)