<a href="https://colab.research.google.com/github/BossRobin/DeepLearningAlgorithmsByPytorch/blob/master/NLG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import torch
import os
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_ # 梯度裁剪

class Dictionary(object):
  """Two-way mapping between words and consecutive integer ids.

  Ids are handed out in insertion order starting at 0; ``idx`` always holds
  the id that the next unseen word will receive.
  """

  def __init__(self):
    self.word2idx = {}  # word -> id
    self.idx2word = {}  # id -> word
    self.idx = 0        # next free id

  def add_word(self, word):
    """Register ``word`` under the next free id; known words are ignored."""
    if word in self.word2idx:
      return
    new_id = self.idx
    self.word2idx[word] = new_id
    self.idx2word[new_id] = word
    self.idx = new_id + 1

  def __len__(self):
    """Return the number of distinct words registered so far."""
    return len(self.word2idx)


class Corpus(object):
  """Text corpus that is converted to word ids through its own ``Dictionary``."""

  def __init__(self):
    self.dictionary = Dictionary()  # vocabulary filled in by get_data()

  def get_data(self, path, batch_size=20):
    """Tokenize the text file at ``path`` and lay the ids out in ``batch_size`` rows.

    Every line is split on whitespace and terminated with an ``'<eos>'``
    token. Each token is added to ``self.dictionary`` (in order of first
    appearance) and replaced by its id.

    Args:
      path: path of the text file to read.
      batch_size: number of rows of the returned tensor.

    Returns:
      A ``torch.long`` tensor of shape ``(batch_size, n_tokens // batch_size)``.
      Trailing tokens that do not fill a whole column are dropped. A corpus
      with fewer than ``batch_size`` tokens yields a ``(batch_size, 0)``
      tensor instead of failing inside ``view``.
    """
    # Single pass: grow the vocabulary and record the id stream together,
    # instead of reading the file once to count and once more to fill.
    token_ids = []
    with open(path, 'r') as f:
      for line in f:
        for word in line.split() + ['<eos>']:
          self.dictionary.add_word(word)
          token_ids.append(self.dictionary.word2idx[word])

    ids = torch.tensor(token_ids, dtype=torch.long)

    # The token count may not be divisible by batch_size, so cut off the tail.
    num_batches = ids.size(0) // batch_size
    ids = ids[:num_batches * batch_size]
    # Give the column count explicitly: view(batch_size, -1) cannot infer the
    # size of the second dimension when the tensor has zero elements.
    return ids.view(batch_size, num_batches)

In [0]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
# Hyperparameters for the LSTM language model and its training run.
embed_size = 128 # width of the embedding layer, i.e. the word-embedding dimension
hidden_size = 1024 # size of the LSTM hidden state
num_layers = 1 # number of stacked LSTM layers
num_epochs = 5 # number of passes over the training data
num_samples = 1000 # number of words to generate at the end
batch_size = 20 # size of one batch (number of rows of the id matrix)
seq_length = 30 # length of each sequence within a batch
learning_rate = 0.002 # learning rate passed to the Adam optimizer

In [68]:
# Build the vocabulary and load the training text as a (batch_size, N) id matrix.
corpus = Corpus()
ids = corpus.get_data('data/train.txt', batch_size)
vocab_size = len(corpus.dictionary) # vocabulary size
num_batches = ids.size(1) # number of columns of ids (tokens per row); the training loop divides this by seq_length to get the step count

10000


In [0]:
class RNNLM(nn.Module):
  """LSTM language model: embedding -> LSTM -> linear projection onto the vocabulary."""

  def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
    super(RNNLM, self).__init__()
    # Word ids -> dense vectors.
    self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
    # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
    self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size,
                        num_layers=num_layers, batch_first=True)
    # Hidden state -> one score per vocabulary entry.
    self.linear = nn.Linear(in_features=hidden_size, out_features=vocab_size)

  def forward(self, x, h):
    """Score the next word at every position of ``x``.

    Args:
      x: tensor of word ids, laid out as (batch, seq).
      h: ``(h0, c0)`` initial hidden and cell state for the LSTM.

    Returns:
      ``(scores, (h_n, c_n))`` where ``scores`` has shape
      ``(batch * seq, vocab_size)`` and ``(h_n, c_n)`` is the final LSTM state.
    """
    embedded = self.embed(x)
    seq_out, (h_n, c_n) = self.lstm(embedded, h)
    # Merge the batch and time axes so that every position becomes one row
    # for the linear layer.
    n_rows, n_steps, width = seq_out.size()
    flat = seq_out.reshape(n_rows * n_steps, width)
    scores = self.linear(flat)
    return scores, (h_n, c_n)
    

In [0]:
# Instantiate the language model and move its parameters to the selected device.
model = RNNLM(vocab_size, embed_size, hidden_size, num_layers).to(device)

In [0]:
# Cross-entropy between the model's per-position scores and the next-word ids.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters, using the learning rate configured above.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def detach(states):
  """Return a list holding a detached copy of every tensor in ``states``.

  ``Tensor.detach()`` cuts the tensor off from the autograd graph, so
  backpropagation stops at the returned tensors.
  """
  detached = []
  for tensor in states:
    detached.append(tensor.detach())
  return detached

In [0]:
# Train the language model with truncated backpropagation through time.
for epoch in range(num_epochs):
  # Start every epoch from an all-zero LSTM state.
  states = (torch.zeros(num_layers, batch_size, hidden_size).to(device),
        torch.zeros(num_layers, batch_size, hidden_size).to(device)) # states plays the role of (h0, c0)

  for i in range(0, ids.size(1) - seq_length, seq_length): # slide over ids (size [20, 46479] in the author's run) in windows of seq_length columns, one [batch_size, seq_length] slice per step
    inputs = ids[:, i:i+seq_length].to(device)
    targets = ids[:, (i+1):(i+1)+seq_length].to(device)  # targets are the inputs shifted by one: the next word at every position

    states = detach(states)  # cut the state off from the previous step's graph so gradients do not flow back past this window (this truncates backprop; it does not zero any gradient)
    outputs, states = model(inputs, states)
    loss = criterion(outputs, targets.reshape(-1)) # outputs has one row per position, so flatten targets to match

    optimizer.zero_grad()
    loss.backward()
    clip_grad_norm_(model.parameters(), 0.5)  # rescale gradients so their total norm is at most 0.5; guards against exploding gradients
    optimizer.step()

    step = (i+1) // seq_length # i is a multiple of seq_length, so this is the index of the current window
    if step % 100 == 0:
            # Perplexity is reported as exp(loss); the step total is num_batches // seq_length.
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {:.4f}, Perplexity: {:5.2f}'
                   .format(epoch+1, num_epochs, step, num_batches // seq_length, loss.item(), np.exp(loss.item())))


In [0]:
# Generate num_samples words from the trained model and write them to
# sample.txt. Gradients are not needed while sampling.
with torch.no_grad():
  with open('sample.txt', 'w') as f:
    # Zero initial (h0, c0) for a batch holding a single sequence.
    state = (torch.zeros(num_layers, 1, hidden_size).to(device),
          torch.zeros(num_layers, 1, hidden_size).to(device))
    # Pick a random word as the first input: equal weights make the draw uniform.
    prob = torch.ones(vocab_size)
    # NOTE(review): the name `input` shadows the builtin; it is kept unchanged
    # in case later cells refer to it.
    input = torch.multinomial(prob, num_samples=1).unsqueeze(1).to(device)
    # torch.multinomial(input, num_samples, replacement=False, out=None) -> LongTensor
    # Draws num_samples indices from every row of `input`; the result holds
    # the column indices that were drawn.
    # `input` acts as a weight tensor: each element is the weight of its index
    # within the row. A zero-weight element is not drawn until all elements
    # with non-zero weight have been taken.
    # num_samples is the number of draws per row; without replacement it must
    # not exceed the number of elements in a row, otherwise an error is raised.
    # `replacement` selects sampling with (True) or without (False) replacement.

    for i in range(num_samples):
      output, state = model(input, state)
      # Turn the scores into probabilities. softmax describes the same
      # distribution as normalising exp(output), but it cannot overflow to inf
      # (or underflow to all zeros), which would make multinomial raise.
      prob = torch.softmax(output, dim=1)
      word_id = torch.multinomial(prob, num_samples=1).item()

      input.fill_(word_id) # feed the sampled word back in as the next input
      word = corpus.dictionary.idx2word[word_id]
      word = '\n' if word == '<eos>' else word + ' ' # end-of-sentence becomes a line break
      f.write(word)

      if (i+1) % 100 == 0:
        print('Sample [{}/{}] words and save to {}'.format(i+1, num_samples, 'sample.txt'))