In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import time
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import *
from torch.utils.data import DataLoader

# Notebook-wide configuration: mini-batch size, number of training epochs,
# and the device used for training/inference (CUDA when available, otherwise CPU).
BATCH_SIZE, EPOCHS = 16, 4
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Load the preprocessed data archive.
# The `ix2word` / `word2ix` entries are unwrapped with `.item()` and later used as
# dicts, so they are presumably stored as pickled object arrays. NumPy >= 1.16.3
# defaults to allow_pickle=False and raises ValueError on such entries, hence the
# explicit flag.
# SECURITY: unpickling can execute arbitrary code -- only load a trusted tang.npz.
# The `with` block closes the underlying archive once the entries have been read.
with np.load("./tang.npz", allow_pickle=True) as datas:
    data = datas['data']
    ix2word = datas['ix2word'].item()
    word2ix = datas['word2ix'].item()

# Convert to a torch.Tensor and wrap it in a shuffling mini-batch loader.
data = torch.from_numpy(data)
train_loader = DataLoader(data, batch_size = BATCH_SIZE, shuffle = True, num_workers = 2)

In [3]:
class PoetryModel(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> linear vocabulary logits.

    Args:
        vocab_size: number of embedding rows and width of the output logits.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden and cell states.
        num_layers: number of stacked LSTM layers. Defaults to 2, the value that
            used to be hard-coded, so existing callers and checkpoints still work.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=2):
        super(PoetryModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, num_layers=num_layers)
        self.linear = nn.Linear(self.hidden_dim, vocab_size)

    def forward(self, input, hidden=None):
        """Compute next-token logits for every position of `input`.

        Args:
            input: 2-D tensor of token indices laid out as (seq_len, batch_size).
            hidden: optional `(h_0, c_0)` LSTM state; zeros are used when None.

        Returns:
            `(output, hidden)` where `output` has shape
            (seq_len * batch_size, vocab_size) and `hidden` is the final
            `(h_n, c_n)` state returned by the LSTM.
        """
        seq_len, batch_size = input.size()
        embeds = self.embedding(input)

        if hidden is None:
            # Allocate the zero state from the embeddings so it follows their
            # device and dtype, and size it from the configured layer count
            # instead of a literal 2.
            state_shape = (self.num_layers, batch_size, self.hidden_dim)
            h_0 = embeds.new_zeros(state_shape)
            c_0 = embeds.new_zeros(state_shape)
        else:
            h_0, c_0 = hidden

        output, hidden = self.lstm(embeds, (h_0, c_0))
        # Flatten (seq_len, batch) into one axis so the logits line up with a
        # flattened target vector.
        output = self.linear(output.reshape(seq_len * batch_size, -1))
        return output, hidden

In [4]:
# Configure the model; optionally resume from a previous training run.
model = PoetryModel(len(word2ix),embedding_dim = 128,hidden_dim = 256)

model_path = None          # path to a saved state_dict to resume from; None trains from scratch
if model_path:
    # map_location remaps the checkpoint tensors onto DEVICE, so a checkpoint
    # saved on a GPU can still be loaded on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
model.to(DEVICE)
    
optimizer = torch.optim.Adam(model.parameters(), lr = 5e-3)
# Alternative optimizer:
# optimizer = torch.optim.SGD(model.parameters(), lr=5e-3, momentum=0.9, weight_decay=5e-4)
criterion = nn.CrossEntropyLoss()
# StepLR with its default gamma multiplies the learning rate by 0.1 every
# `step_size` scheduler steps.
# NOTE(review): train() steps the scheduler once per epoch, so with EPOCHS = 4 the
# first decay only lands after the final epoch -- confirm this is intended.
scheduler = StepLR(optimizer, step_size=4)

In [5]:
def train(model, dataloader, ix2word, word2ix, device, optimizer, scheduler, epoch,
          criterion=None):
    """Run one epoch of next-token training and return the mean per-sample loss.

    Each batch from `dataloader` is transposed to (seq_len, batch) layout; the
    model is fed every token but the last and scored against the same sequence
    shifted by one position.

    Args:
        model: module called as `model(input)` that returns `(logits, state)`.
        dataloader: iterable of 2-D index tensors exposing `.dataset` and `len()`.
        ix2word, word2ix: accepted for signature compatibility; not used here.
        device: device each batch is moved to.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once at the end of the epoch.
        epoch: epoch number, used only in the progress messages.
        criterion: loss callable `(logits, flat_targets) -> scalar tensor`. When
            None, the module-level `criterion` is used if one is defined
            (the previous behavior), otherwise `nn.CrossEntropyLoss()`.

    Returns:
        The batch losses averaged over samples, each batch weighted by its size.
    """
    if criterion is None:
        # Backward compatibility: earlier versions read the notebook-level global.
        criterion = globals().get("criterion")
        if criterion is None:
            criterion = nn.CrossEntropyLoss()

    model.train()
    # Use the loader that was passed in, not the notebook-level `train_loader`.
    num_samples = len(dataloader.dataset)
    weighted_loss = 0.0
    samples_seen = 0

    for batch_idx, batch in enumerate(dataloader):
        # (batch, seq_len) -> (seq_len, batch), the layout the model expects.
        seqs = batch.long().transpose(1, 0).contiguous().to(device)
        batch_size = seqs.size(1)
        inputs, target = seqs[:-1, :], seqs[1:, :]

        optimizer.zero_grad()
        output, _ = model(inputs)
        loss = criterion(output, target.reshape(-1))
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        # Weight by the real batch size so a short final batch is not
        # over-counted, as it was when multiplying by the global BATCH_SIZE.
        weighted_loss += batch_loss * batch_size
        samples_seen += batch_size

        if (batch_idx+1) % 200 == 0:
            # Report batches actually completed (previously one batch behind).
            print('train epoch: {} [{}/{} ({:.0f}%)]\tloss: {:.6f}'.format(
                epoch, samples_seen, num_samples,
                100. * (batch_idx + 1) / len(dataloader), batch_loss))

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    train_loss = weighted_loss / max(samples_seen, 1)
    print('\ntrain epoch: {}\t average loss: {:.6f}\n'.format(epoch,train_loss))
    scheduler.step()

    return train_loss

In [6]:
# Train for EPOCHS epochs, recording the average loss returned for each one.
train_losses = []

for epoch in range(1, EPOCHS + 1):
    tr_loss = train(
        model=model,
        dataloader=train_loader,
        ix2word=ix2word,
        word2ix=word2ix,
        device=DEVICE,
        optimizer=optimizer,
        scheduler=scheduler,
        epoch=epoch,
    )
    train_losses += [tr_loss]

# Save the trained weights under a timestamped file name.
filename = f"model{time.time()}.pth"
torch.save(model.state_dict(), filename)


train epoch: 1	 average loss: 2.453792


train epoch: 2	 average loss: 2.180018


train epoch: 3	 average loss: 2.094503


train epoch: 4	 average loss: 2.040116



In [15]:
# Reload the model from the checkpoint written by the training cell.
model_path = filename      # checkpoint path
model = PoetryModel(len(word2ix),embedding_dim = 128,hidden_dim = 256)
# map_location remaps the checkpoint tensors onto DEVICE, so a checkpoint saved
# on a GPU can still be loaded on a CPU-only machine.
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
model.to(DEVICE)

PoetryModel(
  (embedding): Embedding(8293, 128)
  (lstm): LSTM(128, 256, num_layers=2)
  (linear): Linear(in_features=256, out_features=8293, bias=True)
)

In [16]:
def generate(model, start_words, ix2word, word2ix, max_gen_len, prefix_words=None):
    """Greedily continue a poem that begins with `start_words`.

    The model is first fed `<START>`, then (optionally) every token of
    `prefix_words` to warm up the hidden state, then the tokens of
    `start_words`. After that the highest-scoring token is picked at each step
    and fed back in.

    Args:
        model: module called as `model(input, hidden)` with a (1, 1) index tensor;
            returns `(logits, hidden)`.
        start_words: iterable of tokens that open the poem; each must be a key
            of `word2ix`.
        ix2word: mapping from vocabulary index to token.
        word2ix: mapping from token to vocabulary index; must contain '<START>'.
        max_gen_len: number of decoding steps, including the steps that consume
            `start_words`.
        prefix_words: optional iterable of tokens fed to the model before
            decoding; they are not included in the result.

    Returns:
        List of tokens: `start_words` followed by the generated tokens.
        Decoding stops early when '<EOP>' is produced, and that token is dropped.

    Raises:
        KeyError: if '<START>' or a start/prefix token is missing from `word2ix`.
    """
    results = list(start_words)
    start_word_len = len(start_words)

    # Build inputs on the model's own device rather than a notebook-level global,
    # so the function also works for a model that lives elsewhere.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    def as_input(index):
        """Wrap one vocabulary index as the (1, 1) long tensor the model expects."""
        return torch.tensor([[index]], dtype=torch.long, device=device)

    # The first input token is always <START>.
    input = as_input(word2ix['<START>'])
    hidden = None

    # Inference only: without no_grad() autograd would keep a graph that grows
    # across every decoding step.
    with torch.no_grad():
        if prefix_words:
            for word in prefix_words:
                _, hidden = model(input, hidden)
                input = as_input(word2ix[word])

        for i in range(max_gen_len):
            output, hidden = model(input, hidden)
            if i < start_word_len:
                # Still consuming the given opening: ignore the prediction.
                w = results[i]
                input = as_input(word2ix[w])
            else:
                # Greedy choice: take the single highest-scoring token.
                top_index = output[0].topk(1)[1][0].item()
                w = ix2word[top_index]
                results.append(w)
                input = as_input(top_index)
            # End-of-poem marker: drop it and stop.
            if w == '<EOP>':
                del results[-1]
                break

    return results

In [19]:
start_words = '白日依山尽'  # opening line of the poem
max_gen_len = 128         # maximum number of decoding steps

prefix_words = None
results = generate(model, start_words, ix2word, word2ix, max_gen_len, prefix_words)

# Join the tokens into one string, breaking the line after every full stop.
poetry = ''.join(token + '\n' if token == '。' else token for token in results)

print(poetry)

白日依山尽，江南见旧游。
江湖春草绿，山色白云生。
野渡青山远，江深月色微。
云山寒水阔，山色白云生。
野渡青山远，山深竹树新。
野花开白发，山鸟有余情。
白发今如此，青山不可寻。
一声闻不得，万里独悠悠。
远客何时见，孤舟望远公。
高楼望乡路，回首望乡情。

