In [3]:
# 数据分析/处理
import numpy as np
import pandas as pd
import re

# 搭建神经网络
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.init as init
from torch import optim
from torch.utils.data import Dataset,DataLoader

# 数据可视化
import matplotlib.pyplot as plt
import warnings

# word2vec
from gensim.models import Word2Vec


# Suppress every Python warning for the rest of the session: keeps cell output clean,
# but also hides deprecation and numerical warnings.
warnings.filterwarnings('ignore')
# Draw matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

In [4]:
# Detect whether a CUDA device can be used and select the compute device.
cuda_available = torch.cuda.is_available()
if cuda_available:
    device = torch.device("cuda")
    print("CUDA Device Name:", torch.cuda.get_device_name(0))
    print("CUDA Compute Capability:", torch.cuda.get_device_capability(0))
else:
    device = torch.device("cpu")

# Seed NumPy and PyTorch with 42 ("the answer to the universe") for reproducible runs.
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x22e439f5090>

### 读入数据

由于是中文诗词，且需要进行字符级建模，所以需要搭建字典并切分句子。


In [45]:
# Read the whole corpus, clean it, and re-split it so that every "。" ends a line.
# Context managers guarantee the file handles are closed even if read/write raises.
# NOTE(review): no encoding is passed, so the platform default is used — confirm it
# matches the file (pandas reads the same file as UTF-8 in a later cell).
with open("poetryFromTang.txt", 'r') as poe:
    poem = poe.read()

# Remove every occurrence of the file's first character, then break after each "。".
# NOTE(review): presumably the first character is a BOM / stray marker — confirm;
# if it is a real character it is stripped from the entire text.
# The guard avoids an IndexError on an empty file.
tmp = poem.replace(poem[0], "") if poem else poem
tmp = tmp.replace("。", "。\n")

# Write the result to a new file so it can be inspected for garbled characters.
with open("NewPoetryFromTang.txt", 'w') as newFile:
    newFile.write(tmp)

In [5]:
# Load the raw corpus into a DataFrame: tab-separated, no header row.
corprus = pd.read_csv("poetryFromTang.txt", sep='\t', header=None)
# Preview the first rows.
corprus.head()

Unnamed: 0,0
0,巴山上峡重复重，阳台碧峭十二峰。荆王猎时逢暮雨，
1,夜卧高丘梦神女。轻红流烟湿艳姿，行云飞去明星稀。
2,目极魂断望不见，猿啼三声泪沾衣。
3,见尽数万里，不闻三声猿。但飞萧萧雨，中有亭亭魂。
4,千载楚襄恨，遗文宋玉言。至今青冥里，云结深闺门。


In [16]:
# Length (in characters) of the longest line in the corpus.
# Series.max() on strings returns the lexicographically greatest line, not the
# longest one, so measure every line first and then take the maximum length.
max_len = int(corprus[0].str.len().max())


def sen2list(sentence, max_len=max_len):
    """Split a sentence into a list of its individual characters.

    Args:
        sentence: Text to split (any iterable of characters).
        max_len: Accepted for interface compatibility; currently unused —
            no truncation or padding is applied.

    Returns:
        list: One element per character of ``sentence``, in order.
    """
    return list(sentence)


s = sen2list("君问归期未有期，巴山夜雨涨秋池。")



['君',
 '问',
 '归',
 '期',
 '未',
 '有',
 '期',
 '，',
 '巴',
 '山',
 '夜',
 '雨',
 '涨',
 '秋',
 '池',
 '。']

### 生成式语言模型建模

使用LSTM和GRU搭建语言模型。使用预测下一个token的方式生成，这种自回归生成方法是目前最主流的生成方法。

具体可以参考 Andrej Karpathy（AK）的[字符级语言模型教程](https://www.youtube.com/watch?v=PaCmpygFfXo&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=2&ab_channel=AndrejKarpathy)。

In [3]:
# 定义模型
class GRU(nn.Module):
    """GRU encoder followed by a classifier applied to the last time step.

    Args:
        d_model: Size of each input feature vector.
        hidden_size: Size of the GRU hidden state (per direction).
        num_layer: Number of stacked GRU layers.
        output_size: Number of output classes.
        dropout: Dropout applied between stacked GRU layers.
        bid: If True, the GRU is bidirectional.
    """

    def __init__(self,d_model,hidden_size,num_layer,output_size,dropout=0.1,bid=False) -> None:
        super().__init__()
        self.d_model=d_model
        self.hidden_size=hidden_size
        # forward() reads ``num_layer``; ``layers`` is kept for existing callers.
        self.num_layer=num_layer
        self.layers=num_layer
        self.bid=bid
        self.num_directions=2 if bid else 1

        self.rnn=nn.GRU(d_model,hidden_size,num_layer,
                        batch_first=True,dropout=dropout,bidirectional=bid)
        self.classfier=nn.Sequential(
            # A bidirectional GRU concatenates both directions, doubling the feature size.
            nn.Linear(hidden_size*self.num_directions,output_size),
            # NOTE(review): ReLU before Softmax clips negative logits — confirm this is intended.
            nn.ReLU(),
            # Normalise over the class dimension (the argument is ``dim``, not the class count).
            nn.Softmax(dim=-1)
        )

    def forward(self,X):
        """Return class probabilities computed from the last time step.

        Args:
            X: Batch-first input tensor of shape (batch, seq_len, d_model).

        Returns:
            Tensor of shape (batch, output_size) whose rows sum to 1.
        """
        # The initial hidden state is always (layers * directions, batch, hidden),
        # even with batch_first=True, and must share the input's device and dtype.
        h0=torch.zeros(self.num_layer*self.num_directions,X.size(0),self.hidden_size,
                       device=X.device,dtype=X.dtype)
        X,h_n=self.rnn(X,h0)
        out=self.classfier(X[:,-1,:])
        return out


class LSTM(nn.Module):
    """LSTM encoder that returns the full output sequence and the final states.

    No classifier head is attached; ``output_size`` is accepted for interface
    compatibility but is currently unused.

    Args:
        d_model: Size of each input feature vector.
        hidden_size: Size of the LSTM hidden state (per direction).
        num_layer: Number of stacked LSTM layers.
        output_size: Unused (see above).
        dropout: Dropout applied between stacked LSTM layers.
        bid: If True, the LSTM is bidirectional.
    """

    def __init__(self,d_model,hidden_size,num_layer,output_size,dropout=0.1,bid=False) -> None:
        super().__init__()
        self.d_model=d_model
        self.hidden_size=hidden_size
        self.layer=num_layer
        self.bid=bid
        self.num_directions=2 if bid else 1

        self.rnn=nn.LSTM(d_model,hidden_size,num_layer,
                         batch_first=True,dropout=dropout,bidirectional=bid)

    def forward(self,X):
        """Run the LSTM from zero-initialised hidden and cell states.

        Args:
            X: Batch-first input tensor of shape (batch, seq_len, d_model).

        Returns:
            tuple: ``(output, hn, cn)`` — the per-step outputs of the last layer
            and the final hidden / cell states, as returned by ``nn.LSTM``.
        """
        # Build the initial states on the input's device and dtype so the model
        # also works on CUDA or in double precision.
        state_shape=(self.layer*self.num_directions,X.size(0),self.hidden_size)
        h0=torch.zeros(state_shape,device=X.device,dtype=X.dtype)
        c0=torch.zeros(state_shape,device=X.device,dtype=X.dtype)
        X,(hn,cn)=self.rnn(X,(h0,c0))
        return X,hn,cn