In [1]:
import torch
from torch import nn,optim
from torch.utils import data
import numpy as np
import random
import zipfile

In [2]:
# Open the zip archive and the lyrics file inside it in one `with` statement,
# then decode the raw bytes as UTF-8 text.
with zipfile.ZipFile("data/jaychou_lyrics.txt.zip") as zin, zin.open("jaychou_lyrics.txt") as f:
    corpus_chars = f.read().decode("utf-8")
# Show the first 40 characters of the decoded text.
corpus_chars[:40]

'想要有直升机\n想要和你飞到宇宙去\n想要和你融化在一起\n融化在宇宙里\n我每天每天每'

In [3]:
# Turn every line break (LF and CR) into a space and keep only the first
# 10,000 characters.
corpus_chars = corpus_chars.replace("\n", " ").replace("\r", " ")[:10000]

In [7]:
# Display the first 40 characters of the corpus.
corpus_chars[:40]

'想要有直升机 想要和你飞到宇宙去 想要和你融化在一起 融化在宇宙里 我每天每天每'

In [8]:
# Display the number of characters in the corpus.
len(corpus_chars)

10000

In [4]:
# Index -> character table holding each distinct corpus character once.
# The set is sorted because iterating a bare set of strings may produce a
# different order on every interpreter run (string hash randomisation), which
# would silently change every character id and make runs irreproducible.
id2char = sorted(set(corpus_chars))

In [5]:
# Character -> index lookup, the inverse of id2char.
char2id = {ch: idx for idx, ch in enumerate(id2char)}

In [16]:
# Display the type of the character -> index mapping.
type(char2id)

dict

In [6]:
# Vocabulary size: the number of entries in the index -> character table.
vocab_num=len(id2char)

In [19]:
# Display the vocabulary size.
vocab_num

1027

In [7]:
# Encode the whole corpus as a list of integer character ids.
corpus_index = list(map(char2id.__getitem__, corpus_chars))

In [23]:
# Round-trip check: decode the first 20 ids back into characters and print them.
sample = corpus_index[:20]
sample_chars = "".join(id2char[idx] for idx in sample)
print(sample_chars)

想要有直升机 想要和你飞到宇宙去 想要和


In [27]:
def data_iter_random(corpus_indices,batch_size,num_steps,device=None):
    """Yield (X, Y) minibatches of randomly ordered, non-overlapping windows.

    The sequence is cut into windows of ``num_steps`` items; the window order
    is shuffled with ``random.shuffle`` and windows are grouped
    ``batch_size`` at a time. ``Y`` holds the same windows shifted one
    position to the right. Leftover windows that do not fill a whole batch
    are dropped.

    Args:
        corpus_indices: indexable sequence of numeric ids.
        batch_size: number of windows per minibatch.
        num_steps: length of each window.
        device: torch device for the yielded tensors; defaults to CUDA when
            available, otherwise CPU.

    Yields:
        Two float32 tensors of shape (batch_size, num_steps).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # One item is held back because every label window is shifted by one.
    num_examples = (len(corpus_indices) - 1) // num_steps
    window_order = list(range(num_examples))
    random.shuffle(window_order)

    num_batches = num_examples // batch_size
    for first in range(0, num_batches * batch_size, batch_size):
        offsets = [w * num_steps for w in window_order[first:first + batch_size]]
        features = [corpus_indices[o:o + num_steps] for o in offsets]
        labels = [corpus_indices[o + 1:o + 1 + num_steps] for o in offsets]
        yield (torch.tensor(features, dtype=torch.float32, device=device),
               torch.tensor(labels, dtype=torch.float32, device=device))

In [28]:
# Try the random iterator on a toy sequence 0..29.
my_seq = list(range(30))
for x, y in data_iter_random(my_seq, 2, 6):
    print(x, y)

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [12., 13., 14., 15., 16., 17.]]) tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [13., 14., 15., 16., 17., 18.]])
tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [18., 19., 20., 21., 22., 23.]]) tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [19., 20., 21., 22., 23., 24.]])


In [8]:
def data_iter_consecutive(corpus_indices,batch_size,num_steps,device=None):
    """Yield (X, Y) minibatches whose rows continue from one batch to the next.

    The sequence is converted to a float32 tensor, trimmed to a multiple of
    ``batch_size`` and reshaped to ``batch_size`` rows. Successive column
    windows of width ``num_steps`` are yielded as ``X``; ``Y`` is the same
    window shifted one column to the right.

    Args:
        corpus_indices: sequence of numeric ids.
        batch_size: number of rows in every minibatch.
        num_steps: number of columns in every minibatch.
        device: torch device for the tensors; defaults to CUDA when
            available, otherwise CPU.

    Yields:
        Two tensors of shape (batch_size, num_steps), both views into the
        reshaped corpus tensor.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    flat = torch.tensor(corpus_indices, dtype=torch.float32, device=device)
    row_len = len(flat) // batch_size
    grid = flat[:batch_size * row_len].view(batch_size, row_len)

    # The last column of each row can only serve as a label, hence row_len - 1.
    num_windows = (row_len - 1) // num_steps
    for left in range(0, num_windows * num_steps, num_steps):
        right = left + num_steps
        yield grid[:, left:right], grid[:, left + 1:right + 1]

In [32]:
# Try the consecutive iterator on a toy sequence 0..29.
my_seq = list(range(30))
for x, y in data_iter_consecutive(my_seq, 2, 6):
    print(x, y)
    

tensor([[ 0.,  1.,  2.,  3.,  4.,  5.],
        [15., 16., 17., 18., 19., 20.]]) tensor([[ 1.,  2.,  3.,  4.,  5.,  6.],
        [16., 17., 18., 19., 20., 21.]])
tensor([[ 6.,  7.,  8.,  9., 10., 11.],
        [21., 22., 23., 24., 25., 26.]]) tensor([[ 7.,  8.,  9., 10., 11., 12.],
        [22., 23., 24., 25., 26., 27.]])


In [9]:
def one_hot(x,n_class,dtype=torch.float32):
    """One-hot encode a 1-D tensor of class ids.

    Args:
        x: tensor of class ids; it is cast to int64 before use.
        n_class: width of the encoding (number of classes).
        dtype: dtype of the returned tensor.

    Returns:
        Tensor of shape (x.shape[0], n_class) on x's device, zero everywhere
        except a 1 at column x[i] of row i.
    """
    columns = x.long().view(-1, 1)
    encoded = torch.zeros(x.shape[0], n_class, dtype=dtype, device=x.device)
    # scatter_ writes in place and returns the same tensor.
    return encoded.scatter_(1, columns, 1)

In [34]:
# Encode ids 0 and 2 against the full vocabulary and display the result.
x = torch.tensor([0, 2])
one_hot(x, n_class=vocab_num)

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 1.,  ..., 0., 0., 0.]])

In [10]:
def to_onehot(X,n_class):
    """One-hot encode a 2-D id tensor column by column.

    Args:
        X: 2-D tensor of class ids.
        n_class: width of the one-hot encoding.

    Returns:
        A list with one entry per column of X; entry i is
        ``one_hot(X[:, i], n_class)``.
    """
    per_step = []
    for step in range(X.shape[1]):
        per_step.append(one_hot(X[:, step], n_class))
    return per_step

In [36]:
# Input and output widths both equal the vocabulary size; the hidden layer
# has 256 units.
num_inputs = vocab_num
num_hiddens = 256
num_outputs = vocab_num

In [38]:
def get_params():
    """Create the trainable parameters of the from-scratch RNN.

    Reads the module-level ``num_inputs``, ``num_hiddens``, ``num_outputs``
    and ``device``. Weight matrices are drawn from a normal distribution with
    mean 0 and standard deviation 0.01; biases start at zero.

    Returns:
        ``[W_hh, W_xh, b_h, W_hy, b_y]`` as ``nn.Parameter`` objects.
    """
    def _gaussian(shape):
        values = np.random.normal(0, 0.01, shape)
        weight = torch.tensor(values, dtype=torch.float32, device=device)
        return nn.Parameter(weight, requires_grad=True)

    def _zero_bias(size):
        return nn.Parameter(torch.zeros(size, dtype=torch.float32, device=device))

    # Hidden layer. The weights are sampled in the order W_hh, W_xh, W_hy so
    # that a seeded NumPy RNG reproduces the same values.
    W_hh = _gaussian((num_hiddens, num_hiddens))
    W_xh = _gaussian((num_inputs, num_hiddens))
    b_h = _zero_bias(num_hiddens)

    # Output layer.
    W_hy = _gaussian((num_hiddens, num_outputs))
    b_y = _zero_bias(num_outputs)

    return [W_hh, W_xh, b_h, W_hy, b_y]

In [39]:
def init_rnn_state(batch_size,num_hiddens,device):
    """Return the initial hidden state as a one-element tuple.

    The single element is an all-zero tensor of shape
    (batch_size, num_hiddens) on ``device``.
    """
    zero_hidden = torch.zeros((batch_size, num_hiddens), device=device)
    return (zero_hidden,)

In [40]:
from torch.nn import functional as F

In [49]:
def rnn(inputs,state,params):
    """Run a tanh RNN over a sequence of per-step inputs.

    Args:
        inputs: iterable of per-step input tensors.
        state: one-element tuple holding the hidden state.
        params: ``[W_hh, W_xh, b_h, W_hy, b_y]``.

    Returns:
        ``(outputs, (H,))`` where ``outputs`` is a list with one output
        tensor per step and ``H`` is the hidden state after the last step.
    """
    W_hh,W_xh,b_h,W_hy,b_y=params
    (hidden,) = state
    step_outputs = []
    for step_input in inputs:
        pre_activation = torch.matmul(step_input, W_xh) + torch.matmul(hidden, W_hh) + b_h
        hidden = torch.tanh(pre_activation)
        step_outputs.append(torch.matmul(hidden, W_hy) + b_y)
    return step_outputs, (hidden,)

In [43]:
# A 2 x 5 batch of ids: expect 5 per-step tensors, each with 2 rows.
X = torch.arange(10).view(2, 5)
inputs = to_onehot(X, vocab_num)
print(len(inputs), inputs[0].shape)

5 torch.Size([2, 1027])


In [50]:
# Smoke-test one forward pass of the from-scratch RNN and print the number of
# outputs plus the output / hidden-state shapes.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state = init_rnn_state(X.shape[0], num_hiddens, device)
inputs = to_onehot(X.to(device), vocab_num)
params = get_params()
outputs, state_new = rnn(inputs, state, params)
print(len(outputs), outputs[0].shape, state_new[0].shape)

5 torch.Size([2, 1027]) torch.Size([2, 256])


In [57]:
def predict_rnn(prefix,num_chars,rnn,params,state,num_hiddens,
                vocab_size,device,id2char,char2id):
    """Generate ``num_chars`` characters after ``prefix`` with the scratch RNN.

    Args:
        prefix: non-empty seed string; every character must be in ``char2id``.
        num_chars: number of characters to generate after the prefix.
        rnn: callable ``rnn(inputs, state, params) -> (outputs, state)``.
        params: parameter list forwarded to ``rnn``.
        state: not used; it is overwritten by a fresh batch-size-1 state from
            ``init_rnn_state`` on the first line of the body.
        num_hiddens: hidden size used to build the initial state.
        vocab_size: width of the one-hot inputs.
        device: torch device for the input tensors and the state.
        id2char: index -> character table.
        char2id: character -> index mapping.

    Returns:
        The prefix followed by the generated characters, as one string.
    """
    state = init_rnn_state(1, num_hiddens, device)
    generated = [char2id[prefix[0]]]
    warmup_steps = len(prefix) - 1
    for step in range(num_chars + warmup_steps):
        # Feed the most recent id as a 1 x 1 batch.
        last_id = torch.tensor([[generated[-1]]], device=device)
        Y, state = rnn(to_onehot(last_id, vocab_size), state, params)
        if step < warmup_steps:
            # Still consuming the prefix: take its next character.
            next_id = char2id[prefix[step + 1]]
        else:
            # Greedy decoding: take the highest-scoring character.
            next_id = int(Y[0].argmax(dim=1).item())
        generated.append(next_id)
    return "".join(id2char[i] for i in generated)

In [58]:
# Sample 10 characters from the untrained model. The fifth argument lands in
# predict_rnn's `state` slot, which predict_rnn replaces with a fresh state.
predict_rnn(
    "分开", 10, rnn, params, init_rnn_state,
    num_hiddens, vocab_num, device, id2char, char2id,
)

'分开蔓墟鲜直找誓被糗给呜'

In [59]:
def grad_clipping(params,theta,device):
    """Rescale gradients in place so their global L2 norm is at most ``theta``.

    Args:
        params: iterable of tensors whose ``.grad`` is already populated.
        theta: maximum allowed L2 norm over all gradients together.
        device: device on which the squared-norm accumulator is created.
    """
    squared_total = torch.tensor([0.0], device=device)
    for p in params:
        squared_total += (p.grad.data ** 2).sum()
    total_norm = squared_total.sqrt().item()
    if total_norm > theta:
        shrink = theta / total_norm
        for p in params:
            p.grad.data *= shrink

In [65]:
import time

In [67]:
def train_and_predict_rnn(rnn,get_params,init_rnn_state,num_hiddens,vocab_size,device,
                          corpus_indices,id2char,char2id,is_random_iter,num_epochs,num_steps
                          ,lr,clipping_theta,batch_size,pred_period,pre_len,prefixes):
    """Train the from-scratch RNN with SGD and periodically print samples.

    Args:
        rnn: callable ``rnn(inputs, state, params) -> (outputs, state)``.
        get_params: zero-argument callable returning the parameter list.
        init_rnn_state: callable ``(batch_size, num_hiddens, device) -> state``.
        num_hiddens: hidden size.
        vocab_size: width of the one-hot inputs.
        device: torch device for data and state.
        corpus_indices: the corpus encoded as a sequence of ids.
        id2char: index -> character table.
        char2id: character -> index mapping.
        is_random_iter: use ``data_iter_random`` when true, otherwise
            ``data_iter_consecutive``.
        num_epochs: number of passes over the corpus.
        num_steps: time steps per minibatch.
        lr: SGD learning rate.
        clipping_theta: gradient-norm threshold passed to ``grad_clipping``.
        batch_size: sequences per minibatch.
        pred_period: print loss and samples every this many epochs.
        pre_len: number of characters to generate for each prefix.
        prefixes: seed strings used for the periodic samples.
    """
    data_iter_fn = data_iter_random if is_random_iter else data_iter_consecutive
    params=get_params()
    loss=nn.CrossEntropyLoss()
    optimizer=optim.SGD(params,lr)

    for epoch in range(num_epochs):
        if not is_random_iter:
            # Consecutive sampling: rows continue across batches, so the state
            # is created once per epoch and carried between batches.
            state=init_rnn_state(batch_size,num_hiddens,device)
        l_sum,n,start=0.0,0,time.time()
        data_iter=data_iter_fn(corpus_indices,batch_size,num_steps,device)
        for X,Y in data_iter:
            if is_random_iter:
                # Random sampling: batches are unrelated, start from a fresh state.
                state=init_rnn_state(batch_size,num_hiddens,device)
            else:
                # Cut the graph so backprop stops at the batch boundary.
                for s in state:
                    s.detach_()
            inputs=to_onehot(X,vocab_size)
            outputs,state=rnn(inputs,state,params)
            # Concatenate the per-step outputs along dim 0 (step-major order).
            outputs=torch.cat(outputs,dim=0)
            # Transpose the labels before flattening so they follow the same
            # step-major order as `outputs`.
            y=torch.transpose(Y,0,1).contiguous().view(-1)
            l=loss(outputs,y.long())

            # Zero gradients from the previous step (absent on the first step).
            if params[0].grad is not None:
                for param in params:
                    param.grad.data.zero_()
            l.backward()
            grad_clipping(params,clipping_theta,device)
            optimizer.step()
            l_sum+=l.item()*y.shape[0]
            n+=y.shape[0]
        if (epoch+1)%pred_period==0:
            print("epoch %d,loss %f,time %.2f sec" %(epoch+1,l_sum/n,time.time()-start))
            for prefix in prefixes:
                # Use the `pre_len` argument; the original read a module-level
                # `pred_len` here and ignored the parameter.
                print("-",predict_rnn(prefix,pre_len,rnn,params,init_rnn_state,
                                     num_hiddens,vocab_size,device,id2char,char2id))

In [62]:
# Training hyperparameters for the from-scratch RNN.
num_epochs = 250
num_steps = 35
batch_size = 32
lr = 1e2
clipping_theta = 1e-2

# Sampling settings: report every `pred_period` epochs and generate
# `pred_len` characters for each prefix.
pred_period = 50
pred_len = 50
prefixes = ['分开', '不分开']

In [68]:
# Train the from-scratch model with random sampling (is_random_iter=True).
train_and_predict_rnn(
    rnn, get_params, init_rnn_state, num_hiddens, vocab_num, device,
    corpus_index, id2char, char2id, True,
    num_epochs, num_steps, lr, clipping_theta, batch_size,
    pred_period, pred_len, prefixes,
)

epoch 50,loss 4.210980,time 2.58 sec
- 分开 我不要再想 我不能 想你 我不要我想多 我不要你的 我想能 你爱 我不要我想 我不要再想 我不能 
- 不分开 我想想你的  不知 你不么 爱什么 我想要你的  不知 你不么 我爱 我不要再不 我不要再想 我不
epoch 100,loss 2.333604,time 2.52 sec
- 分开 一颗两 干步的话旧语言 娘你的让我疯狂的可爱女人 坏坏的让我疯狂的可爱女人 坏坏的让我疯狂的可爱女
- 不分开吗 我爱你的爱你在西元前 深埋的美索著多 牧草有没有 我马儿有些瘦 我不要的可活 我知道好 说知我 
epoch 150,loss 1.037343,time 2.75 sec
- 分开 一只用它心步我 我想想这样牵着  这到你说你堡  说去 有 我想好有些瘦 我想要这生远 后知 这去
- 不分开吗 我不能再想 我不 我不 我不要再想你 不知不觉 你已经离开我 不知不觉 我跟了这节奏 后知后觉 
epoch 200,loss 0.440541,time 2.50 sec
- 分开 还金底经心 谁人它 岩烧店的烟味弥漫 隔壁是国术馆 店里面的妈妈桑 茶领刀人跟棍棒 我想耍的有模有
- 不分开扫 我后你爸 你打我妈 这样对吗干嘛这样 别必让酒牵鼻子落 瞎 让笑常色的 你想天有多够 如果我遇见
epoch 250,loss 0.272470,time 2.49 sec
- 分开 一只用 一步两步三步四步望著天 看星星 一颗两颗三颗四颗 连成线背著背默默许下心愿 看远方的星是否
- 不分开吗把的胖女巫 用拉丁文念咒语啦啦呜 她养在黑索不达米亚平 伤地一只饿昏的老言鸠 印地安老斑鸠 腿短毛


In [21]:
class RNN(nn.Module):
    """Character-level language model: ``nn.RNN`` followed by a linear layer."""

    def __init__(self,vocab_size,hidden_size):
        """Build the recurrent layer and the hidden -> vocabulary projection."""
        super().__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.rnn = nn.RNN(vocab_size, hidden_size)
        self.fn = nn.Linear(hidden_size, vocab_size)

    def forward(self,inputs,state):
        """Score the next character for every position of ``inputs``.

        Args:
            inputs: id tensor laid out as (batch_size, seq_len).
            state: hidden state handed to ``nn.RNN`` (may be ``None``).

        Returns:
            ``(output, state)``; ``output`` has one row per
            (time step, batch item) pair and ``vocab_size`` columns.
        """
        # Stack the per-step one-hot tensors: (seq_len, batch_size, vocab_size).
        one_hot_steps = to_onehot(inputs, self.vocab_size)
        X = torch.stack(one_hot_steps)
        Y, state = self.rnn(X, state)
        # Merge the step and batch dimensions before the projection.
        flat_hidden = Y.view(-1, self.hidden_size)
        return self.fn(flat_hidden), state

In [77]:
def predict_rnn(prefix,num_chars,model,vocab_size,device,id2char,char2id):
    """Generate ``num_chars`` characters after ``prefix`` with an nn.Module model.

    Note: this definition reuses the name of the earlier from-scratch
    ``predict_rnn`` with a different signature, so it replaces that function
    once this cell has run.

    Args:
        prefix: non-empty seed string; every character must be in ``char2id``.
        num_chars: number of characters to generate after the prefix.
        model: callable ``model(X, state) -> (Y, state)``, where ``X`` is a
            1 x 1 id tensor.
        vocab_size: not used by this function.
        device: torch device for the input tensor and the state.
        id2char: index -> character table.
        char2id: character -> index mapping.

    Returns:
        The prefix followed by the generated characters, as one string.
    """
    state = None
    output = [char2id[prefix[0]]]  # ids of the prefix plus everything generated
    for t in range(num_chars + len(prefix) - 1):
        X = torch.tensor([output[-1]], device=device).view(1, 1)
        if state is not None:
            if isinstance(state, tuple):
                # Two-part state, e.g. an LSTM's (h, c) pair.
                state = (state[0].to(device), state[1].to(device))
            else:
                state = state.to(device)
        Y, state = model(X, state)
        if t < len(prefix) - 1:
            # Still consuming the prefix. Store the character's id: the
            # original appended the raw character, which broke the next
            # torch.tensor(...) call.
            output.append(char2id[prefix[t + 1]])
        else:
            # Greedy decoding: take the highest-scoring character.
            output.append(int(Y.argmax(dim=1).item()))
    return "".join([id2char[i] for i in output])

In [73]:
def predict_rnn_pytorch(prefix, num_chars, model, vocab_size, device, idx_to_char,
                      char_to_idx):
    """Generate ``num_chars`` characters after ``prefix`` with an nn.Module model.

    Args:
        prefix: non-empty seed string; every character must be in ``char_to_idx``.
        num_chars: number of characters to generate after the prefix.
        model: callable ``model(X, state) -> (Y, state)``, where ``X`` is a
            1 x 1 id tensor.
        vocab_size: not used by this function.
        device: torch device for the input tensor and the state.
        idx_to_char: index -> character table.
        char_to_idx: character -> index mapping.

    Returns:
        The prefix followed by the generated characters, as one string.
    """
    def _on_device(current):
        # No state exists before the first step.
        if current is None:
            return None
        # Two-part state, e.g. an LSTM's (h, c) pair.
        if isinstance(current, tuple):
            return (current[0].to(device), current[1].to(device))
        return current.to(device)

    ids = [char_to_idx[prefix[0]]]  # ids of the prefix plus everything generated
    state = None
    warmup_steps = len(prefix) - 1
    for step in range(num_chars + warmup_steps):
        X = torch.tensor([ids[-1]], device=device).view(1, 1)
        Y, state = model(X, _on_device(state))
        if step < warmup_steps:
            # Still consuming the prefix: take its next character.
            next_id = char_to_idx[prefix[step + 1]]
        else:
            # Greedy decoding: take the highest-scoring character.
            next_id = int(Y.argmax(dim=1).item())
        ids.append(next_id)
    return ''.join(idx_to_char[i] for i in ids)

In [79]:
# Build the nn.Module model on the selected device and sample 10 characters
# from it before any training.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RNN(vocab_num, 256).to(device)
predict_rnn_pytorch('分开', 10, model, vocab_num, device, id2char, char2id)

'分开假间间假假间间假假间'

In [85]:
def train_predict_rnn(model,loss_fn,optimizer,epoch_size,corpus_index,batch_size,device,id2char,char2id,vocab_num,num_steps=10):
    """Train ``model`` with consecutive sampling and print a sample each epoch.

    Args:
        model: callable ``model(X, state) -> (Y_hat, state)``.
        loss_fn: loss applied to ``(Y_hat, flattened labels)``.
        optimizer: optimizer already bound to the model's parameters.
        epoch_size: number of passes over the corpus.
        corpus_index: the corpus encoded as a sequence of ids.
        batch_size: sequences per minibatch.
        device: torch device for the data.
        id2char: index -> character table.
        char2id: character -> index mapping.
        vocab_num: vocabulary size forwarded to the sampling helper.
        num_steps: time steps per minibatch. Defaults to 10, the value that
            used to be hard-coded in the body.
    """
    for epoch in range(epoch_size):
        # Start every epoch without a state: the iterator restarts at the
        # beginning of the corpus, so the state left from the end of the
        # previous epoch does not match the text the new epoch begins with.
        state=None
        loss_sum,n=0.0,0
        for X,Y in data_iter_consecutive(corpus_index,batch_size=batch_size,num_steps=num_steps,device=device):
            if state is not None:
                # Cut the graph so backprop stops at the batch boundary.
                if isinstance(state,tuple):
                    state=(state[0].detach(),state[1].detach())
                else:
                    state=state.detach()
            Y_hat,state=model(X,state)
            # Transpose the labels before flattening, the layout Y_hat is
            # compared against.
            Y=torch.transpose(Y,0,1).contiguous().view(-1)
            loss=loss_fn(Y_hat,Y.long())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum+=loss.item()*Y.shape[0]
            n+=Y.shape[0]
        print("epoch %d,loss:%f"%(epoch+1,loss_sum/n))
        print("--",predict_rnn_pytorch('分开', 10, model, vocab_num, device, id2char, char2id))

In [81]:
# Cross-entropy loss and an Adam optimizer over the model's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [88]:
# Train for 5 epochs with 64 sequences per minibatch.
train_predict_rnn(
    model, loss_fn, optimizer, 5,
    corpus_index, 64, device,
    id2char, char2id, vocab_num,
)

epoch 1,loss:3.947504
-- 分开 我不要的可爱女人 
epoch 2,loss:3.852210
-- 分开 我不了的可  我知
epoch 3,loss:3.689370
-- 分开 我不要的可爱女人 
epoch 4,loss:3.537514
-- 分开 我想你的可爱女人 
epoch 5,loss:3.396915
-- 分开 我不了的可 女人 
