# 语言模型
- 学习语言模型，以及如何训练一个语言模型
- 学习torchtext的基本使用方法
  - 构建vovabulary
  - word to index 和 index to word
- Linear 、 RNN/LSTM/GRU
- RNN GRADIENT CLIPPING
- 如何读取和保存模型

In [1]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

random.seed(53113)
np.random.seed(53113)
if USE_CUDA:
    torch.cuda.manual_seed(53113)
    
BATCH_SIZE=32
EMBEDDING_SIZE = 100
MAX_VOCAB_SIZE = 50000

In [2]:
USE_CUDA

False

- TorchText的一个重要概念是Field，它决定了你的数据会如何被处理
- Field有个lower=True这个参数，所以所有单词都会被lowercase
- torchtext提供了LanguageModelingDataset这个class来帮助处理语言模型数据集
- build_vocab可以根据提供的训练数据集来穿件最高频单词的列表，max_size帮助我们限定单词总量
- BPTTIterator可以连续地得到连贯的句子，BPTT的全称是back propagation through time

In [3]:
TEXT = torchtext.data.Field(lower=True)
train,val,test = torchtext.datasets.LanguageModelingDataset.splits(path='.',train="text8.train.txt",
                                                  validation='text8.dev.txt',test='text8.test.txt',text_field = TEXT)



In [4]:
TEXT.build_vocab(train,max_size=MAX_VOCAB_SIZE)

In [5]:
len(TEXT.vocab)

50002

In [6]:
TEXT.vocab.itos[:10]

['<unk>', '<pad>', 'the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero']

- 为什么我们的单词表有 50002 个单词而不是 50000 呢？因为 Torchtext 给我们增加了两个特殊的  token, <unk》表示未知的单词，<pad》表示 padding
- 模型的输入是一串文字，模型的输出也是一串文字，他们之间相差一个位置，因为语言模型的目标是根据之前的单词预测下一个单词。

In [7]:
import gc
gc.collect()

66

In [8]:
device = torch.device("cuda" if USE_CUDA else "cpu")

train_iter, val_iter, test_iter = torchtext.data.BPTTIterator.splits((train,val,test),batch_size=BATCH_SIZE,device=device,bptt_len=50,repeat=False,shuffle=True)



In [9]:
it = iter(train_iter)
batch = next(it)
batch.text



tensor([[4815,   50,    6,  ..., 9116,   33,    7],
        [3143, 2748,  495,  ...,  893,  277,  317],
        [  13,    8,  850,  ...,  664,  824, 1602],
        ...,
        [   8,   34,  522,  ..., 5237,    3,   12],
        [3628, 1266,  968,  ...,    3,    2,    6],
        [   2,   54,   78,  ...,   12,  185, 3027]])

In [10]:
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,0].data.cpu()))
print()
print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,0].data.cpu()))

anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the

originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans <unk> of the french revolution whilst the term is still used in a pejorative way to describe any act that used violent means to destroy the organization


## 定义模型
- 集成nn.Module
- 初始化函数
- forward函数
- 其余可以根据模型需要定义相关的函数

In [11]:

import torch
import torch.nn as nn


class RNNModel(nn.Module):
    """ 一个简单的循环神经网络"""

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5):
        ''' 该模型包含以下几层:
            - 词嵌入层
            - 一个循环神经网络层(RNN, LSTM, GRU)
            - 一个线性层，从hidden state到输出单词表
            - 一个dropout层，用来做regularization
        '''
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        ''' Forward pass:
            - word embedding
            - 输入循环神经网络
            - 一个线性层从hidden state转化为输出单词表
        '''
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz, requires_grad=True):
        weight = next(self.parameters())
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad),
                    weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad))
        else:
            return weight.new_zeros((self.nlayers, bsz, self.nhid), requires_grad=requires_grad)

In [12]:
# 初始化模型
HIDDEN_SIZE =100
VOCAB_SIZE = len(TEXT.vocab)

model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, 2, dropout=0.5)

if USE_CUDA:
    model = model.to(device)
    

## 训练模型
- 模型一般需需要训练若干epoch
- 每个epoch我们都把所有的数据分成若干个batch
- 把每个batch的输入和输出都包装成cuda tensor
- forward pass,通过输入的句子预测每个单词的下一个单词
- 用模型的预测和正确的下一个单词计算cross entropy loss
- 清空模型当前gradient
- backward pass
- gradient clipping ,防止梯度爆炸
- 更新模型参数
- 每隔一定的iteration 输出模型在当前iteration的loss，以及在验证数据集上做模型评估

In [13]:
def repackage_hidden(h):
    if isinstance(h,torch.Tensor):
        return h.detach() # 从计算图中做截断
    else:
        return tuple(repackage_hidden(v) for v in h)

In [14]:
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr = learning_rate)

scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,0.5) # learning rate降一半

In [15]:
def evaluate(model,data):
    model.eval()
    total_loss = 0.
    total_count = 0.
    it = iter(data)
    
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE,requires_grad=False) # 把当前的hidden weigt拿出来
        for i,batch in enumerate(it):
            data, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(data,hidden) # back propgate through all iter

            loss = loss_fn(output.view(-1,VOCAB_SIZE),target.view(-1)) # batch_size * target_class_dim, batch_size
            total_count += np.multiply(*data.size())
            total_loss += loss.item()*np.multiply(*data.size())
        
    loss = total_loss/total_count
    model.train()
    return loss

In [None]:
NUM_EPOCHS = 2
GRAD_CLIP=5.

val_losses = []

for epoch in range(NUM_EPOCHS):
    model.train()
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE) # 把当前的hidden weigt拿出来
    for i,batch in enumerate(it):
        data, target = batch.text, batch.target
        hidden = repackage_hidden(hidden)
        output, hidden = model(data,hidden) # back propgate through all iter
        
        loss = loss_fn(output.view(-1,VOCAB_SIZE),target.view(-1)) # batch_size * target_class_dim, batch_size
        optimizer.zero_grad()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
        
        if i%100 == 0:
            print("epoch", epoch, "iter", i, "loss", loss.item())
            
        if i%1000==0:
            val_loss = evaluate(model,val_iter)
            if len(val_losses) == 0 or val_loss < min(val_losses):
                print("best model, val loss: ", val_loss)
                torch.save(model.state_dict(),'lm.pth')
                print("best model saved to lm.pth")
            else: # 这里是每次loss不降就降learning rate  有的是三次不降
                # learning rate decay
                scheduler.step()
            val_losses.append(val_loss)
        
        

epoch 0 iter 0 loss 10.818486213684082
best model saved to lm.pth
epoch 0 iter 100 loss 10.820296287536621
epoch 0 iter 200 loss 10.821335792541504
epoch 0 iter 300 loss 10.81839370727539


In [None]:
best_model = RNNModel("LSTM", VOCAB_SIZE, EMBEDDING_SIZE, EMBEDDING_SIZE, 2, dropout=0.5)
if USE_CUDA:
    best_model = best_model.cuda()
best_model.load_state_dict(torch.load("lm.pth"))