## 2. 使用LSTM进行语言建模

<img src='img/rnn-lm.jpg' width=500>

reference:
- https://pytorch.org/tutorials/beginner/transformer_tutorial.html
- https://pytorch.org/tutorials/advanced/dynamic_quantization_tutorial.html?highlight=language%20modeling
- https://github.com/pytorch/examples/tree/master/word_language_model


### 设置超参数

一般，这些超参数是通过 argparse.ArgumentParser 从命令行中获取的。

In [1]:
import argparse

# All hyperparameters for the LSTM language model live in one namespace.
# In a standalone script these would normally be parsed from the command line.
hparams = argparse.Namespace(
    # --- optimisation ---
    batch_size=16,
    learning_rate=5,
    max_grad_norm=1.,
    bptt=32,  # sequence_length
    # --- model ---
    dropout=0.2,
    embedding_dim=200,
    hidden_dim=200,
    n_layers=4,
    tie_weights=True,
    # --- run control ---
    seed=42,
    num_train_epochs=20,
    # --- paths ---
    lm_data_dir='data/PennTreebank',
    model_save_path='data/save_model/lstm_lm.path',
    # --- generation ---
    temperature=1.,
)

# Last expression of the cell: display the namespace.
hparams

Namespace(batch_size=16, learning_rate=5, max_grad_norm=1.0, bptt=32, dropout=0.2, embedding_dim=200, hidden_dim=200, n_layers=4, tie_weights=True, seed=42, num_train_epochs=20, lm_data_dir='data/PennTreebank', model_save_path='data/save_model/lstm_lm.path', temperature=1.0)

### 加载数据

- 加载数据
- tokenize
- 建立词表，并将词映射为id
- 将数据打包到batch中

In [2]:
from PennTreebankCorpus import Corpus


# Point the project's Corpus helper at the configured Penn Treebank directory.
corpus = Corpus(root_dir=hparams.lm_data_dir)
corpus.load_datasets()
# preprocess() hands back the train / validation / test splits. The batch size
# is passed in here; presumably this is where the data is batchified (the cell
# log reports tokenization, vocab building, id conversion and batchify steps)
# -- confirm against PennTreebankCorpus.
train_data, val_data, test_data = corpus.preprocess(hparams.batch_size)

# Record the vocabulary size on hparams; LSTMModel reads hparams.vocab_size to
# size its embedding and output layers.
hparams.vocab_size = corpus.vocab_size()

# Last expression of the cell: display the vocabulary size.
hparams.vocab_size

1. Tokenization
---------------------
train: tokenizing ...
val: tokenizing ...
test: tokenizing ...
Done!


2. Build Vocab
Done!


3. To ids
Done!


4. batchify
Done!




9924

In [3]:
# Display the shape of the batchified training tensor.
train_data.size()
# NOTE(review): the original trailing comment said "load word vectors", but
# nothing in this cell loads embeddings -- it looks like a leftover.

torch.Size([60659, 16])

### 建立模型

In [4]:
import torch
from torch import nn

class LSTMModel(nn.Module):
    """LSTM language model: embedding -> stacked LSTM -> linear decoder.

    The model reads these fields from ``hparams``: ``vocab_size``,
    ``embedding_dim``, ``hidden_dim``, ``n_layers``, ``dropout`` and
    ``tie_weights``. When ``tie_weights`` is true the decoder shares its
    weight matrix with the embedding table.
    """

    def __init__(self, hparams):
        """Build the layers described by ``hparams``.

        Raises:
            ValueError: if ``tie_weights`` is set but ``embedding_dim`` and
                ``hidden_dim`` differ (the shared matrix could not serve both
                layers).
        """
        super().__init__()
        self.hparams = hparams

        self.drop = nn.Dropout(hparams.dropout)
        self.embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)
        self.rnn_layer = nn.LSTM(
            hparams.embedding_dim,
            hparams.hidden_dim,
            hparams.n_layers,
            dropout=hparams.dropout)

        self.decoder = nn.Linear(hparams.hidden_dim, hparams.vocab_size)
        if hparams.tie_weights:
            # Explicit check instead of `assert`: asserts are stripped under
            # `python -O`, which would let a mismatched configuration through
            # and fail later with an opaque shape error inside forward().
            if hparams.embedding_dim != hparams.hidden_dim:
                raise ValueError(
                    'tie_weights requires embedding_dim == hidden_dim, got '
                    f'embedding_dim={hparams.embedding_dim} and '
                    f'hidden_dim={hparams.hidden_dim}')
            self.decoder.weight = self.embedding.weight

        # Parameter initialisation
        self.init_weights()

    def init_weights(self):
        """Initialise embedding/decoder weights uniformly and zero the bias.

        With tied weights the embedding and decoder share one tensor, so the
        second ``uniform_`` call simply re-draws that shared matrix.
        """
        init_range = 0.1
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-init_range, init_range)

    def forward(self, input, hidden):
        """Predict next-token logits for every position of ``input``.

        Args:
            input: token ids laid out as (L, B) -- sequence first, since the
                LSTM is built with the default ``batch_first=False``.
            hidden: ``(h_0, c_0)`` tuple as produced by :meth:`init_hidden`
                or returned by a previous call.

        Returns:
            ``(decoded, hidden)`` where ``decoded`` is (L, B, vocab_size)
            logits and ``hidden`` is the LSTM's final ``(h_n, c_n)``.
        """
        # (L, B) => (L, B, embedding_dim)
        emb = self.embedding(input)
        emb = self.drop(emb)

        # emb (L, B, embedding_dim) => output (L, B, hidden_dim)
        output, hidden = self.rnn_layer(emb, hidden)
        output = self.drop(output)

        # (L, B, hidden_dim) => (L, B, vocab_size)
        # One next-word prediction per position.
        decoded = self.decoder(output)
        return decoded, hidden

    def init_hidden(self, batch_size):
        """Return zero-filled initial hidden and cell states.

        Both tensors have shape (n_layers, batch_size, hidden_dim) and are
        created with ``new_zeros`` from the model's first parameter, so they
        inherit its dtype and device.
        """
        weight = next(self.parameters())
        return (weight.new_zeros(self.hparams.n_layers, batch_size, self.hparams.hidden_dim),
                weight.new_zeros(self.hparams.n_layers, batch_size, self.hparams.hidden_dim))


In [5]:
# Instantiate the language model from the hyperparameters.
model = LSTMModel(hparams)
# Move the parameters to the GPU. As the cell's last expression, the call
# also makes the notebook display the module summary.
model.cuda()


LSTMModel(
  (drop): Dropout(p=0.2, inplace=False)
  (embedding): Embedding(9924, 200)
  (rnn_layer): LSTM(200, 200, num_layers=4, dropout=0.2)
  (decoder): Linear(in_features=200, out_features=9924, bias=True)
)

In [6]:
# Cross-entropy between the predicted vocabulary logits and the target ids.
loss_func = nn.CrossEntropyLoss()
# SGD with momentum, starting from the configured learning rate; the training
# loop lowers 'lr' on the param groups when validation loss does not improve.
optimizer = torch.optim.SGD(model.parameters(), lr=hparams.learning_rate, momentum=0.9)

### 训练

In [7]:
def get_batch(source, i, hparams):
    """Slice one training window and its next-token targets out of ``source``.

    The window starts at row ``i`` and spans at most ``hparams.bptt`` rows; it
    is shortened near the end so that the targets, which are the same rows
    shifted forward by one, still fit inside ``source``.

    Returns:
        ``(data, target)`` -- two slices of ``source`` of equal length.
    """
    rows_left = len(source) - 1 - i
    window = hparams.bptt if hparams.bptt < rows_left else rows_left
    stop = i + window
    return source[i:stop], source[i + 1:stop + 1]


def repackage_hidden(h):
    """Detach hidden state(s) from the graph that produced them.

    A tensor is returned detached; any other iterable is walked recursively
    and rebuilt as a tuple of detached members.
    """
    if not isinstance(h, torch.Tensor):
        return tuple(map(repackage_hidden, h))
    return h.detach()

In [8]:
from tqdm import tqdm
import math

def train(model, train_data, loss_func, optimizer, epoch_idx, hparams):
    """Run one epoch of truncated-BPTT training over ``train_data``.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and called as
            ``model(input, hidden) -> (logits, hidden)``.
        train_data: batchified token ids; rows are consumed in windows of
            ``hparams.bptt`` (presumably laid out as (L, B) -- confirm
            against the corpus preprocessing).
        loss_func: criterion applied to flattened logits and targets.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch_idx: epoch number shown in the progress bar.
        hparams: reads ``batch_size``, ``bptt``, ``vocab_size`` and
            ``max_grad_norm``.

    Returns:
        None. The model and optimizer are updated in place.
    """
    model.train()
    # Send batches to whatever device the model lives on instead of assuming
    # CUDA, so the same function works for CPU and GPU models alike.
    device = next(model.parameters()).device
    hidden = model.init_hidden(hparams.batch_size)

    pbar = tqdm(range(0, train_data.size(0)-1, hparams.bptt))
    pbar.set_description(f'Epoch {epoch_idx}')

    for i in pbar:
        # left_context: (L, B)
        # targets: (L, B)
        left_context, targets = get_batch(train_data, i, hparams)

        left_context = left_context.to(device)
        targets = targets.to(device)

        output, hidden = model(left_context, hidden)

        optimizer.zero_grad()
        loss = loss_func(output.view(-1, hparams.vocab_size), targets.view(-1))
        loss.backward()

        # Gradient clipping, to guard against exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), hparams.max_grad_norm)
        optimizer.step()

        # Perplexity: a standard language-model metric, exp(loss).
        # Read the scalar once; each .item() forces a device sync.
        loss_value = loss.item()
        try:
            ppl = math.exp(loss_value)
        except OverflowError:
            # A diverged loss should not abort the epoch just for the display.
            ppl = float('inf')
        pbar.set_postfix(loss=loss_value, ppl=ppl)

        # Cut the graph so the next window does not backpropagate into this one.
        hidden = repackage_hidden(hidden)

In [9]:
def evaluate(model, test_val_data, loss_func, hparams):
    """Compute the average loss and perplexity of ``model`` on held-out data.

    Args:
        model: network exposing ``init_hidden(batch_size)`` and called as
            ``model(input, hidden) -> (logits, hidden)``.
        test_val_data: batchified token ids; rows are consumed in windows of
            ``hparams.bptt``.
        loss_func: criterion applied to flattened logits and targets. It is
            assumed to return the mean over tokens (the default reduction of
            ``nn.CrossEntropyLoss``); the averaging below relies on that.
        hparams: reads ``batch_size``, ``bptt`` and ``vocab_size``.

    Returns:
        ``(avg_loss, ppl)`` -- the per-token average loss and its exponential.

    Raises:
        ValueError: if ``test_val_data`` has too few rows to form a single
            (input, target) pair.
    """
    model.eval()
    # Follow the model's device instead of assuming CUDA, so a model reloaded
    # on the CPU can be evaluated too.
    device = next(model.parameters()).device
    hidden = model.init_hidden(hparams.batch_size)
    total_loss = 0.
    total_tokens = 0

    with torch.no_grad():
        pbar = tqdm(range(0, test_val_data.size(0)-1, hparams.bptt))
        pbar.set_description('Valid')
        for i in pbar:
            left_context, targets = get_batch(test_val_data, i, hparams)

            left_context = left_context.to(device)
            targets = targets.to(device)

            output, hidden = model(left_context, hidden)
            hidden = repackage_hidden(hidden)
            loss = loss_func(output.view(-1, hparams.vocab_size), targets.view(-1))

            # Weight each window by its token count: the last window is
            # usually shorter, and a plain mean of per-window means would
            # over-weight it.
            n_tokens = targets.numel()
            total_loss += loss.item() * n_tokens
            total_tokens += n_tokens

    if total_tokens == 0:
        raise ValueError('evaluate() needs at least two rows of data to form '
                         'an input/target pair')
    avg_loss = total_loss / total_tokens
    ppl = math.exp(avg_loss)
    return avg_loss, ppl


In [10]:
# Training driver: train for the configured number of epochs, checkpoint the
# model whenever validation loss improves, and anneal the learning rate when
# it does not.
best_val_loss = None
learning_rate = hparams.learning_rate

for epoch_idx in range(hparams.num_train_epochs):
    train(model, train_data, loss_func, optimizer, epoch_idx+1, hparams)
    val_loss, ppl = evaluate(model, val_data, loss_func, hparams)
    # Printed before any annealing below, so LR is the rate this epoch used.
    print(f'\r[Validation] loss: {val_loss:.4f}, PPL: {ppl:.0f}, LR: {learning_rate}                       ')

    # `is None` rather than truthiness: a best loss of exactly 0.0 must not be
    # mistaken for "no best loss recorded yet".
    if best_val_loss is None or val_loss < best_val_loss:
        torch.save(model.state_dict(), hparams.model_save_path)
        print(F'save model to {hparams.model_save_path}\n')
        best_val_loss = val_loss
    else:
        # Decay once per epoch, then apply the same rate to every param group.
        # Dividing inside the loop would decay once per group instead.
        learning_rate /= 4
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate


Epoch 1: 100%|██████████| 1896/1896 [01:24<00:00, 22.36it/s, loss=5.27, ppl=195]
Valid: 100%|██████████| 151/151 [00:01<00:00, 86.48it/s]
[Validation] loss: 5.0366, PPL: 154, LR: 5                       
save model to data/save_model/lstm_lm.path

Epoch 2: 100%|██████████| 1896/1896 [01:19<00:00, 23.82it/s, loss=4.97, ppl=144]
Valid: 100%|██████████| 151/151 [00:01<00:00, 84.41it/s]
[Validation] loss: 4.8388, PPL: 126, LR: 5                       
save model to data/save_model/lstm_lm.path

Epoch 3: 100%|██████████| 1896/1896 [01:23<00:00, 22.62it/s, loss=4.92, ppl=137]
Valid: 100%|██████████| 151/151 [00:01<00:00, 77.41it/s]
[Validation] loss: 4.7688, PPL: 118, LR: 5                       
save model to data/save_model/lstm_lm.path

Epoch 4: 100%|██████████| 1896/1896 [01:32<00:00, 20.54it/s, loss=4.84, ppl=126]
Valid: 100%|██████████| 151/151 [00:01<00:00, 76.74it/s]
[Validation] loss: 4.7285, PPL: 113, LR: 5                       
save model to data/save_model/lstm_lm.path

Epoch 5:

### 加载模型

In [13]:
# Rebuild the network (on the CPU) and restore the checkpoint written by the
# training loop.
model = LSTMModel(hparams)
# map_location='cpu': the checkpoint was saved from a CUDA model; mapping the
# tensors to the CPU lets it load on machines without a GPU and matches the
# device of the freshly built model.
model.load_state_dict(torch.load(hparams.model_save_path, map_location='cpu'))
# Switch to inference mode (disables dropout). As the cell's last expression
# it also displays the module summary.
model.eval()

LSTMModel(
  (drop): Dropout(p=0.2, inplace=False)
  (embedding): Embedding(9924, 200)
  (rnn_layer): LSTM(200, 200, num_layers=4, dropout=0.2)
  (decoder): Linear(in_features=200, out_features=9924, bias=True)
)

In [21]:
# Interactive next-word suggestion: prime the model with a prompt, show the
# top candidates for the next word, read the user's choice from stdin, and
# feed it back in. Typing '.' stops the session.
torch.manual_seed(hparams.seed)
torch.cuda.manual_seed(hparams.seed)


input_sentence = ['<sos>', 'we', 'have', 'no', 'useful', 'information']

# Prompt ids as a column: (prompt_length, 1), i.e. a batch of one sequence.
prompt_ids = [corpus.vocab[word] for word in input_sentence]
input_ = torch.tensor(prompt_ids).view(-1, 1)

hidden = model.init_hidden(1)

with torch.no_grad():
    for i in range(20):
        # output: (L, 1, vocab_size) -- L is the prompt length on the first
        # step and 1 afterwards, when only the chosen word is fed in.
        output, hidden = model(input_, hidden)
        # Keep the prediction at the last position: (1, vocab_size)
        output = output[-1]
        # (vocab_size,) unnormalised weights after temperature scaling
        word_weights = torch.exp(output.squeeze() / hparams.temperature)
        # Ids of the 10 highest-weighted words, minus the special tokens that
        # should never be offered as candidates.
        hidden_from_user = (corpus.vocab['<eos>'], corpus.vocab['<unk>'])
        topk_word = [
            word_idx
            for word_idx in word_weights.topk(k=10).indices.tolist()
            if word_idx not in hidden_from_user
        ]

        candidates = ''.join(f'{corpus.vocab.itos[word_idx]} ' for word_idx in topk_word)
        print('[next_word] candidates: ' + candidates)

        input_word = input()
        if input_word == '.':
            break
        word_idx = corpus.vocab[input_word]
        input_sentence.append(input_word)
        print(' '.join(input_sentence))
        # To decode greedily instead of asking the user, take topk_word[0].
        input_ = torch.tensor([word_idx]).view(1, 1)

[next_word] candidates: in and to of on for but that with 
<sos> we have no useful information on
[next_word] candidates: the a their this n its an it our 
<sos> we have no useful information on this
[next_word] candidates: year of time country and week market month 
<sos> we have no useful information on this 
[next_word] candidates: and of in the or to a but 
<sos> we have no useful information on this  but
[next_word] candidates: it the they we i he that a in 
<sos> we have no useful information on this  but we
[next_word] candidates: ' can do have are need would were ca want 
<sos> we have no useful information on this  but we do
[next_word] candidates: n not in have and to they it 
<sos> we have no useful information on this  but we do have
[next_word] candidates: a to the n been in an no it 
<sos> we have no useful information on this  but we do have to
[next_word] candidates: be do see get have make go sell take 
<sos> we have no useful information on this  but we do have to see

In [27]:
# Free-running generation: start from '<sos>' and repeatedly sample the next
# word from the model's temperature-scaled output distribution.
torch.manual_seed(hparams.seed)
torch.cuda.manual_seed(hparams.seed)


# Alternative start: a random token instead of '<sos>', e.g.
#   torch.randint(hparams.vocab_size, (1, 1), dtype=torch.long)

input_ = torch.tensor(corpus.vocab['<sos>'])
input_ = input_.view(-1, 1)
hidden = model.init_hidden(1)

word = corpus.vocab.itos[input_.item()]
print(word, end=' ')

# Looked up once: '<unk>' is masked out at every step.
unk_idx = corpus.vocab['<unk>']

with torch.no_grad():
    for i in range(100):
        output, hidden = model(input_, hidden)

        # Use the configured temperature (previously hard-coded to 1), in
        # line with the interactive cell; the default temperature of 1.0
        # leaves the sampled text unchanged.
        word_weights = output.squeeze().div(hparams.temperature).exp()
        # Never sample the unknown-word token.
        word_weights[unk_idx] = 0.
        # multinomial accepts unnormalised non-negative weights.
        word_idx = torch.multinomial(word_weights, 1)[0].item()

        # Reuse the (1, 1) input tensor for the next step.
        input_.fill_(word_idx)
        word = corpus.vocab.itos[word_idx]
        print(word, end=' ')
        if word == '<eos>':
            # Start a new output line after each generated sentence.
            print()

<sos> <eos> 
<sos> it has ruled that no magazine saying might draw new professional for any part to big insurance <eos> 
<sos> a white house spokesman called the rules of congress with appropriations safety and milk system by gte ' s market <eos> 
<sos> such losses are to likely said stop estimated the japanese sides <eos> 
<sos> the increasing start in recent months are n ' t expected to join the united aid program <eos> 
<sos> congress was n ' t likely to kill approved general government confidence in the wake of a federal bank <eos> 
<sos> the new jersey meeting 

### 拓展

1. LSTM得到的句子表示本身就可以用于下游任务
2. LSTM的并行性不够友好，难以用于大规模语料上的训练 => Transformer
3. 文本生成的解码策略：Greedy Search vs Beam Search

reference: https://huggingface.co/blog/how-to-generate