语言模型小实验（
------
- 使用torchtext初体验
- nn.LSTM

In [1]:
import torchtext
from torchtext.vocab import Vectors
import torch
import numpy as np
import random

USE_CUDA = torch.cuda.is_available()

random.seed(2021)
np.random.seed(2021)
torch.manual_seed(2021)
if USE_CUDA:
    torch.cuda.manual_seed(2021)
    
BATCH_SIZE = 32
EMBEDDING_SIZE = 100
HIDDEN_SIZE = 100
MAX_VOCAB_SIZE = 50000

In [2]:
TEXT = torchtext.legacy.data.Field(lower=True) # Field相当于一个预处理工具
train, val, test = torchtext.legacy.datasets.LanguageModelingDataset.splits(path=".",
                                                 train="text8.train.txt",
                                                 validation="text8.train.txt",
                                                 test="text8.train.txt",
                                                 text_field=TEXT)

In [3]:
TEXT.build_vocab(train, max_size=MAX_VOCAB_SIZE)

In [4]:
device = torch.device("cuda" if USE_CUDA else "cpu")

In [5]:
train_iter, val_iter, test_iter = torchtext.legacy.data.BPTTIterator.splits((train, val, test),
                                  batch_size=BATCH_SIZE,
                                  device=device,
                                  bptt_len=50,
                                  shuffle=True)

In [6]:
it = iter(train_iter)
batch = next(it)
batch


[torchtext.legacy.data.batch.Batch of size 32]
	[.text]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]
	[.target]:[torch.cuda.LongTensor of size 50x32 (GPU 0)]

In [8]:
print(" ".join(TEXT.vocab.itos[i] for i in batch.text[:,31].data.cpu()))
print()
print(" ".join(TEXT.vocab.itos[i] for i in batch.target[:,31].data.cpu()))

lycaon in greek mythology was a son of priam and <unk> during the trojan war lycaon was captured by achilles while cutting branches in priam s orchard achilles sold him as a slave to <unk> of lemnos but <unk> of <unk> bought him and took him back to troy only

in greek mythology was a son of priam and <unk> during the trojan war lycaon was captured by achilles while cutting branches in priam s orchard achilles sold him as a slave to <unk> of lemnos but <unk> of <unk> bought him and took him back to troy only twelve


In [9]:
import torch.nn as nn

class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTMModel, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(embed_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, text, hidden):
        # text:[seq_length, batch_size]
        emb = self.embed(text)
        # emb:[seq_length, batch_size, embed_size]
        output, hidden = self.lstm(emb, hidden)
        # output:[seq_length, batch_size, hidden_size]
        # hidden:[[1, batch_size, hidden_size], [1, batch_size, hidden_size]]
        decoded = self.decoder(output.view(-1, output.shape[2]))
        # decoded:[(seq_length*batch_size), vocab_size]
        decoded = decoded.view(output.size(0), output.size(1), decoded.size(-1))
        # decoded:[seq_length, batch_size, vocab_size]
        return decoded, hidden
    
    def init_hidden(self, bsz, requires_grad=True):
        weight = next(self.parameters())
        return (weight.new_zeros((1, bsz, self.hidden_size), requires_grad=True),
               weight.new_zeros((1, bsz, self.hidden_size), requires_grad=True))

In [10]:
model = LSTMModel(vocab_size=len(TEXT.vocab), embed_size=EMBEDDING_SIZE, hidden_size=HIDDEN_SIZE)

if USE_CUDA:
    model = model.to(device)

In [11]:
model

LSTMModel(
  (embed): Embedding(50002, 100)
  (lstm): LSTM(100, 100)
  (decoder): Linear(in_features=100, out_features=50002, bias=True)
)

训练模型
----

In [12]:
# 重新打包，把值复制过来，但不要计算图的历史，相当于bp有了新起点
def repackage_hidden(h):
    if isinstance(h, torch.Tensor):
        return h.detach()
    else:
        return tuple(repackage_hidden(t) for t in h)

In [21]:
def evaluate(model, val_iter):
    model.eval() # model.eval()不启用BN和Dropout
    total_loss = 0.
    total_count = 0.
    
    it = iter(val_iter)
    with torch.no_grad():
        hidden = model.init_hidden(BATCH_SIZE, requires_grad=False)
        for i, batch in enumerate(it):
            data, target = batch.text, batch.target
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden) # bp all the times, use detach to cut off bp in some branches

            loss = loss_fn(output.view(-1, len(TEXT.vocab)), target.view(-1))
            total_loss += loss.item() * np.multiply(*data.size())
            total_count += np.multiply(*data.size())
    
    loss = total_loss / total_count
    model.train()
    return loss

In [16]:
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5) # lr decay

In [22]:
NUM_EPOCHS = 2
GRAD_CLIP = 5.0

val_losses = []

for epochs in range(NUM_EPOCHS):
    model.train() # model.train()能够启用BN和Dropout
    it = iter(train_iter)
    hidden = model.init_hidden(BATCH_SIZE)
    for i, batch in enumerate(it):
        data, target = batch.text, batch.target
        hidden = repackage_hidden(hidden)
        output, hidden = model(data, hidden) # bp all the times, use detach to cut off bp in some branches
        
        loss = loss_fn(output.view(-1, len(TEXT.vocab)), target.view(-1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), GRAD_CLIP) # 梯度裁切
        optimizer.step()
        if i % 100 == 0:
            print("epoch", epochs, "iteration", i, "loss", loss.item())
            
        if i % 1000 == 0:
            val_loss = evaluate(model, val_iter)
            if len(val_losses) == 0 or val_loss < min(val_losses):
                torch.save(model.state_dict(), "lm.pth")
                print("best model saved to lm.pth, val_loss=", val_loss)
            else:
                # lr decay
                scheduler.step()
            val_losses.append(val_loss)

epoch 0 iteration 0 loss 10.794493675231934
best model saved to lm.pth, val_loss= 10.811535831403834
epoch 0 iteration 100 loss 7.261686325073242
epoch 0 iteration 200 loss 7.008199691772461
epoch 0 iteration 300 loss 7.037992000579834
epoch 0 iteration 400 loss 7.0093536376953125
epoch 0 iteration 500 loss 6.877022743225098
epoch 0 iteration 600 loss 6.638118743896484
epoch 0 iteration 700 loss 6.646942615509033
epoch 0 iteration 800 loss 6.741439342498779
epoch 0 iteration 900 loss 6.606591701507568
epoch 0 iteration 1000 loss 6.38194465637207
best model saved to lm.pth, val_loss= 6.678328329912397
epoch 0 iteration 1100 loss 6.816585063934326
epoch 0 iteration 1200 loss 6.3791184425354
epoch 0 iteration 1300 loss 6.575960159301758
epoch 0 iteration 1400 loss 6.609281539916992
epoch 0 iteration 1500 loss 6.168779373168945
epoch 0 iteration 1600 loss 6.3976898193359375
epoch 0 iteration 1700 loss 6.310326099395752
epoch 0 iteration 1800 loss 6.509497165679932
epoch 0 iteration 1900 lo

epoch 1 iteration 6200 loss 5.68695068359375
epoch 1 iteration 6300 loss 5.757584095001221
epoch 1 iteration 6400 loss 5.853007793426514
epoch 1 iteration 6500 loss 5.362195014953613
epoch 1 iteration 6600 loss 5.741400718688965
epoch 1 iteration 6700 loss 5.702823638916016
epoch 1 iteration 6800 loss 5.3430328369140625
epoch 1 iteration 6900 loss 5.226779937744141
epoch 1 iteration 7000 loss 5.514254093170166
best model saved to lm.pth, val_loss= 5.5073272079715565
epoch 1 iteration 7100 loss 5.506911754608154
epoch 1 iteration 7200 loss 5.317791938781738
epoch 1 iteration 7300 loss 5.584412097930908
epoch 1 iteration 7400 loss 5.51431941986084
epoch 1 iteration 7500 loss 5.299510478973389
epoch 1 iteration 7600 loss 5.710257530212402
epoch 1 iteration 7700 loss 5.506960868835449
epoch 1 iteration 7800 loss 5.25291633605957
epoch 1 iteration 7900 loss 5.331318378448486
epoch 1 iteration 8000 loss 5.390519618988037
best model saved to lm.pth, val_loss= 5.479782944949678
epoch 1 iterati

In [23]:
val_loss = evaluate(model, val_iter)
print(val_loss)

5.44903101332044


In [24]:
torch.save(model.state_dict(), "lm.pth")

In [25]:
best_model = LSTMModel(vocab_size=len(TEXT.vocab), embed_size=EMBEDDING_SIZE, hidden_size=HIDDEN_SIZE)

if USE_CUDA:
    model = model.to(device)
best_model.load_state_dict(torch.load("lm.pth"))

<All keys matched successfully>