In [1]:
# nplm (Neural Probabilistic Language Model 神经概率语言模型)

sentences = ["我 喜欢 玩具", "我 爱 爸爸", "我 讨厌 挨打"]
word_list = list(set(" ".join(sentences).split()))

word_to_idx = {word: idx for idx, word in enumerate(word_list)}
idx_to_word = {idx: word for idx, word in enumerate(word_list)}
voc_size = len(word_list)
print("词汇表", word_list)
print("词汇到索引", word_to_idx)
print("索引到词汇", idx_to_word)
print("词汇表大小", len(word_list))

词汇表 ['我', '玩具', '挨打', '爸爸', '讨厌', '爱', '喜欢']
词汇到索引 {'我': 0, '玩具': 1, '挨打': 2, '爸爸': 3, '讨厌': 4, '爱': 5, '喜欢': 6}
索引到词汇 {0: '我', 1: '玩具', 2: '挨打', 3: '爸爸', 4: '讨厌', 5: '爱', 6: '喜欢'}
词汇表大小 7


In [2]:
import torch 
import random

batch_size = 2
def make_batch(sentences, batch_size):
    input_batch = []
    target_batch = []
    selected_sentences = random.sample(sentences, batch_size) # 随机选择batch_size个句子
    for sentence in selected_sentences:
        words = sentence.split()
        inp = [word_to_idx[word] for word in words[:-1]] # 输入是除了最后一个词的所有词
        tgt = word_to_idx[words[-1]] # 目标是最后一个词
        input_batch.append(inp)
        target_batch.append(tgt)
    input_batch = torch.LongTensor(input_batch)
    # print("input.shape", input_batch.shape)
    target_batch = torch.LongTensor(target_batch)
    # print("target.shape", target_batch.shape)
    return input_batch, target_batch

input_batch, target_batch = make_batch(sentences, batch_size)
print("输入批处理数据", input_batch)
input_words = []
for input_idxs in input_batch:
    input_words.append([idx_to_word[idx.item()] for idx in input_idxs])
print("输入批处理数据对应原始词", input_words)

print("目标批处理数据", target_batch)
target_words = [idx_to_word[idx.item()] for idx in target_batch]
print("目标批处理数据对应原始词", target_words)

输入批处理数据 tensor([[0, 5],
        [0, 4]])
输入批处理数据对应原始词 [['我', '爱'], ['我', '讨厌']]
目标批处理数据 tensor([3, 2])
目标批处理数据对应原始词 ['爸爸', '挨打']


In [3]:
import torch.nn as nn

# 模型用来预测给定句子的下一个词
class NPLM(nn.Module):
    def __init__(self, voc_size, embedding_size, n_step, n_hidden):
        super(NPLM, self).__init__()
        # 从词汇表大小到嵌入层大小的线性层
        self.C = nn.Embedding(voc_size, embedding_size)
        self.linear1 = nn.Linear(n_step * embedding_size, n_hidden)
        self.linear2 = nn.Linear(n_hidden, voc_size)
    
    def forward(self, X): # X: [batch_size, n_step]
        X = self.C(X) # [batch_size, n_step, embedding_size]
        X = X.view(-1, n_step * embedding_size) # [batch_size, n_step * embedding_size]
        hidden = torch.tanh(self.linear1(X)) # [batch_size, n_hidden]
        output = self.linear2(hidden) # [batch_size, voc_size]
        return output
    

In [4]:
n_step = 2
n_hidden = 2
embedding_size = 2
model = NPLM(voc_size, embedding_size, n_step, n_hidden)
print("NPLM 模型结构", model)

NPLM 模型结构 NPLM(
  (C): Embedding(7, 2)
  (linear1): Linear(in_features=4, out_features=2, bias=True)
  (linear2): Linear(in_features=2, out_features=7, bias=True)
)


In [5]:
import torch.optim as optim

loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.1)

epoches = 5000
for epoch in range(epoches):
    optimizer.zero_grad()
    input_batch, target_batch = make_batch(sentences, batch_size)
    pred = model(input_batch)
    loss = loss_fn(pred, target_batch)
    if (epoch + 1) % 1000 == 0:
        print(f"epoch {epoch + 1} loss: {loss.item():.4f}")
    loss.backward()
    optimizer.step()

epoch 1000 loss: 0.0031
epoch 2000 loss: 0.0005
epoch 3000 loss: 0.0003
epoch 4000 loss: 0.0001
epoch 5000 loss: 0.0001


In [21]:
input_strs = [["我", "讨厌"], ["我", "喜欢"]]
input_idxs = [[word_to_idx[word] for word in input_str] for input_str in input_strs]

input_batch = torch.LongTensor(input_idxs)
# 预测最大的idx
print(model(input_batch).data.shape)
# print(model(input_batch).data.max(dim=1))
predict = model(input_batch).data.max(dim=1).indices # [batch_size, 1]
print("predict.shape", predict.shape)
# print("predict.sequeeze.shape", predict.squeeze().shape)
predict_strs = [idx_to_word[idx.item()] for idx in predict]

for input_seq, pred in zip(input_strs, predict_strs):
    print(input_seq, "->", pred)

torch.Size([2, 7])
predict.shape torch.Size([2])
['我', '讨厌'] -> 挨打
['我', '喜欢'] -> 玩具
