In [1]:
sentences = [
    ["咖哥 很 喜欢 小冰", "KaGe likes XiaoBing much"],
    ["我 爱 学习 人工智能", "I love studying AI"],
    ["深度学习 改变 世界", "DL changed the world"],
    ["自然语言 处理 很 强大", "NLP is so powerful"],
    ["神经网络 非常 复杂", "Neural Nets are complex"]
]

word_list_cn, word_list_en = [], ["<sos>", "<eos>"]
for s in sentences:
    word_list_cn.extend(s[0].split())
    word_list_en.extend(s[1].split())

word_list_cn = list(set(word_list_cn))
word_list_en = list(set(word_list_en))
# 单词->idx
word2idx_cn = {w: i for i, w in enumerate(word_list_cn)}
word2idx_en = {w: i for i, w in enumerate(word_list_en)}
# idx->单词
idx2word_cn = {i: w for i, w in enumerate(word_list_cn)}
idx2word_en = {i: w for i, w in enumerate(word_list_en)}

voc_size_cn = len(word2idx_cn)
voc_size_en = len(word2idx_en)
print("句子数量", len(sentences))
print("中文词表", len(word_list_cn))
print("英文词表", len(word_list_en))
print("中文词表->idx：", word2idx_cn)
print("英文词表->idx：", word2idx_en)

句子数量 5
中文词表 17
英文词表 22
中文词表->idx： {'自然语言': 0, '喜欢': 1, '处理': 2, '改变': 3, '很': 4, '小冰': 5, '世界': 6, '咖哥': 7, '非常': 8, '复杂': 9, '学习': 10, '我': 11, '神经网络': 12, '强大': 13, '爱': 14, '深度学习': 15, '人工智能': 16}
英文词表->idx： {'XiaoBing': 0, 'likes': 1, 'powerful': 2, 'the': 3, 'NLP': 4, '<sos>': 5, 'is': 6, 'Neural': 7, 'complex': 8, 'KaGe': 9, 'AI': 10, 'DL': 11, 'are': 12, 'studying': 13, 'so': 14, '<eos>': 15, 'world': 16, 'much': 17, 'I': 18, 'love': 19, 'changed': 20, 'Nets': 21}


In [2]:
# import numpy as np
import torch
import random

def make_data(sentences):
    sentence = random.choice(sentences)
    encode_input = [[word2idx_cn[word] for word in sentence[0].split()]]
    decoder_input = [[word2idx_en[word] for word in ("<sos> "+ sentence[1]).split()]]
    target = [[word2idx_en[word] for word in (sentence[1] + " <eos>").split()]]

    encode_input = torch.LongTensor(encode_input)
    decoder_input = torch.LongTensor(decoder_input)
    target = torch.LongTensor(target)
    return sentence, encode_input, decoder_input, target

sentence, encoder_input, decoder_input, target = make_data(sentences)
print("原始句子", sentence)
print(f"encoder_input: {encoder_input.shape} {encoder_input}")
print(f"decoder_input: {decoder_input.shape} {decoder_input}", )
print(f"target: {target.shape} {target}")

原始句子 ['自然语言 处理 很 强大', 'NLP is so powerful']
encoder_input: torch.Size([1, 4]) tensor([[ 0,  2,  4, 13]])
decoder_input: torch.Size([1, 5]) tensor([[ 5,  4,  6, 14,  2]])
target: torch.Size([1, 5]) tensor([[ 4,  6, 14,  2, 15]])


In [3]:
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True) 
    
    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs) # 输入转embedding
        output, hidden = self.rnn(embedded, hidden)
        return output, hidden

class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
    
    def forward(self, inputs, hidden):
        embedded = self.embedding(inputs) # 输入转embedding
        output, hidden = self.rnn(embedded, hidden)
        output = self.out(output)
        return output, hidden

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, encoder_inputs, hidden, decoder_inputs):
        encoder_outputs, encoder_hidden = self.encoder(encoder_inputs, hidden)
        decoder_hidden = encoder_hidden
        decoder_outputs, _ = self.decoder(decoder_inputs, decoder_hidden)
        return decoder_outputs



In [4]:
def train_seq2seq(model, loss_fn, optimizer, epoches, debug=False):
    for epoch in range(epoches):
        sentence, encoder_input, decoder_input, decoder_target = make_data(sentences)
        hidden = torch.zeros(1, encoder_input.size(0), n_hidden) # 初始化隐藏层状态
        if debug:
            print("sentence", sentence)
        optimizer.zero_grad()
        output = model(encoder_input, hidden, decoder_input)
        # if debug:
        #     print("output", output.view(-1, voc_size_en).shape)
        #     print("target", target.view(-1).shape)
        loss = loss_fn(output.view(-1, voc_size_en), target.view(-1))
        if (epoch + 1) % 100 == 0:
            print(f"epoch {epoch + 1:04d} loss: {loss.item():.06f}")
        loss.backward()
        optimizer.step()

In [5]:
n_hidden = 128
encoder = Encoder(voc_size_cn, n_hidden)
decoder = Decoder(n_hidden, voc_size_en)
print("encode:", encoder)
print("decode:", decoder)
model = Seq2Seq(encoder, decoder)
print("seq2seq", model)
epoches = 1000
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
train_seq2seq(model, loss_fn, optimizer, epoches, False)

encode: Encoder(
  (embedding): Embedding(17, 128)
  (rnn): RNN(128, 128, batch_first=True)
)
decode: Decoder(
  (embedding): Embedding(22, 128)
  (rnn): RNN(128, 128, batch_first=True)
  (out): Linear(in_features=128, out_features=22, bias=True)
)
seq2seq Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(17, 128)
    (rnn): RNN(128, 128, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(22, 128)
    (rnn): RNN(128, 128, batch_first=True)
    (out): Linear(in_features=128, out_features=22, bias=True)
  )
)
epoch 0100 loss: 0.013292
epoch 0200 loss: 0.006041
epoch 0300 loss: 0.003218
epoch 0400 loss: 0.002147
epoch 0500 loss: 0.001396
epoch 0600 loss: 0.001132
epoch 0700 loss: 0.000824
epoch 0800 loss: 0.000704
epoch 0900 loss: 0.000547
epoch 1000 loss: 0.000473


In [16]:
def test_seq2seq(model, source_sentence, debug=False):
    encoder_input = [[word2idx_cn[word] for word in source_sentence.split()]]
    # if debug:
    #     print("encoder_input.shape", len(encoder_input[0]))
    #     print("encode_input", encoder_input.size(0))
    decoder_input = [word2idx_en['<sos>']] + [word2idx_en['<eos>']]*(len(encoder_input[0])-1)

    encoder_input = torch.LongTensor(encoder_input)
    decoder_input = torch.LongTensor(decoder_input).unsqueeze(0)
    # if debug:
    #     print("encoder_input.shape", encoder_input.shape)
    #     print("decoder_input.shape", decoder_input.shape)
    hidden = torch.zeros(1, encoder_input.size(0), n_hidden)
    predict = model(encoder_input, hidden, decoder_input)
    if debug:
        print("predict.shape", predict.shape)
        print("predict", predict)
    predict = predict.data.max(2, keepdim=True)[1]
    if debug:
        print("predict.shape", predict.shape)
        print("predict", predict)
    print(f"'{source_sentence}' ->", "'" + " ".join([idx2word_en[n.item()] for n in predict.squeeze()]) + "'") 

model.eval()
test_seq2seq(model, "咖哥 喜欢 小冰", False)
test_seq2seq(model, "神经网络 非常 复杂", False)

'咖哥 喜欢 小冰' -> 'NLP is so'
'神经网络 非常 复杂' -> 'NLP is so'
