# Seq2seq进行机器翻译与存取模型

当前最复杂的NLP应用之一就是机器翻译，我们今天来讲一个使用Seq2seq模型进行机器翻译的例子，顺便给大家介绍一下存取模型。

In [1]:
import torch
import numpy as np
from torch import nn

from models.Seq2seqTranslate import Seq2seq_translater
from dataset_readers.trans import *
from utils.tokenizer import Tokenizer

我们本次就不使用预训练的词向量（emb）啦：今天的任务是把英语翻译成法语，而加载和处理英语的预训练词向量要花太多时间，所以词嵌入层就直接随机初始化，让它随着训练一起学习吧。

首先我们来建立一个Seq2seq的网络。一般的Seq2seq网络都主要分成两个部分：Encoder和Decoder。

In [3]:
# 之前我们用的一些无参数的函数其实在这个大F里都有，我们可以不用预先初始化，直接使用
import torch.nn.functional as F

# Encoder: an ordinary embedding + GRU recurrent network
class EncoderRNN(nn.Module):
    """Encode a batch of token ids into one GRU output vector per time step.

    Args:
        input_size: number of distinct input token ids (rows of the
            embedding table).
        hidden_size: width of the embeddings and of the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        # The embedding table keeps PyTorch's default initialisation and is
        # learned together with the rest of the network.
        self.embedding = nn.Embedding(input_size, hidden_size)
        # batch_first=True: the GRU reads and writes tensors laid out as
        # (batch_size, seq_length, hidden_size).
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)

    def forward(self, x):
        """Return the GRU output at every time step for the id tensor ``x``.

        The final hidden state produced by the GRU is discarded.
        """
        step_outputs, _ = self.gru(self.embedding(x))
        return step_outputs

# Decoder with an attention mechanism over the encoder outputs
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over the encoder outputs at every step.

    Args:
        hidden_size: width of the embeddings, the GRU state and the
            encoder outputs.
        output_size: number of output classes (rows of the embedding table
            and width of the final projection).
        dropout_p: dropout probability applied to the embedded inputs.
        max_length: largest encoder sequence length the attention layer can
            score.
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=128):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        # Token embedding for the decoder inputs.
        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # Maps each embedded input to one attention score per encoder position.
        self.attn = nn.Linear(self.hidden_size, self.max_length)
        # Fuses the embedded input with its attention context vector.
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size, batch_first=True)
        # Projects every GRU output onto the output classes.
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, x, encoder_outputs):
        """Decode ``x`` while attending over ``encoder_outputs``.

        Args:
            x: integer id tensor of shape (batch, steps).
            encoder_outputs: tensor of shape (batch, src_len, hidden_size)
                with ``src_len <= max_length``. A longer sequence still
                fails inside ``torch.bmm`` with a size-mismatch error.

        Returns:
            A tuple ``(output, hidden, attn_weights)``: unnormalised class
            scores of shape (batch, steps, output_size), the final GRU
            hidden state, and the attention weights of shape
            (batch, steps, src_len).
        """
        embedded = self.dropout(self.embedding(x))

        # The attention layer always emits max_length scores. Keep only the
        # ones that correspond to real encoder positions so that shorter
        # source sequences work and the softmax is taken over them alone.
        src_len = encoder_outputs.size(1)
        attn_scores = self.attn(embedded)[:, :, :src_len]
        attn_weights = F.softmax(attn_scores, dim=2)
        # Weighted sum of encoder outputs: one context vector per step.
        attn_applied = torch.bmm(attn_weights, encoder_outputs)

        output = torch.cat((embedded, attn_applied), dim=2)
        output = self.attn_combine(output)

        output = F.relu(output)
        # No initial hidden state is passed, so the GRU starts from zeros.
        output, hidden = self.gru(output)

        output = self.out(output)

        return output, hidden, attn_weights


class Seq2seq_translater(nn.Module):
    """Encoder/decoder pair that returns a loss or class probabilities.

    Args:
        input_size: number of distinct ids accepted by the encoder.
        hidden_size: shared hidden width of encoder and decoder.
        output_size: number of output classes.
        max_seq_length: forwarded to the decoder as its ``max_length``.
    """

    def __init__(self, input_size, hidden_size, output_size, max_seq_length):
        super().__init__()
        self.encoder = EncoderRNN(input_size, hidden_size)
        self.decoder = AttnDecoderRNN(
            hidden_size, output_size, max_length=max_seq_length
        )
        self.loss_fct = nn.CrossEntropyLoss()
        self.output_size = output_size

    def forward(self, x, y=None):
        """Return the cross-entropy loss when ``y`` is given, else probabilities.

        NOTE(review): the decoder receives the same ids ``x`` as the encoder,
        so every id in ``x`` must also be a valid index into the decoder's
        embedding (below ``output_size``), and ``y`` must hold one label per
        element of ``x`` - confirm this is the intended design.
        """
        # Only the first element of the decoder result (the class scores)
        # is used; the hidden state and attention weights are dropped.
        scores = self.decoder(x, self.encoder(x))[0]

        if y is None:
            return F.softmax(scores, dim=2)
        return self.loss_fct(scores.view(-1, self.output_size), y.view(-1))

In [4]:
# The task here is part-of-speech tagging on the Zuozhuan corpus; the maximum sequence length is set to 20
seq_length = 20
# BIO tags are used with O mapped to B, so there are still only two classes
label_len = 2
# NOTE(review): LSTMTaggerNet is neither defined nor explicitly imported in the
# visible part of this notebook (only the Seq2seq classes are); unless a star
# import supplies it, this cell looks like a leftover from a tagging notebook - confirm.
model = LSTMTaggerNet(seq_length, label_len, bidirectional=True)
# print() shows the structure of the network
print(model)

# Sum the element counts of every parameter that requires gradients
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(str(total_trainable_params), 'parameters is trainable.')

# Move the model to the GPU when CUDA is available
if torch.cuda.is_available():
    model.to(torch.device('cuda'))

LSTMTaggerNet(
  (emb): Embedding(9110, 300)
  (lstm): LSTM(300, 300, bidirectional=True)
  (FC_out): Sequential(
    (0): Linear(in_features=600, out_features=50, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=50, out_features=2, bias=True)
  )
  (softmax): Softmax(dim=-1)
  (loss_fct): CrossEntropyLoss()
)
1474952 parameters is trainable.


同样的，这里用到的是已经封装好的`dataset_readers`包，里面存放了读取数据的类。之前我们都是只跑训练、没有跑eval，从现在开始我们的数据都会进行“train/dev”划分，根据模型在开发集（dev）上的表现来评估效果，这也是一般NN工程的通用做法。

In [5]:
from dataset_readers.single_sent_clf import *
from dataset_readers.cws import *

# Reader class used to load the Zuozhuan data
reader = Zuozhuan_Cws()
# Fetch the training set
train_examples = reader.get_train_examples()
# Fetch the development set
dev_examples = reader.get_dev_examples()
# Preview the text and label of the first three training examples
for i in range(3):
    print(train_examples[i].text, train_examples[i].label)
# Preview the text and label of the first three development examples
for i in range(3):
    print(dev_examples[i].text, dev_examples[i].label)

春秋左传定公 ['B', 'I', 'B', 'I', 'B', 'I']
元年 ['B', 'I']
春 ['B']
春秋左传隐公 ['B', 'I', 'B', 'I', 'B', 'I']
惠公元妃孟子 ['B', 'I', 'B', 'I', 'B', 'I']
孟子卒 ['B', 'I', 'B']


还是一样生成dataloader，只不过这次有两个，一个是train，一个是dev。

In [6]:
from torch.utils.data import TensorDataset, DataLoader

def convert_example_to_feature(examples, tokenizer, seq_length):
    """Convert examples into fixed-length id / label-id features.

    Args:
        examples: iterable of objects with a ``text`` attribute and a
            ``label`` sequence of 'B' / 'I' tags.
        tokenizer: object whose ``tokens_to_ids(text)`` returns a list of
            integer ids.
        seq_length: length every feature is truncated or zero-padded to.

    Returns:
        A list of ``data_feature(ids, label_ids)`` objects. Examples whose
        ids sum to 0 are left out.

    Raises:
        KeyError: if a label is neither 'B' nor 'I'.
        AssertionError: if the padded ids or label ids do not end up exactly
            ``seq_length`` long (e.g. text and label lengths differ).
    """
    # Tag-to-id mapping: B -> 1, I -> 0. Built once instead of per example.
    trans = {'B': 1, 'I': 0}
    features = []
    for example in examples:
        # Convert the string into numeric ids and cut it at the maximum
        # length. Slicing also copies, so the padding below never mutates
        # the list object returned by the tokenizer.
        ids = tokenizer.tokens_to_ids(example.text)[:seq_length]
        label = example.label[:seq_length]
        # Drop the example when nothing in it was recognised
        # (presumably unknown tokens map to id 0 - verify against the tokenizer).
        if sum(ids) == 0:
            continue
        # Use the truncated labels so that ids and label ids stay aligned;
        # building them from the full label list broke the length check
        # for every example longer than seq_length.
        label_ids = [trans[tag] for tag in label]

        # NOTE(review): padded label positions get 0, the same id as 'I'.
        padding = [0] * (seq_length - len(ids))
        ids += padding
        label_ids += padding

        assert len(ids) == seq_length
        assert len(label_ids) == seq_length
        features.append(data_feature(ids, label_ids))
    return features

def generate_dataloader(examples, tokenizer, seq_length, batch_size=16, shuffle=True):
    """Build a DataLoader of (ids, label_ids) long tensors from examples.

    Args:
        examples: examples passed on to ``convert_example_to_feature``.
        tokenizer: tokenizer passed on to ``convert_example_to_feature``.
        seq_length: fixed length of every feature.
        batch_size: number of examples per batch; defaults to 16, the value
            that used to be hard-coded.
        shuffle: whether the loader reshuffles the data each epoch;
            defaults to True, the previous behaviour.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` of the id tensor and the
        label-id tensor.
    """
    features = convert_example_to_feature(examples, tokenizer, seq_length)
    ids = torch.tensor([f.ids for f in features], dtype=torch.long)
    label = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(ids, label)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

# Build one dataloader per split from the examples loaded above.
# NOTE(review): `tokenizer` is not created in the visible part of this notebook
# (only the Tokenizer class is imported) - confirm it is instantiated beforehand.
train_dataloader = generate_dataloader(train_examples, tokenizer, seq_length)
dev_dataloader = generate_dataloader(dev_examples, tokenizer, seq_length)

依然使用Adam优化器。

In [7]:
from torch.optim import Adam

# Adam over all model parameters with a learning rate of 1e-4
optimizer = Adam(model.parameters(), lr=0.0001)
# Printing the optimizer lists its hyper-parameters
print(optimizer)

Adam (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    eps: 1e-08
    lr: 0.0001
    weight_decay: 0
)


开始训练，我们这里使用sklearn提供的评估方法来进行评估。

In [8]:
# Precision, recall and F1 can all be computed directly with sklearn
from sklearn.metrics import precision_score, recall_score, f1_score

epoch = 10
# Resolve the target device once instead of re-checking CUDA for every batch.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for i in range(epoch):
    # ---- training pass ----
    model.train()
    total_loss = []
    for ids, label_ids in train_dataloader:
        ids = ids.to(device)
        label_ids = label_ids.to(device)
        optimizer.zero_grad()
        # Calling the model with labels returns the loss.
        loss = model(ids, label_ids)
        total_loss.append(loss.item())
        loss.backward()
        optimizer.step()
    print("epoch: %d, loss: %.6f" % (i + 1, sum(total_loss) / len(total_loss)))

    # ---- evaluation pass on the dev set ----
    model.eval()
    total_gold = []
    total_pred = []
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for ids, label_ids in dev_dataloader:
            ids = ids.to(device)
            # Predict; the result holds one score per class in its last axis
            logits = model(ids)
            # Convert to numpy
            logits = logits.cpu().numpy()
            # Index of the highest score along the class axis, flattened to
            # one prediction per token so that it lines up with the
            # flattened gold labels below even when the model returns a
            # (batch, seq, classes) array.
            preds = np.argmax(logits, axis=-1).reshape(-1).tolist()
            # Append to the end of the overall predictions
            total_pred.extend(preds)
            # Append the gold labels, flattened the same way; they were
            # never moved off the CPU.
            total_gold.extend(label_ids.view(-1).tolist())
    # eval_p = precision_score(total_gold, total_pred)
    # eval_r = recall_score(total_gold, total_pred)
    eval_f1 = f1_score(total_gold, total_pred)
    print("eval_f1: %.2f%%" % (eval_f1 * 100))

epoch: 1, loss: 0.176529
eval_f1: 91.93%
epoch: 2, loss: 0.068826
eval_f1: 93.50%
epoch: 3, loss: 0.055466
eval_f1: 93.87%
epoch: 4, loss: 0.047732
eval_f1: 93.85%
epoch: 5, loss: 0.043406
eval_f1: 94.16%
epoch: 6, loss: 0.040075
eval_f1: 93.70%
epoch: 7, loss: 0.037694
eval_f1: 94.35%
epoch: 8, loss: 0.035436
eval_f1: 93.63%
epoch: 9, loss: 0.033188
eval_f1: 94.09%
epoch: 10, loss: 0.031708
eval_f1: 94.14%
