In [None]:
from argparse import Namespace # 管理所有超参数
from collections import Counter # 统计词语数量
import string # 标点符号调用
import re # 正则表达式
import os # 生成文件路径
import json # 保存模型为JSON格式
import numpy as np # 数据处理
import pandas as pd # 文本处理
import torch # 调用PyTorch库
import torch.nn as nn # 调用神经网络层
import torch.nn.functional as F # 调用激活函数
import torch.optim as optim # 调用优化器
from torch.utils.data import Dataset, DataLoader # 调用批生成器
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence # 处理变长数据

In [None]:
# Hyper-parameters and file locations shared by every cell below.
args = Namespace(
    nmt_csv="cmn_data", # path of the parallel-corpus CSV (read with pd.read_csv)
    vectorizer_file="news_folder/vectorizer2.json", # where the vectorizer is stored as JSON
    model_state_file="news_folder/model2.pth", # where the model state dict is stored
    predicted_file="news_folder/predicted2.csv", # prediction output path (not referenced in the visible cells)
    char_embedding_size=100, # character-embedding size (not referenced in the visible cells)
    rnn_hidden_size=64, # RNN hidden size (not referenced in the visible cells; the model uses encoding_size)
    num_epochs=20, # number of training epochs
    learning_rate=5e-4, # initial learning rate handed to Adam
    batch_size=32, # batch size
    seed=1337, # seed for numpy and torch
    early_stopping_criteria=3, # stop after this many consecutive epochs whose val loss did not drop
    sampling=0.5, # fraction of CSV rows kept (leading rows) to shorten CPU training
    source_embedding_size=24, # embedding size of source-language tokens
    target_embedding_size=24, # embedding size of target-language tokens
    encoding_size=32 # encoder GRU hidden size (per direction)
)

In [None]:
class Vocabulary(object):

    '''Bidirectional mapping between tokens and their integer indices.'''

    def __init__(self, token_to_idx=None):
        '''
        Args:
            token_to_idx (dict, optional): existing token -> index mapping to
                start from. When omitted, the vocabulary starts empty.
        '''
        # A literal ``{}`` default is evaluated once, so every vocabulary built
        # without arguments used to share (and mutate) the same dict. Default
        # to None and copy, so each instance owns its own mapping.
        self._token_to_idx = {} if token_to_idx is None else dict(token_to_idx)
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

    def add_token(self, token):
        '''Register `token` if it is new and return its index either way.'''
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # New tokens receive the next free index.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        '''Return the index of `token`; raises KeyError for unknown tokens.'''
        return self._token_to_idx[token]

    def lookup_index(self, index):
        '''Return the token stored at `index`.

        Raises:
            KeyError: if `index` is not part of the vocabulary.
        '''
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def to_serializable(self):
        '''Return the constructor arguments as a JSON-friendly dict.'''
        return {'token_to_idx': self._token_to_idx}

    @classmethod
    def from_serializable(cls, contents):
        '''Rebuild an instance from the dict produced by `to_serializable`.'''
        return cls(**contents)

    def __str__(self):
        '''Short human-readable summary used by print().'''
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        '''Number of tokens in the vocabulary.'''
        return len(self._token_to_idx)
    

In [None]:
class SequenceVocabulary(Vocabulary):

    '''Vocabulary that also reserves the special sequence tokens
    (mask, unknown, begin-of-sequence, end-of-sequence).'''

    def __init__(self, token_to_idx=None, unk_token='<UNK>', mask_token='<MASK>',
                 begin_token='<BEGIN>', end_token='<END>'):
        '''
        Args:
            token_to_idx (dict, optional): existing token -> index mapping to
                start from. When omitted, the vocabulary starts empty.
            unk_token, mask_token, begin_token, end_token (str): the special
                tokens registered in the vocabulary.
        '''
        # A literal ``{}`` default would be one dict shared by every instance
        # created without arguments (e.g. the source and the target vocabulary),
        # so build a fresh mapping per instance and never mutate the caller's.
        token_to_idx = {} if token_to_idx is None else dict(token_to_idx)
        super(SequenceVocabulary, self).__init__(token_to_idx)

        # Remember the special tokens so they can be serialized again.
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_token = begin_token
        self._end_token = end_token

        # Register the special tokens and keep their indices. The mask token is
        # added first, so it receives index 0 in an empty vocabulary.
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_index = self.add_token(self._begin_token)
        self.end_index = self.add_token(self._end_token)

    def lookup_token(self, token):
        '''Return the index of `token`, or the UNK index for unknown tokens.'''
        return self._token_to_idx.get(token, self.unk_index)

    def to_serializable(self):
        '''Return the constructor arguments as a JSON-friendly dict.'''
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_token': self._begin_token,
                         'end_token': self._end_token})
        return contents
    

In [None]:
class NMTVectorizer(object):

    '''Converts source/target sentence pairs into padded index vectors.'''

    def __init__(self, source_vocab, target_vocab, max_source_length, max_target_length):
        """
        Args:
            source_vocab: vocabulary of the text to be translated
            target_vocab: vocabulary of the translations
            max_source_length: longest source sentence, counted in tokens
            max_target_length: longest target sentence, counted in tokens
        """
        self.source_vocab = source_vocab
        self.target_vocab = target_vocab
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def _vectorize(self, indices, vector_length=-1, mask_index=0):
        """Copy `indices` into an int64 array, right-padded with `mask_index`.

        A negative `vector_length` means "exactly as long as `indices`".
        """
        if vector_length < 0:
            vector_length = len(indices)
        vector = np.zeros(vector_length, dtype=np.int64)
        vector[:len(indices)] = indices
        vector[len(indices):] = mask_index
        return vector

    def _get_source_indices(self, text):
        """Return BEGIN + token indices + END for a source sentence."""
        indices = [self.source_vocab.begin_index]
        indices.extend(self.source_vocab.lookup_token(token) for token in text.split(" "))
        indices.append(self.source_vocab.end_index)
        return indices

    def _get_target_indices(self, text):
        """Return the decoder input (BEGIN + tokens) and the decoder
        target (tokens + END) for a target sentence."""
        indices = [self.target_vocab.lookup_token(token) for token in text.split(" ")]
        x_indices = [self.target_vocab.begin_index] + indices
        y_indices = indices + [self.target_vocab.end_index]
        return x_indices, y_indices

    def vectorize(self, source_text, target_text, use_dataset_max_lengths=True):
        """Vectorize one sentence pair.

        Args:
            source_text (str): space-separated source sentence
            target_text (str): space-separated target sentence
            use_dataset_max_lengths (bool): pad to the dataset-wide maximum
                lengths; otherwise each vector is as long as its own sentence
        Returns:
            dict with keys "source_vector", "target_x_vector",
            "target_y_vector" (numpy int64 arrays) and "source_length"
            (number of source indices including BEGIN and END)
        """
        source_vector_length = -1
        target_vector_length = -1
        if use_dataset_max_lengths:
            # +2 for BEGIN and END on the source side; +1 on the target side
            # because x carries only BEGIN and y carries only END.
            source_vector_length = self.max_source_length + 2
            target_vector_length = self.max_target_length + 1
        source_indices = self._get_source_indices(source_text)
        source_vector = self._vectorize(source_indices,
                                        vector_length=source_vector_length,
                                        mask_index=self.source_vocab.mask_index)
        target_x_indices, target_y_indices = self._get_target_indices(target_text)
        target_x_vector = self._vectorize(target_x_indices,
                                          vector_length=target_vector_length,
                                          mask_index=self.target_vocab.mask_index)
        target_y_vector = self._vectorize(target_y_indices,
                                          vector_length=target_vector_length,
                                          mask_index=self.target_vocab.mask_index)
        return {"source_vector": source_vector,
                "target_x_vector": target_x_vector,
                "target_y_vector": target_y_vector,
                "source_length": len(source_indices)}

    @classmethod
    def from_dataframe(cls, bitext_df):
        """Build a vectorizer from a frame with 'English' and 'Chinese' columns.

        Maximum lengths are taken over every row, so that later splits also
        fit; the vocabularies are built from the first 70% of the rows only.
        """
        source_vocab = SequenceVocabulary()
        target_vocab = SequenceVocabulary()
        # Lengths must be counted in tokens, the unit `vectorize` works with.
        # Counting characters (len of the raw string) over-pads every vector
        # and can even under-count for whitespace-only text.
        max_source_length = max(len(text.split(" ")) for text in bitext_df['English'])
        max_target_length = max(len(text.split(" ")) for text in bitext_df['Chinese'])
        train_df = bitext_df.iloc[0:int(len(bitext_df)*0.7)]
        for source_text, target_text in zip(train_df['English'], train_df['Chinese']):
            for token in source_text.split(" "):
                source_vocab.add_token(token)
            for token in target_text.split(" "):
                target_vocab.add_token(token)
        return cls(source_vocab, target_vocab, max_source_length, max_target_length)

    @classmethod
    def from_serializable(cls, contents):
        """Rebuild an instance from the dict produced by `to_serializable`."""
        source_vocab = SequenceVocabulary.from_serializable(contents["source_vocab"])
        target_vocab = SequenceVocabulary.from_serializable(contents["target_vocab"])
        return cls(source_vocab=source_vocab,
                   target_vocab=target_vocab,
                   max_source_length=contents["max_source_length"],
                   max_target_length=contents["max_target_length"])

    def to_serializable(self):
        """Return the constructor arguments as a JSON-friendly dict."""
        return {"source_vocab": self.source_vocab.to_serializable(),
                "target_vocab": self.target_vocab.to_serializable(),
                "max_source_length": self.max_source_length,
                "max_target_length": self.max_target_length}
    

In [None]:
class NMTDataset(Dataset):

    '''Translation dataset that splits the rows into train/val/test and
    vectorizes one sentence pair per item.'''

    def __init__(self, text_df, vectorizer):
        """
        Args:
            text_df: frame holding the sentence pairs ('English'/'Chinese' columns)
            vectorizer: vectorizer used to turn each pair into index vectors
        """
        self.text_df = text_df
        self._vectorizer = vectorizer

        # Positional 70% / 15% / 15% split; the row order is left untouched.
        total_rows = len(self.text_df)
        train_end = int(total_rows*0.7)
        val_end = int(total_rows*0.85)

        self.train_df = self.text_df.iloc[0:train_end]
        self.train_size = len(self.train_df)
        self.val_df = self.text_df.iloc[train_end:val_end]
        self.val_size = len(self.val_df)
        self.test_df = self.text_df.iloc[val_end:]
        self.test_size = len(self.test_df)

        # set_split() picks the active (frame, size) pair out of this table.
        self._lookup_dict = {}
        self._lookup_dict['train'] = (self.train_df, self.train_size)
        self._lookup_dict['val'] = (self.val_df, self.val_size)
        self._lookup_dict['test'] = (self.test_df, self.test_size)
        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, dataset_csv, sampling):
        """Read the CSV, keep the leading `sampling` fraction of its rows and
        build a new vectorizer from them."""
        full_df = pd.read_csv(dataset_csv)
        sampled_df = full_df.iloc[0:int(len(full_df)*sampling)]
        return cls(sampled_df, NMTVectorizer.from_dataframe(sampled_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, dataset_csv, vectorizer_filepath, sampling):
        """Read the CSV, keep the leading `sampling` fraction of its rows and
        reuse the vectorizer stored at `vectorizer_filepath`."""
        full_df = pd.read_csv(dataset_csv)
        sampled_df = full_df.iloc[0:int(len(full_df)*sampling)]
        return cls(sampled_df, cls.load_vectorizer_only(vectorizer_filepath))

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """Load a vectorizer from its JSON file."""
        with open(vectorizer_filepath) as fp:
            contents = json.load(fp)
        return NMTVectorizer.from_serializable(contents)

    def save_vectorizer(self, vectorizer_filepath):
        """Write the vectorizer to a JSON file."""
        serialized = self._vectorizer.to_serializable()
        with open(vectorizer_filepath, "w") as fp:
            json.dump(serialized, fp)

    def get_vectorizer(self):
        """Return the vectorizer used by this dataset."""
        return self._vectorizer

    def set_split(self, split="train"):
        """Select the active split: 'train' (default), 'val' or 'test'."""
        frame, size = self._lookup_dict[split]
        self._target_split = split
        self._target_df = frame
        self._target_size = size

    def __len__(self):
        """Number of rows in the active split (used by DataLoader)."""
        return self._target_size

    def __getitem__(self, index):
        """Vectorize the sentence pair at position `index` of the active split."""
        pair = self._target_df.iloc[index]
        vectors = self._vectorizer.vectorize(pair['English'], pair['Chinese'])
        item = {}
        item["x_source"] = vectors["source_vector"]
        item["x_target"] = vectors["target_x_vector"]
        item["y_target"] = vectors["target_y_vector"]
        item["x_source_length"] = vectors["source_length"]
        return item

def generate_nmt_batches(dataset, batch_size, shuffle=True, drop_last=True):
    """Yield DataLoader batches with rows ordered from the longest to the
    shortest source sentence, as pack_padded_sequence expects.

    Every tensor in the batch dict is reordered with the same permutation,
    derived from the 'x_source_length' entry.
    """
    loader = DataLoader(dataset=dataset, batch_size=batch_size,
                        shuffle=shuffle, drop_last=drop_last)
    for batch in loader:
        # Ascending argsort, reversed -> descending by source length.
        descending_order = batch['x_source_length'].numpy().argsort()[::-1].tolist()
        yield {key: batch[key][descending_order] for key in batch}
        

In [None]:
class NMTEncoder(nn.Module):

    '''Bidirectional GRU encoder for the source sentence.'''

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size):
        """
        Args:
            num_embeddings: number of rows of the embedding matrix (source vocabulary size)
            embedding_size: dimension of each embedding vector
            rnn_hidden_size: hidden size of the GRU, per direction
        """
        super(NMTEncoder, self).__init__()
        # Index 0 is treated as padding by the embedding layer.
        self.source_embedding = nn.Embedding(num_embeddings, embedding_size, padding_idx=0)
        self.birnn = nn.GRU(input_size=embedding_size, hidden_size=rnn_hidden_size,
                            batch_first=True, bidirectional=True)

    def forward(self, x_source, x_lengths):
        """Encode a batch of source sentences.

        Args:
            x_source: batch of source index vectors, batch dimension first
            x_lengths: length of every sentence in the batch; passed to
                pack_padded_sequence with its default enforce_sorted=True,
                so it must be in descending order
        Returns:
            (per-step GRU outputs re-padded batch-first,
             final hidden states of both directions flattened per batch row)
        """
        lengths = x_lengths.detach().cpu().numpy()
        packed_input = pack_padded_sequence(self.source_embedding(x_source),
                                            lengths, batch_first=True)
        packed_output, final_hidden = self.birnn(packed_input)
        # (directions, batch, hidden) -> (batch, directions * hidden)
        batch_first_hidden = final_hidden.permute(1, 0, 2).contiguous()
        flat_hidden = batch_first_hidden.view(batch_first_hidden.size(0), -1)
        padded_output, _ = pad_packed_sequence(packed_output, batch_first=True)
        return padded_output, flat_hidden
        

In [None]:
def terse_attention(encoder_state_vectors, query_vector):
    """Dot-product attention of one query per batch row over the encoder states.

    Args:
        encoder_state_vectors: encoder outputs, expected as (batch, seq, hidden)
        query_vector: current decoder state, expected as (batch, hidden)
    Returns:
        (context_vectors, vector_probabilities): the attention-weighted sum of
        the encoder states, (batch, hidden), and the attention weights,
        (batch, seq)
    """
    # Squeeze only the trailing singleton axis produced by the matmul. A bare
    # .squeeze() would also drop the batch or sequence axis whenever it has
    # size 1, which breaks single-sentence inference.
    vector_scores = torch.matmul(encoder_state_vectors,
                                 query_vector.unsqueeze(dim=2)).squeeze(dim=2)
    vector_probabilities = F.softmax(vector_scores, dim=-1)
    context_vectors = torch.matmul(encoder_state_vectors.transpose(-2, -1),
                                   vector_probabilities.unsqueeze(dim=2)).squeeze(dim=2)
    return context_vectors, vector_probabilities


In [None]:
class NMTDecoder(nn.Module):

    '''GRU-cell decoder that attends over the encoder states at every step.'''

    def __init__(self, num_embeddings, embedding_size, rnn_hidden_size, bos_index):
        """
        Args:
            num_embeddings: number of rows of the embedding matrix (target vocabulary size)
            embedding_size: dimension of each embedding vector
            rnn_hidden_size: hidden size of the GRU cell
            bos_index: index of the begin-of-sequence token
        """
        super(NMTDecoder, self).__init__()
        self._rnn_hidden_size = rnn_hidden_size
        self.target_embedding = nn.Embedding(num_embeddings=num_embeddings,
                                             embedding_dim=embedding_size,
                                             padding_idx=0)
        self.gru_cell = nn.GRUCell(embedding_size+rnn_hidden_size, rnn_hidden_size)
        self.hidden_map = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.classifier = nn.Linear(rnn_hidden_size * 2, num_embeddings)
        self.bos_index = bos_index

    def _init_indices(self, batch_size, device=None):
        """Return a batch of BOS indices, the first input of the GRU cell."""
        return torch.full((batch_size,), self.bos_index, dtype=torch.int64, device=device)

    def _init_context_vectors(self, batch_size, device=None):
        """Return the all-zero context vectors used before the first attention step."""
        return torch.zeros(batch_size, self._rnn_hidden_size, device=device)

    def forward(self, encoder_state, encoder_output, target_sequence, sample_probability=0.0,
                output_sequence_size=None):
        """Decode a batch and return the vocabulary scores of every time step.

        Args:
            encoder_state: per-step encoder outputs that are attended over
            encoder_output: final encoder hidden state, mapped to the initial
                decoder hidden state
            target_sequence: decoder inputs, batch dimension first, or None to
                decode purely from the model's own samples
            sample_probability: per-step probability of feeding the model's
                own sampled token instead of the target token
            output_sequence_size: number of steps to decode when
                `target_sequence` is None; ignored otherwise, where the
                target length is used
        Returns:
            tensor of scores, (batch, steps, target vocabulary size)
        Raises:
            ValueError: if neither `target_sequence` nor
                `output_sequence_size` is given
        """
        if target_sequence is None:
            # Without targets there is no length to read off; it used to be
            # left undefined, which ended in a NameError further down.
            if output_sequence_size is None:
                raise ValueError("output_sequence_size is required when target_sequence is None")
            sample_probability = 1.0
        else:
            # Time-major, so target_sequence[i] is the whole batch at step i.
            target_sequence = target_sequence.permute(1, 0)
            output_sequence_size = target_sequence.size(0)
        h_t = self.hidden_map(encoder_output)
        batch_size = encoder_state.size(0)
        # Create the initial tensors on the encoder's device instead of the CPU.
        context_vectors = self._init_context_vectors(batch_size, device=encoder_state.device)
        y_t_index = self._init_indices(batch_size, device=encoder_state.device)
        output_vectors = []
        # Cached copies of intermediate values, kept for later inspection.
        self._cached_p_attn = []
        self._cached_ht = []
        self._cached_decoder_state = encoder_state.cpu().detach().numpy()
        for i in range(output_sequence_size):
            use_sample = np.random.random() < sample_probability
            if not use_sample:
                y_t_index = target_sequence[i]
            y_input_vector = self.target_embedding(y_t_index)
            rnn_input = torch.cat([y_input_vector, context_vectors], dim=1)
            h_t = self.gru_cell(rnn_input, h_t)
            self._cached_ht.append(h_t.cpu().detach().numpy())
            context_vectors, p_attn = terse_attention(encoder_state_vectors=encoder_state,
                                                      query_vector=h_t)
            self._cached_p_attn.append(p_attn.cpu().detach().numpy())
            prediction_vector = torch.cat((context_vectors, h_t), dim=1)
            # F.dropout defaults to training=True, so without this flag the
            # dropout stayed active in eval mode and made evaluation noisy.
            score_for_y_t_index = self.classifier(
                F.dropout(prediction_vector, 0.3, training=self.training))
            if use_sample:
                p_y_t_index = F.softmax(score_for_y_t_index, dim=1)
                # Squeeze only the sample axis; a bare .squeeze() would also
                # drop the batch axis for a batch of one.
                y_t_index = torch.multinomial(p_y_t_index, 1).squeeze(dim=1)
            output_vectors.append(score_for_y_t_index)
        output_vectors = torch.stack(output_vectors).permute(1, 0, 2)
        return output_vectors
            

In [None]:
class NMTModel(nn.Module):

    '''Encoder-decoder translation model.'''

    def __init__(self, source_vocab_size, source_embedding_size,
                 target_vocab_size, target_embedding_size, encoding_size,
                 target_bos_index):
        """
        Args:
            source_vocab_size: number of rows of the source embedding matrix
            source_embedding_size: dimension of the source embedding vectors
            target_vocab_size: number of rows of the target embedding matrix
            target_embedding_size: dimension of the target embedding vectors
            encoding_size: hidden size handed to the encoder
            target_bos_index: index of the begin-of-sequence token of the target vocabulary
        """
        super(NMTModel, self).__init__()
        encoder_settings = dict(num_embeddings=source_vocab_size,
                                embedding_size=source_embedding_size,
                                rnn_hidden_size=encoding_size)
        # The decoder's hidden size is twice the encoder's.
        decoder_settings = dict(num_embeddings=target_vocab_size,
                                embedding_size=target_embedding_size,
                                rnn_hidden_size=encoding_size * 2,
                                bos_index=target_bos_index)
        self.encoder = NMTEncoder(**encoder_settings)
        self.decoder = NMTDecoder(**decoder_settings)

    def forward(self, x_source, x_source_lengths, target_sequence, sample_probability=0.0):
        """Encode the source batch, decode it, and return the decoder's output."""
        per_step_states, final_states = self.encoder(x_source, x_source_lengths)
        return self.decoder(encoder_state=per_step_states,
                            encoder_output=final_states,
                            target_sequence=target_sequence,
                            sample_probability=sample_probability)
        

In [None]:
def make_train_state(args):
    """Return a fresh bookkeeping dict used to track and steer training."""
    state = {}
    # Early-stopping bookkeeping.
    state['stop_early'] = False
    state['early_stopping_step'] = 0
    state['early_stopping_best_val'] = 1e8
    state['learning_rate'] = args.learning_rate
    state['epoch_index'] = 0
    # Per-epoch history, one entry appended per epoch.
    for history_key in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[history_key] = []
    # -1 means "not evaluated yet".
    state['test_loss'] = -1
    state['test_acc'] = -1
    state['model_filename'] = args.model_state_file
    return state


In [None]:
def update_train_state(args, model, train_state):
    """Update checkpointing and early-stopping state; call once per epoch.

    The first epoch always writes a checkpoint. From then on, an epoch whose
    validation loss did not drop below the previous epoch's counts towards
    early stopping; an epoch that did drop resets the counter and, if it also
    beats the best loss seen so far, overwrites the checkpoint.

    Args:
        args: namespace providing `early_stopping_criteria`
        model: model whose state dict is saved
        train_state: dict created by make_train_state, with this epoch's
            validation loss already appended to 'val_loss'
    Returns:
        the updated `train_state`
    """
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        # Record the loss that belongs to the checkpoint just written.
        # Otherwise the best value stays at its 1e8 placeholder and a later,
        # worse epoch could overwrite the first checkpoint.
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = train_state['val_loss'][-1]
        train_state['stop_early'] = False
    elif train_state['epoch_index'] >= 1:
        loss_pre_t, loss_t = train_state['val_loss'][-2:]
        if loss_t >= loss_pre_t:
            train_state['early_stopping_step'] += 1
        else:
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t
            train_state['early_stopping_step'] = 0
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria
    return train_state


In [None]:
def compute_accuracy(y_pred, y_true, mask_index):
    """Percentage of non-masked target positions that were predicted correctly.

    Args:
        y_pred: scores; a 3-D tensor is flattened to (positions, classes)
        y_true: target indices; a 2-D tensor is flattened to (positions,)
        mask_index: target index that marks padding and is not counted
    Returns:
        accuracy in percent as a float; 0.0 when every position is masked
    """
    if len(y_pred.size()) == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if len(y_true.size()) == 2:
        y_true = y_true.contiguous().view(-1)
    _, y_pred_indices = y_pred.max(dim=1)
    correct_indices = torch.eq(y_pred_indices, y_true).float()
    valid_indices = torch.ne(y_true, mask_index).float()
    n_valid = valid_indices.sum().item()
    if n_valid == 0:
        # Nothing to score; avoid the ZeroDivisionError.
        return 0.0
    n_correct = (correct_indices * valid_indices).sum().item()
    return n_correct / n_valid * 100


In [None]:
def sequence_loss(y_pred, y_true, mask_index):
    """Cross-entropy over every target position, ignoring `mask_index`.

    A 3-D `y_pred` is flattened to (positions, classes) and a 2-D `y_true`
    to (positions,) before the loss is computed.
    """
    flat_scores = y_pred
    if y_pred.dim() == 3:
        flat_scores = y_pred.contiguous().view(-1, y_pred.size(2))
    flat_targets = y_true
    if y_true.dim() == 2:
        flat_targets = y_true.contiguous().view(-1)
    return F.cross_entropy(flat_scores, flat_targets, ignore_index=mask_index)


In [None]:
def prepare_training_process(args):
    """Create everything training needs.

    Args:
        args: namespace with the hyper-parameters and file locations
    Returns:
        (dataset, vectorizer, model, optimizer, loss_func, scheduler)
    """
    # Seed the random number generators.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Create the folders of the configured output files, so the paths in
    # `args` may point anywhere rather than only at a hard-coded folder.
    for output_path in (args.vectorizer_file, args.model_state_file,
                        getattr(args, 'predicted_file', None)):
        output_dir = os.path.dirname(output_path) if output_path else ''
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
    # Dataset: reuse a stored vectorizer when there is one, else build and store it.
    if os.path.exists(args.vectorizer_file):
        dataset = NMTDataset.load_dataset_and_load_vectorizer(
            args.nmt_csv, args.vectorizer_file, args.sampling)
    else:
        dataset = NMTDataset.load_dataset_and_make_vectorizer(args.nmt_csv, args.sampling)
        dataset.save_vectorizer(args.vectorizer_file)
    # Vectorizer
    vectorizer = dataset.get_vectorizer()
    # Model
    model = NMTModel(source_vocab_size=len(vectorizer.source_vocab),
                     source_embedding_size=args.source_embedding_size,
                     target_vocab_size=len(vectorizer.target_vocab),
                     target_embedding_size=args.target_embedding_size,
                     encoding_size=args.encoding_size,
                     target_bos_index=vectorizer.target_vocab.begin_index)
    # Optimizer
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    # Loss function
    loss_func = sequence_loss
    # Scheduler: halves the learning rate once the monitored value stops improving.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
    return dataset, vectorizer, model, optimizer, loss_func, scheduler


In [None]:
def _evaluate_split(dataset, model, loss_func, split, batch_size, mask_index, sample_probability):
    """Return the running-mean (loss, accuracy) of `model` over one dataset split."""
    dataset.set_split(split)
    batch_generator = generate_nmt_batches(dataset, batch_size=batch_size)
    running_loss = 0.0
    running_acc = 0.0
    model.eval()
    # No parameters are updated here, so skip building the autograd graph.
    with torch.no_grad():
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(batch_dict['x_source'], batch_dict['x_source_length'],
                           batch_dict['x_target'], sample_probability=sample_probability)
            loss = loss_func(y_pred, batch_dict['y_target'], mask_index)
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)
    return running_loss, running_acc


def train_model(args, dataset, model, optimizer, loss_func, scheduler):
    """Train the model, validate it after every epoch, then test the best checkpoint.

    Every epoch updates the early-stopping state (which also writes the
    checkpoint) and steps the learning-rate scheduler on the validation loss.
    After the last epoch the checkpoint is reloaded and scored on the test
    split. A KeyboardInterrupt ends the run quietly.

    Args:
        args: namespace with num_epochs, batch_size and the settings read by
            make_train_state / update_train_state
        dataset: dataset offering set_split() and get_vectorizer()
        model: the translation model
        optimizer: optimizer over the model's parameters
        loss_func: callable (y_pred, y_true, mask_index) -> loss
        scheduler: learning-rate scheduler stepped with the validation loss
    """
    mask_index = dataset.get_vectorizer().target_vocab.mask_index
    train_state = make_train_state(args)
    # Defined up front so the test pass below still has a value when
    # args.num_epochs is 0.
    sample_probability = 0.0
    try:
        for epoch_index in range(args.num_epochs):
            # Probability of feeding the decoder its own sample, not the target.
            # NOTE: this is >= 1 whenever num_epochs <= 20, i.e. the decoder
            # then samples at every step from the very first epoch.
            sample_probability = (20 + epoch_index) / args.num_epochs
            train_state['epoch_index'] = epoch_index
            # Training pass
            dataset.set_split('train')
            batch_generator = generate_nmt_batches(dataset, batch_size=args.batch_size)
            running_loss = 0.0
            running_acc = 0.0
            model.train()
            for batch_index, batch_dict in enumerate(batch_generator):
                y_pred = model(batch_dict['x_source'], batch_dict['x_source_length'],
                               batch_dict['x_target'], sample_probability=sample_probability)
                loss = loss_func(y_pred, batch_dict['y_target'], mask_index)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # Running means over the batches seen so far.
                running_loss += (loss.item() - running_loss) / (batch_index + 1)
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
                running_acc += (acc_t - running_acc) / (batch_index + 1)
            train_state['train_loss'].append(running_loss)
            train_state['train_acc'].append(running_acc)
            # Validation pass
            val_loss, val_acc = _evaluate_split(dataset, model, loss_func, 'val',
                                                args.batch_size, mask_index,
                                                sample_probability)
            train_state['val_loss'].append(val_loss)
            train_state['val_acc'].append(val_acc)
            # Update the early-stopping state (this also writes the checkpoint)
            train_state = update_train_state(args=args, model=model, train_state=train_state)
            # Update the learning rate
            scheduler.step(train_state['val_loss'][-1])
            # Report this epoch
            print("Epoch: {} / {} -- Train Accuracy: {:.3f}, Val Accuracy: {:.3f}".format(
                train_state['epoch_index']+1, args.num_epochs, train_state['train_acc'][-1],
                train_state['val_acc'][-1]))
            # Stop early when requested
            if train_state['stop_early']:
                print('Early Stop Training!')
                break
        # Score the saved checkpoint on the test split
        model.load_state_dict(torch.load(train_state['model_filename']))
        test_loss, test_acc = _evaluate_split(dataset, model, loss_func, 'test',
                                              args.batch_size, mask_index,
                                              sample_probability)
        train_state['test_loss'] = test_loss
        train_state['test_acc'] = test_acc
        print("Test Accuracy: {:.3f}".format(train_state['test_acc']))
    except KeyboardInterrupt:
        print("Exiting loop")
        

In [None]:
# Build the dataset, vectorizer, model, optimizer, loss and scheduler, then train
# the translation model; checkpoints are written to args.model_state_file
dataset, vectorizer, model, optimizer, loss_func, scheduler = prepare_training_process(args)
train_model(args, dataset, model, optimizer, loss_func, scheduler)

In [None]:
def get_source_sentence(vectorizer, batch_dict, index):
    """Return the source sentence at row `index` of the batch as text."""
    source_row = batch_dict['x_source'][index]
    return sentence_from_indices(source_row.cpu().data.numpy(), vectorizer.source_vocab)

def get_true_sentence(vectorizer, batch_dict, index):
    """Return the reference translation at row `index` of the batch as text."""
    target_rows = batch_dict['y_target'].cpu().data.numpy()
    reference_indices = target_rows[index]
    return sentence_from_indices(reference_indices, vectorizer.target_vocab)
    
def get_sampled_sentence(vectorizer, batch_dict, index, nmt_model=None):
    """Return the model's translation of row `index` of the batch as text.

    The whole batch is decoded with sample_probability=1.0, and the
    highest-scoring token of every step is converted back to text.

    Args:
        vectorizer: vectorizer providing the target vocabulary
        batch_dict: batch as yielded by generate_nmt_batches
        index: row of the batch to translate
        nmt_model: model to use; defaults to the notebook-level `model`, so
            existing three-argument calls keep working
    """
    # `model` is the notebook-level variable; it is only looked up when no
    # model is passed in explicitly.
    active_model = model if nmt_model is None else nmt_model
    # Pure inference: no need to build the autograd graph.
    with torch.no_grad():
        y_pred = active_model(x_source=batch_dict['x_source'],
                              x_source_lengths=batch_dict['x_source_length'],
                              target_sequence=batch_dict['x_target'],
                              sample_probability=1.0)
    return sentence_from_indices(torch.max(y_pred, dim=2)[1].cpu().data.numpy()[index], vectorizer.target_vocab)

def get_all_sentences(vectorizer, batch_dict, index):
    """Return the source, reference and model translation of one batch row
    as a dict with keys 'source', 'truth' and 'sampled'."""
    sentences = {}
    sentences["source"] = get_source_sentence(vectorizer, batch_dict, index)
    sentences["truth"] = get_true_sentence(vectorizer, batch_dict, index)
    sentences["sampled"] = get_sampled_sentence(vectorizer, batch_dict, index)
    return sentences
    
def sentence_from_indices(indices, vocab, strict=True):
    """Turn a sequence of token indices back into a space-joined sentence.

    In strict mode the begin token is skipped and decoding stops at the first
    end token; otherwise every index is looked up and kept.
    """
    tokens = []
    for position in indices:
        if strict:
            if position == vocab.begin_index:
                continue
            if position == vocab.end_index:
                break
        tokens.append(vocab.lookup_index(position))
    return " ".join(tokens)


In [None]:
def translate_result(args, dataset, sample_size):
    """Translate up to `sample_size` sentences of one test batch.

    Args:
        args: namespace providing `batch_size`
        dataset: dataset offering set_split() and get_vectorizer()
        sample_size: number of rows of the batch to translate
    Returns:
        DataFrame with one row per sentence and the columns produced by
        get_all_sentences ('source', 'truth', 'sampled'); empty when the test
        split yields no batch
    """
    dataset.set_split('test')
    # Use the dataset's own vectorizer rather than a notebook-level variable,
    # so the function does not depend on hidden kernel state.
    vectorizer = dataset.get_vectorizer()
    batch_generator = generate_nmt_batches(dataset, batch_size=args.batch_size)
    batch_dict = next(batch_generator, None)
    if batch_dict is None:
        # The generator yielded nothing, e.g. fewer rows than one full batch.
        return pd.DataFrame(columns=["source", "truth", "sampled"])
    # Never index past the rows that are actually in the batch.
    n_rows = min(sample_size, len(batch_dict['x_source']))
    result = [get_all_sentences(vectorizer, batch_dict, i) for i in range(n_rows)]
    return pd.DataFrame(result)

In [None]:
# Show the source, reference and model translation of 5 sentences from one test batch
translate_result(args, dataset, 5)