In [None]:
from argparse import Namespace # 管理所有超参数
from collections import Counter # 统计字的数量
import string # 标点符号调用
import re # 正则表达式
import os # 生成文件路径
import json # 保存模型为JSON格式
import numpy as np # 数据处理
import pandas as pd # 文本处理
import torch # 调用PyTorch库
import torch.nn as nn # 调用神经网络层
import torch.nn.functional as F # 调用激活函数
import torch.optim as optim # 调用优化器
from torch.utils.data import Dataset, DataLoader # 调用批生成器

In [None]:
args = Namespace(
    news_csv="20200913-Top10-clean-char", # path of the news dataset handed to pd.read_csv
    vectorizer_file="news_folder/vectorizer1.json", # where the vectorizer is stored as JSON
    model_state_file="news_folder/model1.pth", # where the model state dict is stored
    predicted_file="news_folder/predicted1.csv", # path of the predictions CSV
    char_embedding_size=100, # dimension of the character embeddings
    rnn_hidden_size=64, # hidden size of the RNN
    num_epochs=20, # maximum number of training epochs
    learning_rate=1e-3, # initial learning rate
    batch_size=64, # mini-batch size
    seed=1337, # random seed
    early_stopping_criteria=3, # stop once this many non-improving epochs have accumulated
    sampling=0.1, # fraction of the leading CSV rows that is kept (0.1 = 10%); NOTE(review): the original comment said 50% -- confirm the intended value
    cutoff=1, # intended minimum character frequency for the vocabulary; NOTE(review): never forwarded to the vectorizer by the cells shown here
    dropout=0.5 # dropout probability
)

In [None]:
class Vocabulary(object):

    '''Bidirectional mapping between tokens and their integer indices.'''

    def __init__(self, token_to_idx=None):
        '''
        Args:
            token_to_idx (dict, optional): an existing token -> index mapping
                to start from. When omitted, a fresh empty mapping is created.
                A supplied dict is stored by reference, not copied.
        '''
        # A literal `{}` default would be created once and shared by every
        # instance built without arguments, so tokens added to one vocabulary
        # would silently appear in all later ones (e.g. after re-running a cell).
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}

    # Add a token to both mappings and return its index; an already known
    # token keeps its existing index.
    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    # Return the index of a token; raises KeyError for an unknown token.
    def lookup_token(self, token):
        return self._token_to_idx[token]

    # Return the token stored at an index; raises KeyError for an unknown index.
    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    # Build the JSON-friendly dict of constructor arguments.
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx}

    # Rebuild an instance from the dict produced by to_serializable().
    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    # Human-readable summary used by print().
    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    # The size of the vocabulary is the number of known tokens.
    def __len__(self):
        return len(self._token_to_idx)
    

In [None]:
class SequenceVocabulary(Vocabulary):

    '''Vocabulary that also registers the special sequence tokens (mask, unknown, begin, end).'''

    def __init__(self, token_to_idx=None, unk_token='<UNK>', mask_token='<MASK>',
                 begin_token='<BEGIN>', end_token='<END>'):
        '''
        Args:
            token_to_idx (dict, optional): an existing token -> index mapping
                to start from. When omitted, a fresh empty mapping is created.
            unk_token, mask_token, begin_token, end_token (str): the special
                tokens registered in the vocabulary.
        '''
        # A literal `{}` default would be shared by every instance built
        # without arguments, so characters from an earlier vocabulary would
        # leak into later ones. The fresh dict is created here (rather than
        # relying on the parent class) so this class is safe on its own.
        if token_to_idx is None:
            token_to_idx = {}
        super(SequenceVocabulary, self).__init__(token_to_idx)

        # Remember the special tokens so they can be serialized again.
        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_token = begin_token
        self._end_token = end_token

        # Register the special tokens and keep their indices. With an empty
        # starting mapping the mask token receives index 0.
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_index = self.add_token(self._begin_token)
        self.end_index = self.add_token(self._end_token)

    # Return the index of a token, falling back to the UNK index when the
    # token is not in the vocabulary.
    def lookup_token(self, token):
        return self._token_to_idx.get(token, self.unk_index)

    # Build the JSON-friendly dict of constructor arguments, including the
    # special tokens.
    def to_serializable(self):
        contents = super(SequenceVocabulary, self).to_serializable()
        contents.update({'unk_token': self._unk_token,
                         'mask_token': self._mask_token,
                         'begin_token': self._begin_token,
                         'end_token': self._end_token})
        return contents
    

In [None]:
class NewsVectorizer(object):

    '''Turns a news title into the index vectors consumed by the model.'''

    def __init__(self, char_vocab, label_vocab):
        '''
        Args:
            char_vocab: vocabulary of the characters found in the titles
            label_vocab: vocabulary of the category labels
        '''
        self.char_vocab = char_vocab
        self.label_vocab = label_vocab

    # Encode a title as BEGIN + characters + END and return two int64 vectors:
    # the sequence without its last element (model input) and the sequence
    # without its first element (prediction target). Both are right-padded
    # with the mask index up to vector_length; -1 means "length of the
    # encoded sequence".
    def vectorize(self, news_title, vector_length=-1):
        vocab = self.char_vocab
        encoded = [vocab.begin_index]
        for token in news_title:
            encoded.append(vocab.lookup_token(token))
        encoded.append(vocab.end_index)

        if vector_length == -1:
            vector_length = len(encoded)

        def padded(sequence):
            # Start from an all-mask vector and overwrite the leading slots.
            vector = np.full(vector_length, vocab.mask_index, dtype=np.int64)
            vector[:len(sequence)] = sequence
            return vector

        return padded(encoded[:-1]), padded(encoded[1:])

    # Build a vectorizer from a news DataFrame: labels are added in sorted
    # order, characters in first-occurrence order once they reach `cutoff`
    # occurrences. ASCII punctuation is never counted.
    @classmethod
    def from_dataframe(cls, news_df, cutoff=25):
        label_vocab = Vocabulary()
        for label in sorted(set(news_df['一级类目'])):
            label_vocab.add_token(label)

        char_counts = Counter(
            token
            for title in news_df['新闻标题']
            for token in title
            if token not in string.punctuation
        )

        char_vocab = SequenceVocabulary()
        for char in char_counts:
            if char_counts[char] >= cutoff:
                char_vocab.add_token(char)
        return cls(char_vocab, label_vocab)

    # Build the JSON-friendly dict describing both vocabularies.
    def to_serializable(self):
        serialized = {}
        serialized['char_vocab'] = self.char_vocab.to_serializable()
        serialized['label_vocab'] = self.label_vocab.to_serializable()
        return serialized

    # Rebuild a vectorizer from the dict produced by to_serializable().
    @classmethod
    def from_serializable(cls, contents):
        return cls(
            SequenceVocabulary.from_serializable(contents['char_vocab']),
            Vocabulary.from_serializable(contents['label_vocab']),
        )
    

In [None]:
class NewsDataset(Dataset):

    '''Vectorized view over a news DataFrame with sequential train/val/test splits.'''

    def __init__(self, news_df, vectorizer):
        '''
        Args:
            news_df: the news DataFrame; the code reads the columns
                "新闻标题" (title) and "一级类目" (label)
            vectorizer: the vectorizer used to encode titles and labels
        '''
        self.news_df = news_df
        self._vectorizer = vectorizer
        # Longest title plus 2, leaving room for the BEGIN and END tokens the
        # vectorizer adds; every item is padded to this length.
        self._max_seq_length = max(map(len, self.news_df["新闻标题"])) + 2

        # Sequential split: first 70% train, next 15% validation, rest test.
        train_end = int(len(self.news_df)*0.7)
        val_end = int(len(self.news_df)*0.85)
        self.train_df = self.news_df.iloc[0:train_end]
        self.train_size = len(self.train_df)
        self.val_df = self.news_df.iloc[train_end:val_end]
        self.val_size = len(self.val_df)
        self.test_df = self.news_df.iloc[val_end:]
        self.test_size = len(self.test_df)

        # set_split() picks the active (frame, size) pair from this table.
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.val_size),
                             'test': (self.test_df, self.test_size)}
        self.set_split('train')

        # Inverse label frequencies of the training split, ordered by label
        # index, usable as class weights for the imbalanced labels.
        class_counts = self.train_df['一级类目'].value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.label_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)

    # Select the split ('train', 'val' or 'test') served by __len__/__getitem__.
    def set_split(self, split="train"):
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    # Number of rows in the active split (used by DataLoader).
    def __len__(self):
        return self._target_size

    # Return one vectorized example of the active split (used by DataLoader).
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        from_vector, to_vector = \
        self._vectorizer.vectorize(row["新闻标题"], self._max_seq_length)
        label_index = \
        self._vectorizer.label_vocab.lookup_token(row["一级类目"])
        return {'x_data': from_vector,
                'y_target': to_vector,
                'label_index': label_index}

    # Expose the vectorizer for later use.
    def get_vectorizer(self):
        return self._vectorizer

    # Read the CSV and keep only the leading `sampling` fraction of its rows.
    @staticmethod
    def _read_sampled(news_csv, sampling):
        news_df = pd.read_csv(news_csv)
        return news_df.iloc[0:int(len(news_df)*sampling)]

    # Load the dataset and build a new vectorizer from its training portion
    # (the first 70% of the sampled rows). `cutoff` is the minimum character
    # frequency forwarded to NewsVectorizer.from_dataframe; its default of 25
    # matches that method's own default, so existing two-argument calls
    # behave exactly as before.
    @classmethod
    def load_dataset_and_make_vectorizer(cls, news_csv, sampling, cutoff=25):
        news_df = cls._read_sampled(news_csv, sampling)
        train_df = news_df.iloc[0:int(len(news_df)*0.7)]
        return cls(news_df, NewsVectorizer.from_dataframe(train_df, cutoff=cutoff))

    # Load the dataset and reuse a vectorizer previously saved as JSON.
    @classmethod
    def load_dataset_and_load_vectorizer(cls, news_csv, vectorizer_filepath, sampling):
        news_df = cls._read_sampled(news_csv, sampling)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(news_df, vectorizer)

    # Load a saved vectorizer from a JSON file.
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        with open(vectorizer_filepath) as fp:
            return NewsVectorizer.from_serializable(json.load(fp))

    # Save the vectorizer to a JSON file.
    def save_vectorizer(self, vectorizer_filepath):
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)
                    

In [None]:
class NewsGenerationModel(nn.Module):

    '''Label-conditioned character-level GRU that predicts the next character of a news title.'''

    def __init__(self, char_vocab_size, char_embedding_size, rnn_hidden_size,
                 num_labels, padding_idx=0, batch_first=True, dropout_p=0.5):
        '''
        Args:
            char_vocab_size: number of rows of the character embedding matrix
                (the number of characters in the vocabulary)
            char_embedding_size: dimension of the character embeddings
            rnn_hidden_size: hidden size of the GRU; also the dimension of the
                label embedding, because that embedding seeds the hidden state
            num_labels: number of distinct labels
            padding_idx: index handed to nn.Embedding as padding_idx (default 0)
            batch_first: whether the batch is dimension 0 of the GRU input
            dropout_p: dropout probability applied before the output layer
        '''
        super(NewsGenerationModel, self).__init__()
        self.char_emb = nn.Embedding(num_embeddings=char_vocab_size,
                                     embedding_dim=char_embedding_size,
                                     padding_idx=padding_idx)
        self.label_emb = nn.Embedding(num_embeddings=num_labels,
                                      embedding_dim=rnn_hidden_size)
        self.rnn = nn.GRU(input_size=char_embedding_size,
                          hidden_size=rnn_hidden_size,
                          batch_first=batch_first)
        self.fc = nn.Linear(in_features=rnn_hidden_size,
                            out_features=char_vocab_size)
        self._dropout_p = dropout_p

    # Run the model over a batch of index sequences and return, for every
    # time step, a score over the character vocabulary. With the default
    # batch_first=True the result has shape (batch, seq, char_vocab_size).
    #
    # Args:
    #     x_in: integer tensor of character indices, (batch, seq) for
    #         batch_first=True
    #     label_index: integer tensor of label indices, one per sequence
    #     apply_softmax: return probabilities instead of raw logits; leave it
    #         False when the output feeds a cross-entropy loss
    def forward(self, x_in, label_index, apply_softmax=False):
        x_embedded = self.char_emb(x_in)
        label_embedded = self.label_emb(label_index)
        # The GRU expects its initial hidden state as
        # (num_layers, batch, hidden). A (batch,) label tensor yields a
        # (batch, hidden) embedding, so add the leading layer axis. Callers
        # that already pass (1, batch) labels are left untouched.
        if label_embedded.dim() == 2:
            label_embedded = label_embedded.unsqueeze(0)
        y_out, _ = self.rnn(x_embedded, label_embedded)
        batch_size, seq_size, feat_size = y_out.shape
        y_out = y_out.contiguous().view(batch_size * seq_size, feat_size)
        # F.dropout defaults to training=True; tie it to the module mode so
        # that model.eval() really disables dropout.
        y_out = self.fc(F.dropout(y_out, p=self._dropout_p, training=self.training))
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
        new_feat_size = y_out.shape[-1]
        y_out = y_out.view(batch_size, seq_size, new_feat_size)
        return y_out
    

In [None]:
# Build the dict that tracks training progress and drives early stopping.
def make_train_state(args):
    state = dict(
        stop_early=False,
        early_stopping_step=0,
        early_stopping_best_val=1e8,
        learning_rate=args.learning_rate,
        epoch_index=0,
    )
    # One fresh history list per tracked per-epoch metric.
    for metric in ('train_loss', 'train_acc', 'val_loss', 'val_acc'):
        state[metric] = []
    # -1 marks the test metrics as "not evaluated yet".
    state['test_loss'] = -1
    state['test_acc'] = -1
    state['model_filename'] = args.model_state_file
    return state


In [None]:
# Update the training state once per epoch: checkpoint the model when the
# validation loss reaches a new best, and decide whether to stop early.
#
# Args:
#     args: namespace providing early_stopping_criteria
#     model: the model whose state dict is checkpointed
#     train_state: the dict created by make_train_state; the latest validation
#         loss must already have been appended to train_state['val_loss']
# Returns:
#     the same train_state dict, updated in place
def update_train_state(args, model, train_state):
    if train_state['epoch_index'] == 0:
        # Always keep the first epoch's model as the initial checkpoint.
        torch.save(model.state_dict(), train_state['model_filename'])
        # Record that checkpoint's loss as the best so far. Without this the
        # best value stays at its 1e8 sentinel, and a later epoch that is
        # worse than epoch 0 (but better than its predecessor) would
        # overwrite the better checkpoint.
        if train_state['val_loss']:
            train_state['early_stopping_best_val'] = train_state['val_loss'][-1]
        train_state['stop_early'] = False
    elif train_state['epoch_index'] >= 1:
        loss_pre_t, loss_t = train_state['val_loss'][-2:]
        if loss_t >= loss_pre_t:
            # No improvement over the previous epoch.
            train_state['early_stopping_step'] += 1
        else:
            # Only overwrite the checkpoint on a new overall best.
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t
            train_state['early_stopping_step'] = 0
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria
    return train_state


In [None]:
# Percentage of non-masked target positions whose highest-scoring prediction
# equals the target.
def compute_accuracy(y_pred, y_true, mask_index):
    # Flatten (batch, seq, classes) scores and (batch, seq) targets so that
    # every position becomes one row.
    if y_pred.dim() == 3:
        y_pred = y_pred.contiguous().view(-1, y_pred.size(2))
    if y_true.dim() == 2:
        y_true = y_true.contiguous().view(-1)

    predicted = y_pred.max(dim=1)[1]
    is_hit = predicted.eq(y_true).float()
    # Positions holding the mask index are padding and must not be counted.
    is_valid = y_true.ne(mask_index).float()

    n_correct = (is_hit * is_valid).sum().item()
    n_valid = is_valid.sum().item()
    return n_correct / n_valid * 100


In [None]:
# Cross-entropy between per-position scores and targets; positions whose
# target equals mask_index are ignored.
def sequence_loss(y_pred, y_true, mask_index):
    scores, targets = y_pred, y_true
    # Flatten batched sequences so that every position becomes one row.
    if scores.dim() == 3:
        scores = scores.contiguous().view(-1, scores.size(2))
    if targets.dim() == 2:
        targets = targets.contiguous().view(-1)
    return F.cross_entropy(scores, targets, ignore_index=mask_index)


In [None]:
# Build everything training needs from the hyperparameters in `args`.
#
# Returns:
#     (dataset, vectorizer, model, optimizer, loss_func, scheduler)
def prepare_training_process(args):
    # Seed the random number generators.
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    # Create the folders of the configured output files (instead of a
    # hard-coded folder name), so changing the paths in `args` keeps working.
    for output_path in (args.vectorizer_file, args.model_state_file):
        folder = os.path.dirname(output_path)
        if folder:
            os.makedirs(folder, exist_ok=True)
    # Prepare the dataset, reusing a saved vectorizer when one exists.
    if os.path.exists(args.vectorizer_file):
        dataset = NewsDataset.load_dataset_and_load_vectorizer(
            args.news_csv, args.vectorizer_file, args.sampling)
    else:
        # NOTE(review): args.cutoff is not forwarded by this call, so the
        # vocabulary is built with NewsVectorizer.from_dataframe's default cutoff.
        dataset = NewsDataset.load_dataset_and_make_vectorizer(args.news_csv, args.sampling)
        dataset.save_vectorizer(args.vectorizer_file)
    # Prepare the vectorizer.
    vectorizer = dataset.get_vectorizer()
    # Prepare the model. dropout_p is taken from args so the configured
    # dropout probability is actually used rather than the model default.
    model = NewsGenerationModel(
                 char_embedding_size=args.char_embedding_size,
                 char_vocab_size=len(vectorizer.char_vocab),
                 rnn_hidden_size=args.rnn_hidden_size,
                 padding_idx=vectorizer.char_vocab.mask_index,
                 num_labels=len(vectorizer.label_vocab),
                 dropout_p=args.dropout)
    # Prepare the optimizer.
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    # Prepare the loss function.
    loss_func = sequence_loss
    # Prepare the learning-rate scheduler (halves the rate on a plateau).
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='min', factor=0.5, patience=1)
    return dataset, vectorizer, model, optimizer, loss_func, scheduler


In [None]:
# Run one full pass over a dataset split and return (mean loss, mean accuracy).
# The model is trained when an optimizer is given; otherwise it is only
# evaluated, in eval mode and with gradient tracking disabled.
def _run_epoch(dataset, split, model, loss_func, mask_index, batch_size, optimizer=None):
    is_training = optimizer is not None
    dataset.set_split(split)
    batch_generator = DataLoader(dataset=dataset, batch_size=batch_size)
    running_loss = 0.0
    running_acc = 0.0
    if is_training:
        model.train()
    else:
        model.eval()
    # Evaluation needs no autograd graph; skipping it saves memory and time
    # without changing the computed values.
    with torch.set_grad_enabled(is_training):
        for batch_index, batch_dict in enumerate(batch_generator):
            y_pred = model(x_in=batch_dict['x_data'], label_index=batch_dict['label_index'])
            loss = loss_func(y_pred, batch_dict['y_target'], mask_index)
            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            # Incremental running means over the batches seen so far.
            running_loss += (loss.item() - running_loss) / (batch_index + 1)
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'], mask_index)
            running_acc += (acc_t - running_acc) / (batch_index + 1)
    return running_loss, running_acc


# Train the model, validate it after every epoch, then reload the saved
# checkpoint and evaluate it on the test split. Progress is printed; a
# KeyboardInterrupt ends the run gracefully.
def train_model(args, dataset, model, optimizer, loss_func, scheduler):
    mask_index = dataset.get_vectorizer().char_vocab.mask_index
    train_state = make_train_state(args)
    try:
        for epoch_index in range(args.num_epochs):
            train_state['epoch_index'] = epoch_index
            # Train for one epoch and record the results.
            train_loss, train_acc = _run_epoch(
                dataset, 'train', model, loss_func, mask_index, args.batch_size,
                optimizer=optimizer)
            train_state['train_loss'].append(train_loss)
            train_state['train_acc'].append(train_acc)
            # Validate and record the results.
            val_loss, val_acc = _run_epoch(
                dataset, 'val', model, loss_func, mask_index, args.batch_size)
            train_state['val_loss'].append(val_loss)
            train_state['val_acc'].append(val_acc)
            # Update the training state (checkpointing and early stopping).
            train_state = update_train_state(args=args, model=model, train_state=train_state)
            # Update the learning rate.
            scheduler.step(train_state['val_loss'][-1])
            # Print the results of this epoch.
            print("Epoch: {} / {} -- Train Accuracy: {:.3f}, Val Accuracy: {:.3f}".format(
                train_state['epoch_index']+1, args.num_epochs, train_state['train_acc'][-1],
                train_state['val_acc'][-1]))
            # Stop early when requested by the training state.
            if train_state['stop_early']:
                print('Early Stop Training!')
                break
        # Evaluate the saved checkpoint on the test split and record the result.
        model.load_state_dict(torch.load(train_state['model_filename']))
        test_loss, test_acc = _run_epoch(
            dataset, 'test', model, loss_func, mask_index, args.batch_size)
        train_state['test_loss'] = test_loss
        train_state['test_acc'] = test_acc
        print("Test Accuracy: {:.3f}".format(train_state['test_acc']))
    except KeyboardInterrupt:
        print("Exiting loop")
        

In [None]:
# Prepare all training components, then train the model; checkpoints are written to args.model_state_file
dataset, vectorizer, model, optimizer, loss_func, scheduler = prepare_training_process(args)
train_model(args, dataset, model, optimizer, loss_func, scheduler)

In [None]:
# Sample index sequences (news titles) from the model, conditioned on labels.
#
# Args:
#     model: the trained model; its label_emb, char_emb, rnn and fc
#         sub-modules are used directly
#     vectorizer: provides char_vocab.begin_index
#     label_indices: one label index per sample (list or tensor of length
#         num_samples); a single index is also accepted and is shared by all
#         samples
#     num_samples: number of sequences to sample
#     sample_size: number of characters sampled per sequence
# Returns:
#     int64 tensor of shape (num_samples, sample_size + 1); column 0 holds
#     the begin index
# Raises:
#     ValueError: if the number of labels does not match num_samples
def sample_from_model(model, vectorizer, label_indices, num_samples=1, sample_size=20):
    label_tensor = torch.as_tensor(label_indices, dtype=torch.int64).reshape(-1)
    if label_tensor.numel() == 1 and num_samples > 1:
        # A single label conditions every sample.
        label_tensor = label_tensor.expand(num_samples)
    if label_tensor.numel() != num_samples:
        raise ValueError(
            "expected %d label indices (one per sample), got %d"
            % (num_samples, label_tensor.numel()))

    begin_index = torch.full((num_samples, 1), vectorizer.char_vocab.begin_index,
                             dtype=torch.int64)
    indices = [begin_index]
    # Sampling needs no gradients.
    with torch.no_grad():
        # The label embedding seeds the hidden state: (1, num_samples, hidden).
        h_t = model.label_emb(label_tensor.unsqueeze(dim=0))
        for time_step in range(sample_size):
            x_t = indices[time_step]
            x_emb_t = model.char_emb(x_t)
            rnn_out_t, h_t = model.rnn(x_emb_t, h_t)
            prediction_vector = model.fc(rnn_out_t.squeeze(dim=1))
            probability_vector = F.softmax(prediction_vector, dim=1)
            indices.append(torch.multinomial(probability_vector, num_samples=1))
    # Every step is (num_samples, 1); joining along dim 1 keeps the batch
    # axis even for num_samples == 1, where a bare squeeze() would drop it
    # and make the subsequent permute fail.
    return torch.cat(indices, dim=1)
            

In [None]:
# Decode sampled index sequences back into news-title strings: begin tokens
# are skipped and decoding of a row stops at its first end token.
def decode_samples(sampled_indices, vectorizer):
    vocab = vectorizer.char_vocab
    n_rows, n_steps = sampled_indices.shape[0], sampled_indices.shape[1]
    titles = []
    for row in range(n_rows):
        chars = []
        for step in range(n_steps):
            token_index = sampled_indices[row, step].item()
            if token_index == vocab.end_index:
                break
            if token_index != vocab.begin_index:
                chars.append(vocab.lookup_index(token_index))
        titles.append("".join(chars))
    return titles


In [None]:
num_titles = 10
# sample_from_model requires one label index per sample (the original call
# omitted the argument and raised TypeError). Cycle through the label
# vocabulary so the generated titles cover the available categories.
label_indices = [i % len(vectorizer.label_vocab) for i in range(num_titles)]
sampled_surnames = decode_samples(
    sample_from_model(model, vectorizer, label_indices, num_samples=num_titles),
    vectorizer)
print ("-"*15)
for i in range(num_titles):
    print(sampled_surnames[i])