## 数据预处理

In [1]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.backends.mps.is_available())

2.3.1+cu121
True
False


In [2]:
import torch
import time
from datetime import timedelta
import os
import pickle as pkl
from transformers import BertTokenizer
from tqdm import tqdm


### 1. 分词和构建词表

In [None]:
UNK, PAD, CLS = "[UNK]", "[PAD]", "[CLS]"  # 特殊符号
MAX_VOCAB_SIZE = 10000  # 词表长度限制

# 定义构建词表的函数
def build_vocab(file_path, tokenizer, max_size, min_freq):
    vocab_dic = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, desc="构建词表中"):
            line = line.strip()
            if not line:
                continue
            content = line.split('\t')[0]  # 假设文本在每行的第一列
            for word in tokenizer.tokenize(content):  # 正确调用 tokenize 方法
                vocab_dic[word] = vocab_dic.get(word, 0) + 1

    # 根据频率和词表大小筛选词汇
    vocab_list = sorted(
        [(word, count) for word, count in vocab_dic.items() if count >= min_freq],
        key=lambda x: x[1],
        reverse=True
    )[:max_size]

    # 创建词表字典并加入特殊符号
    vocab_dic = {word_count[0]: idx for idx, word_count in enumerate(vocab_list)}
    vocab_dic.update({UNK: len(vocab_dic), PAD: len(vocab_dic) + 1, CLS: len(vocab_dic) + 2})
    return vocab_dic

# 使用 transformers 的 BertTokenizer 加载中文分词器
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizer.from_pretrained('./bert_pretrain')

# 构建词表
vocab_dic = build_vocab('./data/train.txt', tokenizer=tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
print("生成的词表大小:", len(vocab_dic))
# print(vocab_dic.items())

### 2. 构建数据集

In [5]:
def build_dataset(config):
    """
    根据提供的配置文件加载训练集、验证集和测试集，并对数据进行预处理。
    Args:
        config: 包含配置信息的对象，包含路径、分词器、pad_size 等。
    Returns:
        train, dev, test: 预处理后的训练集、验证集和测试集。
    """
    def load_dataset(path, pad_size=32):
        """
        加载并预处理单个数据集。
        Args:
            path: 数据文件路径。
            pad_size: 序列的最大长度。如果小于 pad_size，则进行填充；如果大于，则截断。
        Returns:
            contents: 包含 (token_ids, label, seq_len, mask) 的数据列表。
        """
        contents = []
        with open(path, "r", encoding='utf-8') as f:
            for line in tqdm(f):  # tqdm 用于显示进度条
                line = line.strip()
                if not line:
                    continue  # 跳过空行
                # 数据格式假定为 '文本\t标签'
                content, label = line.split('\t')
                token = config.tokenizer.tokenize(content)  # 分词操作
                token = [CLS] + token  # 在序列开头添加特殊标记 [CLS]
                seq_len = len(token)   # 序列长度
                token_ids = config.tokenizer.convert_tokens_to_ids(token)   # 转换为 ID
                mask = []   # 构建 mask 和 padding
                if pad_size:
                    if len(token) < pad_size:
                        # 如果序列长度不足 pad_size，填充 0
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += [0] * (pad_size - len(token))
                    else:
                        # 如果序列长度超过 pad_size，进行截断
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size

                # 将处理后的数据添加到结果列表
                contents.append((token_ids, int(label), seq_len, mask))

        return contents

    # 加载训练集、验证集和测试集
    train = load_dataset(config.train_path, config.pad_size)
    dev = load_dataset(config.dev_path, config.pad_size)
    test = load_dataset(config.test_path, config.pad_size)
    return train, dev, test
                

### 3. 数据封装

In [28]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device, model_name):
        self.batch_size = batch_size
        self.batches = batches
        self.model_name = model_name
        self.n_batches = len(batches) // batch_size
        self.residue = False    # 记录batch数量是否为整数
        if len(batches) % self.n_batches != 0:
            self.residue = True # batches不能被batch_size整除
        self.index = 0
        self.device = device
    
    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        # pad 前的长度（超过pad_size的设置为pad_size）
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        if self.model_name == 'bert' or self.model_name == 'multi_task_bert':
            mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
            return (x, seq_len, mask), y
    
    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size : (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
    
    def __iter__(self):
        return self
    
    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return self.n_batches

def build_iterator(dataset, config):
    iter = DatasetIterater(dataset, config.batch_size, config.device, config.model_name)
    return iter

In [7]:
def get_time_dif(start_time):
    # 获取已使用时间
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

## BERT分类模型搭建

### 1. 实现Config类代码

In [8]:
import torch
import torch.nn as nn
import os
from transformers import BertModel, BertTokenizer, BertConfig

class Config(object):
    def __init__(self, dataset):
        self.model_name = 'bert'
        self.data_path = './data/'
        self.train_path = self.data_path + "train.txt"  # 训练集
        self.dev_path = self.data_path + "dev.txt"  # 验证集
        self.test_path = self.data_path + "test.txt"    # 测试集
        self.class_list = [
            x.strip() for x in open(self.data_path + "class.txt").readlines()
        ]   # 类别名单
        self.save_path = './cache'
        if not os.path.exists(self.save_path):
            os.mkdir(self.save_path)
        self.save_path += "/" + self.model_name + ".pt" # 模型训练结果
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.require_improvement = 1000 # 若超过1000batch效果还没有提升，则提前结束训练
        self.num_classes = len(self.class_list) # 类别数
        self.num_epochs = 3 # epoch数
        self.batch_size = 128   # mini-batch大小
        self.pad_size = 32  # 每句话处理成的长度（短补长截）
        self.learning_rate = 5e-5   # 学习率
        self.bert_path = './bert_pretrain'
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_path)
        self.bert_config = BertConfig.from_pretrained(self.bert_path + '/bert_config.json')
        self.hidden_size = 768


2024-12-29 10:45:04.808890: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-29 10:45:05.787816: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
config = Config('toutiao')
# 构建词表
vocab_dic = build_vocab('./data/test.txt', tokenizer=config.tokenizer, max_size=MAX_VOCAB_SIZE, min_freq=1)
print("生成的词表大小:", len(vocab_dic))
print(config.device, config.train_path)

构建词表中: 10000it [00:01, 8193.30it/s]

生成的词表大小: 4676
cuda ./data/train.txt





### 2. 实现Model类

In [35]:
from transformers import AutoModel

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        # self.bert = BertModel.from_pretrained("bert-base-chinese")
        # self.bert = AutoModel.from_pretrained("bert-base-chinese")
        self.bert = BertModel.from_pretrained(config.bert_path, config=config.bert_config)
        self.fc = nn.Linear(config.hidden_size, config.num_classes)

    def forward(self, x):
        context = x[0]
        mask = x[2]
        # _, pooled = self.bert(context, attention_mask=mask)
        # out = self.fc(pooled)
        
        outputs = self.bert(context, attention_mask=mask)
        pooled = outputs.pooler_output
        out = self.fc(pooled)

        return out


## 编写训练、测试、评估函数

### 1. 训练函数

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn import metrics
import time
# from utils import get_time_dif
from torch.optim import AdamW
from tqdm import tqdm
import math
import logging

In [38]:
def train(config, model, train_iter, dev_iter):
    start_time = time.time()
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.01
        },
        {
            "params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    loss_fn = nn.CrossEntropyLoss()  # 损失函数定义一次

    total_batch = 0  # 记录进行到多少batch
    dev_best_loss = float('inf')
    last_improve = 0  # 记录上次验证集loss下降的batch数
    flag = False  # 记录是否很久没有效果提升，用于判断是否早停

    model.train()
    for epoch in range(config.num_epochs):
        print(f"Epoch [{epoch + 1}/{config.num_epochs}]")
        for i, (trains, labels) in enumerate(tqdm(train_iter)):

            outputs = model(trains)
            model.zero_grad()
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()


            if total_batch % 200 == 0 and total_batch != 0:
                # 每 200 轮输出在训练集和验证集上的效果
                true = labels.data.cpu()
                predict = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predict)
                dev_acc, dev_loss = evaluate(config, model, dev_iter)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss  # 更新最佳验证集loss
                    torch.save(model.state_dict(), config.save_path)  # 保存模型
                    improve = '*'
                    last_improve = total_batch
                else:
                    improve = ""
                time_dif = get_time_dif(start_time)
                msg = (f"Epoch: {epoch + 1}, Batch: {i + 1}, Iter: {total_batch}, "
                       f"Train Loss: {loss.item():.2f}, Train Acc: {train_acc:.2%}, "
                       f"Val Loss: {dev_loss:.2f}, Val Acc: {dev_acc:.2%}, Time: {time_dif} {improve}")
                print(msg)
                model.train()

            total_batch += 1

            if total_batch - last_improve > config.require_improvement:
                # 验证集loss超过指定batch没有下降，提前结束训练
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break


    

### 2. 测试函数

In [16]:
def test(config, model, test_iter):
    # model.load_state_dict(torch.load(config.save_path))
    # 采用量化模型进行推理时需要关闭
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(config, model, test_iter, test=True)
    
    msg = "Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}"
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

### 3. 验证函数

In [17]:
def evaluate(config, model, data_iter, test=False):
    # 采用量化模型进行推理时需要关闭
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data_iter:
            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            
            loss_total += loss
            labels = labels.data.cpu().numpy()
            predict = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predict)
    acc = metrics.accuracy_score(labels_all, predict_all)
    if test:
        report = metrics.classification_report(labels_all, predict_all, target_names=config.class_list, digits=4)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion
    return acc, loss_total / len(data_iter)

## 运行主函数

In [18]:
import time
import torch
import numpy as np
# from train_eval import train, test
from importlib import import_module
import argparse
# from utils import build_dataset, build_iterator, get_time_dif

### 1. 加载和处理数据

In [32]:
dataset = "toutiao" # 数据集
config = Config(dataset)
print(config.train_path)

print("Loading data for Bert Model...")
train_data, dev_data, test_data = build_dataset(config)
train_iter = build_iterator(train_data, config)
dev_iter = build_iterator(dev_data, config)
test_iter = build_iterator(test_data, config)

./data/train.txt
Loading data for Bert Model...


180000it [00:30, 5993.65it/s]
10000it [00:02, 4746.87it/s]
10000it [00:01, 5976.95it/s]


### 2. 实例化模型

In [39]:
model = Model(config).to(config.device)
print(model)

Model(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

### 3. 模型训练

In [40]:
train(config, model, train_iter, dev_iter)

Epoch [1/3]


 14%|█▍        | 201/1407 [00:49<46:52,  2.33s/it]

Epoch: 1, Batch: 201, Iter: 200, Train Loss: 0.20, Train Acc: 94.53%, Val Loss: 0.31, Val Acc: 90.38%, Time: 0:00:50 *


 29%|██▊       | 401/1407 [01:41<43:58,  2.62s/it]

Epoch: 1, Batch: 401, Iter: 400, Train Loss: 0.28, Train Acc: 92.97%, Val Loss: 0.27, Val Acc: 91.88%, Time: 0:01:41 *


 43%|████▎     | 601/1407 [02:31<31:56,  2.38s/it]

Epoch: 1, Batch: 601, Iter: 600, Train Loss: 0.32, Train Acc: 92.19%, Val Loss: 0.27, Val Acc: 91.51%, Time: 0:02:32 *


 57%|█████▋    | 801/1407 [03:20<19:18,  1.91s/it]

Epoch: 1, Batch: 801, Iter: 800, Train Loss: 0.26, Train Acc: 92.19%, Val Loss: 0.27, Val Acc: 91.75%, Time: 0:03:21 


 59%|█████▉    | 834/1407 [03:27<02:22,  4.01it/s]


Epoch [2/3]


 12%|█▏        | 167/1407 [00:44<55:52,  2.70s/it]

Epoch: 2, Batch: 167, Iter: 1000, Train Loss: 0.29, Train Acc: 89.84%, Val Loss: 0.24, Val Acc: 92.38%, Time: 0:04:13 *


 26%|██▌       | 367/1407 [01:35<39:52,  2.30s/it]

Epoch: 2, Batch: 367, Iter: 1200, Train Loss: 0.41, Train Acc: 87.50%, Val Loss: 0.23, Val Acc: 92.61%, Time: 0:05:03 *


 40%|████      | 567/1407 [02:27<37:39,  2.69s/it]

Epoch: 2, Batch: 567, Iter: 1400, Train Loss: 0.24, Train Acc: 89.84%, Val Loss: 0.21, Val Acc: 93.18%, Time: 0:05:55 *


 55%|█████▍    | 767/1407 [03:16<20:30,  1.92s/it]

Epoch: 2, Batch: 767, Iter: 1600, Train Loss: 0.22, Train Acc: 92.97%, Val Loss: 0.22, Val Acc: 93.04%, Time: 0:06:45 


 69%|██████▊   | 967/1407 [04:06<14:05,  1.92s/it]

Epoch: 2, Batch: 967, Iter: 1800, Train Loss: 0.14, Train Acc: 95.31%, Val Loss: 0.22, Val Acc: 93.43%, Time: 0:07:34 


 83%|████████▎ | 1167/1407 [04:55<07:41,  1.92s/it]

Epoch: 2, Batch: 1167, Iter: 2000, Train Loss: 0.09, Train Acc: 97.66%, Val Loss: 0.23, Val Acc: 93.00%, Time: 0:08:24 


 97%|█████████▋| 1367/1407 [05:45<01:16,  1.92s/it]

Epoch: 2, Batch: 1367, Iter: 2200, Train Loss: 0.05, Train Acc: 99.22%, Val Loss: 0.22, Val Acc: 93.02%, Time: 0:09:13 


100%|██████████| 1407/1407 [05:53<00:00,  3.98it/s]


Epoch [3/3]


 11%|█▏        | 160/1407 [00:43<58:15,  2.80s/it]

Epoch: 3, Batch: 160, Iter: 2400, Train Loss: 0.12, Train Acc: 96.09%, Val Loss: 0.21, Val Acc: 93.61%, Time: 0:10:05 *


 26%|██▌       | 360/1407 [01:33<33:51,  1.94s/it]

Epoch: 3, Batch: 360, Iter: 2600, Train Loss: 0.19, Train Acc: 94.53%, Val Loss: 0.22, Val Acc: 93.38%, Time: 0:10:55 


 40%|███▉      | 560/1407 [02:23<27:08,  1.92s/it]

Epoch: 3, Batch: 560, Iter: 2800, Train Loss: 0.10, Train Acc: 96.09%, Val Loss: 0.21, Val Acc: 93.61%, Time: 0:11:45 


 54%|█████▍    | 760/1407 [03:12<20:44,  1.92s/it]

Epoch: 3, Batch: 760, Iter: 3000, Train Loss: 0.16, Train Acc: 95.31%, Val Loss: 0.23, Val Acc: 93.30%, Time: 0:12:34 


 68%|██████▊   | 960/1407 [04:02<14:21,  1.93s/it]

Epoch: 3, Batch: 960, Iter: 3200, Train Loss: 0.04, Train Acc: 100.00%, Val Loss: 0.25, Val Acc: 93.19%, Time: 0:13:24 


 82%|████████▏ | 1159/1407 [04:51<01:02,  3.97it/s]

Epoch: 3, Batch: 1160, Iter: 3400, Train Loss: 0.06, Train Acc: 98.44%, Val Loss: 0.24, Val Acc: 93.10%, Time: 0:14:13 
No optimization for a long time, auto-stopping...





In [41]:
test(config,model, test_iter)

Test Loss:  0.21, Test Acc: 93.75%
Precision, Recall and F1-Score...
               precision    recall  f1-score   support

      finance     0.9363    0.9120    0.9240      1000
       realty     0.9754    0.9130    0.9432      1000
       stocks     0.8286    0.9430    0.8821      1000
    education     0.9678    0.9630    0.9654      1000
      science     0.9141    0.8940    0.9039      1000
      society     0.9559    0.9320    0.9438      1000
     politics     0.9295    0.9100    0.9197      1000
       sports     0.9831    0.9860    0.9845      1000
         game     0.9448    0.9580    0.9513      1000
entertainment     0.9563    0.9640    0.9602      1000

     accuracy                         0.9375     10000
    macro avg     0.9392    0.9375    0.9378     10000
 weighted avg     0.9392    0.9375    0.9378     10000

Confusion Matrix...
[[912   5  63   1   8   1   8   1   1   0]
 [ 15 913  42   2   3   8   5   2   3   7]
 [ 28   4 943   0  15   0   8   0   1   1]
 [  4   1