In [10]:
import time
import os
import io
from tqdm import tqdm
from qqdm import qqdm, format_str
import pandas as pd
import numpy as np
import random
import pickle
from PIL import Image

from torch.utils.data import Dataset, DataLoader
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision import models

import transformers
from datasets import load_dataset, load_from_disk
from transformers import AutoModel, AutoConfig, AutoTokenizer
from transformers import AdamW

import matplotlib.pyplot as plt
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
    roc_curve,
    auc
)
plt.rcParams['font.sans-serif']=['SimHei']  # use the SimHei font so Chinese labels display correctly
plt.rcParams['axes.unicode_minus']=False  # make the minus sign display correctly

# Suppress warnings (the red notices in the notebook output)
import warnings
warnings.filterwarnings("ignore")

In [2]:
def same_seeds(seed):
    """Seed every random number generator used here so runs are repeatable.

    Seeds Python's ``random``, NumPy and PyTorch (plus the CUDA generators
    when CUDA is available) and puts cuDNN into deterministic mode.

    Args:
        seed: Seed value handed to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        # Seed the current device, then every visible device.
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Disable cuDNN autotuning and request deterministic kernels.
    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True


same_seeds(7890)

In [None]:
# Load the tokenizer from the pretrained model
tokenizer = AutoTokenizer.from_pretrained('../models/hfl_rbt6')  # could be swapped for 'bert-base-chinese'; downloading the model locally is recommended

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [4]:
class TextDataset(Dataset):
    """Dataset of ``(text, label)`` samples read from a pickle file.

    The pickle must hold an indexable, sized collection whose items provide
    a ``"text"`` entry (an iterable) and a ``"label"`` entry.

    NOTE(review): ``pickle.load`` can execute arbitrary code, so only open
    files that come from a trusted source.
    """

    def __init__(self, data_file):
        """Load the whole pickle at ``data_file`` into ``self.data``."""
        with open(data_file, 'rb') as handle:
            records = pickle.load(handle)
        self.data = records

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for record ``idx``.

        ``text`` is a new list built from the elements of the record's
        ``"text"`` entry (for a string, that is one element per character).
        """
        text = list(self.data[idx]["text"])
        label = self.data[idx]["label"]
        return text, label

# Create a dataset instance
dataset = TextDataset('train.pkl')  # class name changed

# Fetch the tokens and label of the sixth sample (index 5)
tokens, labels = dataset[5]
# Show the dataset size together with the sixth sample's tokens and label
len(dataset), tokens, labels

(7531,
 ['不',
  '要',
  '再',
  '随',
  '便',
  '买',
  '奶',
  '制',
  '品',
  '给',
  '孩',
  '子',
  '喝',
  '了',
  '❗',
  '️',
  '幼',
  '儿',
  '园',
  '都',
  '发',
  '通',
  '知',
  '了',
  '家',
  '长',
  '们',
  '注',
  '意',
  '：',
  '现',
  '在',
  '得',
  '白',
  '血',
  '病',
  '的',
  '小',
  '孩',
  '越',
  '来',
  '越',
  '多',
  '，',
  '妇',
  '幼',
  '保',
  '健',
  '院',
  '提',
  '示',
  '您',
  '，',
  '请',
  '不',
  '要',
  '给',
  '宝',
  '宝',
  '喝',
  '爽',
  '歪',
  '歪',
  '和',
  '有',
  '添',
  '加',
  '剂',
  '的',
  '牛',
  '奶',
  '饮',
  '料',
  '，',
  '告',
  '诉',
  '家',
  '里',
  '有',
  '小',
  '孩',
  '的',
  '朋',
  '友',
  '，',
  '旺',
  '仔',
  '牛',
  '奶',
  '、',
  '可',
  '口',
  '可',
  '乐',
  '、',
  '爽',
  '歪',
  '歪',
  '、',
  '娃',
  '哈',
  '哈',
  'A',
  'D',
  '钙',
  '奶',
  '、',
  '未',
  '来',
  '星',
  '、',
  'Q',
  'Q',
  '星',
  '、',
  '美',
  '汁',
  '源',
  '果',
  '粒',
  '奶',
  '优',
  '菠',
  '萝',
  '味',
  '的',
  '。',
  '都',
  '含',
  '有',
  '肉',
  '毒',
  '杆',
  '菌',
  '。',
  '现',
  '在',
  '紧',
  '急',
  '召',
  '回',
  '。']

In [5]:
# Collate hook for the DataLoader: turns a list of samples into one tokenized batch
def collate_fn(batch):
    """Tokenize a batch of ``(text, label)`` samples and move it to ``device``.

    Args:
        batch: Sequence of ``(text, label)`` pairs; each ``text`` is passed
            to the tokenizer as an already-split list of tokens.

    Returns:
        Tuple ``(encodings, labels)``: the tokenizer's batch encoding
        (PyTorch tensors) and a label tensor, both moved to the
        module-level ``device``.
    """
    texts = [sample[0] for sample in batch]
    targets = [sample[1] for sample in batch]

    # Same tokenizer options as before, gathered in one place.
    encode_options = dict(
        truncation=True,
        padding=True,
        return_tensors='pt',
        is_split_into_words=True,
        max_length=512 - 2,
    )
    encodings = tokenizer.batch_encode_plus(texts, **encode_options)

    label_tensor = torch.tensor(targets)

    # Both parts of the batch are moved to the configured device (GPU or CPU).
    return encodings.to(device), label_tensor.to(device)

loader = DataLoader(dataset=dataset,
                    batch_size=16,  # batch size
                    collate_fn=collate_fn,  # function used to assemble each batch
                    shuffle=True,  # shuffle the data
                    drop_last=True)  # drop the final batch if it is smaller than batch_size

# Iterate over the data loader and print only the first batch
i = 0
for data in loader:
    print(data)  # print the batch
    i += 1
    if i == 1:  # stop once one batch has been processed
        break

({'input_ids': tensor([[ 101, 1453, 3314,  ...,    0,    0,    0],
        [ 101, 1333,  889,  ...,    0,    0,    0],
        [ 101,  523, 5401,  ...,    0,    0,    0],
        ...,
        [ 101,  523,  700,  ...,    0,    0,    0],
        [ 101, 7028, 2412,  ...,    0,    0,    0],
        [ 101,  108, 1921,  ...,    0,    0,    0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}, tensor([1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0], device='cuda:0'))


In [6]:
class TextClassifier(nn.Module):
    """Pretrained text encoder followed by a small two-class MLP head.

    The head ends with ``nn.Softmax(dim=1)``, so ``forward`` returns class
    probabilities, not raw logits. Losses that expect unnormalised logits
    (for example ``nn.CrossEntropyLoss``) would apply softmax a second time
    to this output.
    """

    def __init__(self, text_model_path):
        """Load the encoder and build the classification head.

        Args:
            text_model_path: Path or identifier passed to
                ``AutoModel.from_pretrained``.
        """
        super().__init__()
        # Pretrained text encoder.
        self.text_model = AutoModel.from_pretrained(text_model_path)

        # The loaded encoder already carries its config, so read the hidden
        # size from it instead of loading the config from disk a second time.
        hidden_size = self.text_model.config.hidden_size

        # Classification head: hidden_size -> 512 -> 2, then softmax over classes.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 512),
            nn.ReLU(),
            nn.Linear(512, 2),
            nn.Softmax(dim=1),
        )

    def forward(self, input_ids, attention_mask):
        """Classify a batch of tokenized texts.

        Args:
            input_ids: Token-id tensor forwarded to the encoder.
            attention_mask: Attention-mask tensor forwarded to the encoder.

        Returns:
            Tensor with one row per input and two columns holding the
            softmax class probabilities.
        """
        # First element of the encoder output; indexed below as (batch, position, feature).
        hidden_states = self.text_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Keep only position 0 of every sequence (the [CLS] token's features).
        cls_features = hidden_states[:, 0, :]

        # Named ``probs`` because the head's final layer is a softmax.
        probs = self.classifier(cls_features)
        return probs

In [None]:
num_epochs = 50
best_loss = 99  # not read by any cell shown here; kept so existing references keep working
# Forward slashes work on every OS. The previous "..\models" relied on the
# invalid escape "\m" and was only a valid relative path on Windows.
model_path = "../models"
model = TextClassifier('../models/hfl_rbt6').to(device)


class _ProbabilityNLLLoss(nn.Module):
    """Cross-entropy for inputs that are already softmax probabilities.

    ``TextClassifier`` ends with ``nn.Softmax``, while ``nn.CrossEntropyLoss``
    expects unnormalised logits and applies log-softmax itself. Feeding it
    probabilities applies softmax twice, which squashes the loss and its
    gradients. Taking the log of the probabilities and using NLL gives the
    intended cross-entropy.
    """

    def forward(self, probs, target):
        # Clamp before the log so an exactly-zero probability cannot produce -inf.
        return F.nll_loss(torch.log(probs.clamp_min(1e-12)), target)


criterion = _ProbabilityNLLLoss()  # cross-entropy on the model's probability outputs
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

In [8]:
test_dataset = TextDataset("test.pkl")
# Evaluation should score every test sample in a fixed order. With
# shuffle=True and drop_last=True each epoch evaluated a different,
# incomplete subset, so test metrics (and any checkpoint selection based on
# them) were not comparable between epochs.
test_loader = DataLoader(dataset=test_dataset,
                         batch_size=16,
                         collate_fn=collate_fn,
                         shuffle=False,
                         drop_last=False)

In [None]:
def _epoch_metrics(split, epoch, loss, labels, preds, scores):
    """Build one metrics row for ``split`` ('train' or 'test').

    Args:
        split: Prefix used for the metric keys.
        epoch: 1-based epoch number.
        loss: Mean loss of the epoch.
        labels: Ground-truth class labels.
        preds: Hard class predictions (used for acc / precision / recall / F1).
        scores: Continuous positive-class scores, used only for average
            precision, which ranks samples and is degenerate on 0/1 predictions.

    Returns:
        Dict with keys ``epoch`` and ``{split}_loss/acc/precision/recall/f1/mAP``.
    """
    return {
        'epoch': epoch,
        f'{split}_loss': loss,
        f'{split}_acc': accuracy_score(labels, preds),
        f'{split}_precision': precision_score(labels, preds),
        f'{split}_recall': recall_score(labels, preds),
        f'{split}_f1': f1_score(labels, preds),
        f'{split}_mAP': average_precision_score(labels, scores),
    }


def _positive_scores(outputs):
    """Return, per row, the margin of class 1 over class 0 as a Python list.

    Average precision only depends on the ranking of the scores, and this
    margin ranks samples by their class-1 preference whether ``outputs``
    holds probabilities or logits.
    """
    return (outputs[:, 1] - outputs[:, 0]).detach().cpu().tolist()


train_metrics = []
test_metrics = []

best_test_loss = float('inf')  # lowest test loss seen so far

for epoch in range(num_epochs):
    # ---- training ----
    model.train()
    train_preds = []
    train_labels = []
    train_scores = []
    tot_loss = []

    qqdm_loader = qqdm(loader, desc=format_str('bold', f'Training Epoch {epoch+1}'))
    for batch in qqdm_loader:
        batch_texts, batch_labels = batch
        optimizer.zero_grad()
        outputs = model(batch_texts['input_ids'], batch_texts['attention_mask'])
        loss = criterion(outputs, batch_labels)
        tot_loss.append(loss.item())
        loss.backward()
        optimizer.step()

        # detach() replaces the deprecated `.data` access.
        detached = outputs.detach()
        train_preds.extend(detached.argmax(dim=1).cpu().tolist())
        train_scores.extend(_positive_scores(detached))
        train_labels.extend(batch_labels.cpu().tolist())

        # Show the running mean loss on the progress bar.
        qqdm_loader.set_infos({
            'loss': f'{np.mean(tot_loss):.4f}',
        })

    mean_loss = np.mean(tot_loss)

    train_metrics.append(
        _epoch_metrics('train', epoch + 1, mean_loss, train_labels, train_preds, train_scores)
    )

    # ---- evaluation ----
    model.eval()
    test_preds = []
    test_labels = []
    test_scores = []
    test_loss = 0.0  # sum of per-sample losses
    test_seen = 0    # number of samples evaluated so far

    qqdm_test_loader = qqdm(test_loader, desc=format_str('bold', f'Testing Epoch {epoch+1}'))
    with torch.no_grad():
        for batch in qqdm_test_loader:
            batch_texts, batch_labels = batch
            outputs = model(batch_texts['input_ids'], batch_texts['attention_mask'])
            loss = criterion(outputs, batch_labels)

            # Weight each batch by its size so a smaller final batch does not
            # skew the mean (assumes `criterion` returns the batch mean).
            batch_size = batch_labels.size(0)
            test_loss += loss.item() * batch_size
            test_seen += batch_size

            test_preds.extend(outputs.argmax(dim=1).cpu().tolist())
            test_scores.extend(_positive_scores(outputs))
            test_labels.extend(batch_labels.cpu().tolist())

            # Average over our own counter instead of the progress bar's `.n`,
            # matching how the training loop reports its running loss.
            qqdm_test_loader.set_infos({
                'loss': f'{test_loss / test_seen:.4f}',
            })

    epoch_test_loss = test_loss / test_seen

    test_metrics.append(
        _epoch_metrics('test', epoch + 1, epoch_test_loss, test_labels, test_preds, test_scores)
    )

    # Keep the checkpoint with the lowest test loss.
    # NOTE(review): selecting on the test set makes the reported test metrics
    # optimistic; a separate validation split would avoid that.
    if epoch_test_loss < best_test_loss:
        best_test_loss = epoch_test_loss
        torch.save(model, os.path.join(model_path, 'FN_best_text.pth'))

# Write the per-epoch metrics to CSV
pd.DataFrame(train_metrics).to_csv('train_metrics_text.csv', index=False)
pd.DataFrame(test_metrics).to_csv('test_metrics_text.csv', index=False)

 [1mIters[0m    [1mElapsed Time[0m      [1mSpeed[0m                                                                                       
 [99m0/[93m470[0m[0m  [99m        -        [0m  [99m   -    [0m                                                                                     
[1mTraining Epoch 1[0m   0.0% |                                                                                              |[K[F[K[F [1mIters[0m    [1mElapsed Time[0m      [1mSpeed[0m     [1mloss[0m                                                                              
 [99m1/[93m470[0m[0m  [99m00:00:00<[93m00:03:08[0m[0m  [99m2.48it/s[0m  [99m0.7093[0m                                                                             
[1mTraining Epoch 1[0m   0.2% |                                                                                              |[K[F[K[F [1mIters[0m    [1mElapsed Time[0m      [1mSpeed[0m     [1mloss[0m                