<a href="https://colab.research.google.com/github/AlvinScrp/d2l-zh-pytorch-colab/blob/main/homework/toxic-comment-classification-challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
将BERT集成到您现有的训练代码中 - ModelScope版本
使用ModelScope BERT模型进行多标签分类
"""

import torch
import torch.nn as nn
from torch.optim import AdamW
from transformers import DistilBertModel, BertTokenizer
import pandas as pd
import os
import numpy
# Print the installed PyTorch and NumPy versions.
print(f"PyTorch版本: {torch.__version__}")
print(f"NumPy: {numpy.__version__}")


# 工具类
class Accumulator:
    """Keep running float totals in ``n`` positional slots."""

    def __init__(self, n):
        # One total per tracked quantity, each starting at zero.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional value (converted with ``float``) to its slot.

        Slots and values are paired by position with ``zip``, so the stored
        list ends up as long as the shorter of the two.
        """
        updated = []
        for total, value in zip(self.data, args):
            updated.append(total + float(value))
        self.data = updated

    def reset(self):
        """Set every slot back to ``0.0`` while keeping the slot count."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running total stored at position ``idx``."""
        return self.data[idx]
class Timer:
    """Wall-clock stopwatch that starts when the instance is created."""

    def __init__(self):
        # The ``time`` module is imported here and kept on the instance so
        # that ``stop`` can reach it through ``self.time``.
        import time as time_module
        self.time = time_module
        self.start_time = time_module.time()

    def stop(self):
        """Return the seconds elapsed since construction.

        The start time is not modified, so this can be called repeatedly.
        """
        now = self.time.time()
        return now - self.start_time

def try_all_gpus():
    """Return the ``cuda`` device if CUDA is available, otherwise ``cpu``.

    Despite the name, a single ``torch.device`` is returned, not a list.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

# ============ 使用DistilBERT (推荐!) ============
class DistilBERTSentimentClassifier(nn.Module):
    """DistilBERT encoder with a linear multi-label classification head.

    The encoder is loaded from the ``distilbert-base-uncased`` checkpoint.
    Its embeddings and every transformer layer except the last are frozen,
    so only the last transformer layer and the classifier receive gradients.

    Args:
        num_classes: number of output logits (one per label).
        dropout: dropout probability applied before the classifier.
    """

    def __init__(self, num_classes=6, dropout=0.1):
        super().__init__()
        from transformers import DistilBertModel

        # Original author's note: DistilBERT has ~66M parameters vs ~110M
        # for BERT.
        self.bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

        # Aggressive freezing: embeddings plus all transformer layers but
        # the final one are excluded from training.
        frozen_modules = [self.bert.embeddings, *self.bert.transformer.layer[:-1]]
        for module in frozen_modules:
            for weight in module.parameters():
                weight.requires_grad = False

        self.dropout = nn.Dropout(dropout)
        # One raw logit per label; no sigmoid is applied inside the model.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw logits computed from the first token's hidden state.

        The logits are left un-activated; the sigmoid is expected to be
        applied by the loss function or by the caller.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # No pooler_output is used here; the hidden state at position 0 of
        # each sequence serves as the pooled representation.
        first_token_state = encoded.last_hidden_state[:, 0]
        logits = self.classifier(self.dropout(first_token_state))
        return logits


# ============ 修改后的训练函数 ============
def train_bert_epoch(net, train_iter, loss, updater, device):
    """Train ``net`` for one epoch, with CUDA mixed precision when on GPU.

    NOTE(review): another ``train_bert_epoch`` is defined later in this file.
    When the file is executed top to bottom that later definition rebinds
    the name, so this mixed-precision variant would not be the one called.
    Confirm which definition is meant to be in effect.

    Args:
        net: model called as ``net(input_ids, attention_mask)``.
        train_iter: iterable yielding ``(input_ids, attention_mask, labels)``.
        loss: callable ``loss(y_hat, labels)``; its result is reduced with
            ``.sum()`` before back-propagation.
        updater: optimizer. Parameters are only updated when it is a
            ``torch.optim.Optimizer``; any other object is ignored.
        device: ``torch.device`` or anything ``torch.device`` accepts
            (for example the string ``'cuda'``).

    Returns:
        ``(summed loss / sample count, mean per-label accuracy)``.

    Raises:
        ZeroDivisionError: if ``train_iter`` yields no samples.
    """
    # Normalise so that ``.type`` is available even for string devices.
    device = torch.device(device)
    net.train()
    metric = Accumulator(3)  # summed loss, accuracy * batch size, sample count

    # Mixed precision (autocast + gradient scaling) is only used on CUDA.
    use_amp = device.type == 'cuda'
    scaler = torch.cuda.amp.GradScaler() if use_amp else None

    for input_ids, attention_mask, labels in train_iter:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Forward pass, under autocast when mixed precision is enabled.
        if use_amp:
            with torch.cuda.amp.autocast():
                y_hat = net(input_ids, attention_mask)
                l = loss(y_hat, labels)
        else:
            y_hat = net(input_ids, attention_mask)
            l = loss(y_hat, labels)
        batch_loss = l.sum()

        # Backward pass and parameter update.
        if isinstance(updater, torch.optim.Optimizer):
            updater.zero_grad()
            if use_amp:
                scaler.scale(batch_loss).backward()
                # Unscale first so clipping sees the true gradient norms.
                scaler.unscale_(updater)
                torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
                scaler.step(updater)
                scaler.update()
            else:
                batch_loss.backward()
                torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
                updater.step()

        # Bookkeeping; the accuracy is weighted by the batch size.
        with torch.no_grad():
            acc = multilabel_accuracy(y_hat, labels)
            metric.add(batch_loss, acc * labels.shape[0], labels.shape[0])

    return metric[0] / metric[2], metric[1] / metric[2]


# 训练函数
def multilabel_accuracy(y_hat, y):
    """Return the fraction of label entries predicted correctly.

    ``y_hat`` holds raw logits. An entry counts as a positive prediction when
    its sigmoid is strictly greater than 0.5. ``y`` is cast to bool and
    compared element-wise, and the mean over all entries is returned as a
    Python float.
    """
    probabilities = torch.sigmoid(y_hat)
    predicted_positive = probabilities > 0.5
    targets = y.bool()
    matches = predicted_positive == targets
    return matches.float().mean().item()
def train_bert_epoch(net, train_iter, loss, updater, device):
    """Run one full-precision training epoch over ``train_iter``.

    Each batch is unpacked as ``(input_ids, attention_mask, labels)``. The
    loss returned by ``loss`` is reduced with ``.sum()``, and gradients are
    clipped to a maximum norm of 1.0 before ``updater.step()``.

    Returns:
        ``(summed loss / sample count, mean per-label accuracy)``.
    """
    net.train()
    totals = Accumulator(3)  # summed loss, accuracy * batch size, sample count
    for input_ids, attention_mask, labels in train_iter:
        input_ids, attention_mask, labels = (
            tensor.to(device) for tensor in (input_ids, attention_mask, labels)
        )

        logits = net(input_ids, attention_mask)
        batch_loss = loss(logits, labels).sum()

        updater.zero_grad()
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
        updater.step()

        with torch.no_grad():
            batch_size = labels.shape[0]
            batch_acc = multilabel_accuracy(logits, labels)
            totals.add(batch_loss, batch_acc * batch_size, batch_size)

    sample_count = totals[2]
    return totals[0] / sample_count, totals[1] / sample_count

def evaluate_bert_accuracy(net, data_iter, device):
    """Return the sample-weighted per-label accuracy of ``net`` on ``data_iter``.

    The model is put in eval mode and run without gradient tracking. Each
    batch's accuracy is weighted by its size before averaging.
    """
    net.eval()
    totals = Accumulator(2)  # accuracy * batch size, sample count
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_iter:
            input_ids, attention_mask, labels = (
                tensor.to(device) for tensor in (input_ids, attention_mask, labels)
            )
            logits = net(input_ids, attention_mask)
            batch_size = labels.shape[0]
            totals.add(multilabel_accuracy(logits, labels) * batch_size, batch_size)
    weighted_correct, sample_count = totals[0], totals[1]
    return weighted_correct / sample_count

def train_bert_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices, scheduler=None):
    """Run the full train / evaluate loop and print per-epoch metrics.

    Args:
        net: model to train; moved to the first (or only) device.
        train_iter: training batches, passed to ``train_bert_epoch``.
        test_iter: evaluation batches, passed to ``evaluate_bert_accuracy``.
        loss: loss callable forwarded to ``train_bert_epoch``.
        trainer: optimizer forwarded to ``train_bert_epoch``.
        num_epochs: number of epochs; with ``0`` nothing is trained and the
            final summary line is skipped.
        devices: a single device, or a list/tuple of devices. With more than
            one entry the model is wrapped in ``nn.DataParallel``.
        scheduler: optional LR scheduler, stepped once per epoch.

    Returns:
        None. Progress is reported with ``print``.
    """
    print('training on', devices)

    # Tuples are accepted as well as lists for the multi-device form.
    is_device_sequence = isinstance(devices, (list, tuple))
    if is_device_sequence and len(devices) > 1:
        # Multi-GPU: replicate the model across the listed devices.
        net = nn.DataParallel(net, device_ids=list(devices))

    device = devices[0] if is_device_sequence else devices
    net = net.to(device)

    timer = Timer()
    # Pre-bind so the summary below cannot hit an unbound name when the
    # epoch loop does not execute.
    train_acc = test_acc = None

    for epoch in range(num_epochs):
        # Train
        train_loss, train_acc = train_bert_epoch(
            net, train_iter, loss, trainer, device
        )

        # Validate
        test_acc = evaluate_bert_accuracy(net, test_iter, device)

        # Learning-rate schedule
        if scheduler is not None:
            scheduler.step()

        print(f'Epoch {epoch + 1}: '
              f'loss {train_loss:.3f}, '
              f'train acc {train_acc:.3f}, '
              f'test acc {test_acc:.3f}')

    print(f'Training completed in {timer.stop():.1f} sec')
    if train_acc is not None and test_acc is not None:
        print(f'Final: train acc {train_acc:.3f}, test acc {test_acc:.3f}')


def read_toxic_comments_real(data_dir, max_samples=None, is_train=True):
    """Load the Kaggle Toxic Comment Classification CSV for one split.

    Reads ``train.csv`` (when ``is_train``) or ``test.csv`` from ``data_dir``
    and, if ``max_samples`` is truthy, keeps only the first ``max_samples``
    rows.

    Returns:
        ``(texts, labels, ids)``. ``labels`` is a list of six-element lists
        for the training split and ``None`` for the test split.
    """
    import pandas as pd
    import os

    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    file_name = 'train.csv' if is_train else 'test.csv'
    csv_path = os.path.join(data_dir, file_name)
    if is_train:
        print(f"读取训练数据: {csv_path}")
    else:
        print(f"读取测试数据: {csv_path}")

    frame = pd.read_csv(csv_path)
    # Truthiness check: both None and 0 mean "no limit".
    if max_samples:
        frame = frame.head(max_samples)

    texts = frame['comment_text'].tolist()

    if not is_train:
        # The test split carries no label columns.
        ids = frame['id'].tolist()
        print(f"加载测试数据: {len(texts)} 条")
        return texts, None, ids

    labels = frame[label_columns].values.tolist()
    ids = frame['id'].tolist()

    print(f"加载训练数据: {len(texts)} 条")
    positives_per_label = frame[label_columns].sum().tolist()
    print(f"标签分布: {dict(zip(label_columns, positives_per_label))}")

    return texts, labels, ids

def generate_submission(model, test_loader, device, test_ids, output_path):
    """Predict label probabilities and write a Kaggle submission CSV.

    Args:
        model: callable as ``model(input_ids, attention_mask)`` returning
            logits with one column per label; ``model.eval()`` is called.
        test_loader: iterable of ``(input_ids, attention_mask, _)`` batches;
            the third element is ignored.
        device: device the inputs are moved to before the forward pass.
        test_ids: ids written to the ``id`` column, one per predicted row.
        output_path: destination path of the CSV file.

    Returns:
        The submission ``DataFrame`` (``id`` plus one probability column per
        label). With an empty ``test_loader`` an empty frame is written and
        the per-label statistics are skipped instead of dividing by zero.
    """
    import pandas as pd

    model.eval()
    predictions = []

    print("🔮 生成预测结果...")
    with torch.no_grad():
        for input_ids, attention_mask, _ in test_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            # Logits -> independent per-label probabilities.
            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits).cpu().numpy()
            predictions.extend(probs)

    # Assemble the submission frame column by column.
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    submission_df = pd.DataFrame({
        'id': test_ids,
        **{col: [pred[i] for pred in predictions] for i, col in enumerate(label_columns)}
    })

    # Save the submission file.
    submission_df.to_csv(output_path, index=False)
    print(f"💾 提交文件已保存: {output_path}")
    print(f"📊 预测统计:")
    # Averages are undefined without predictions; skip them rather than
    # raising ZeroDivisionError after the file has been written.
    if predictions:
        for i, col in enumerate(label_columns):
            avg_prob = sum(pred[i] for pred in predictions) / len(predictions)
            print(f"  {col}: 平均概率 {avg_prob:.4f}")

    return submission_df

# 数据集
class BERTDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: sequence of raw texts; each is passed through ``str()``.
        labels: per-text label vectors, or ``None`` for unlabeled data, in
            which case all-zero placeholder labels are used.
        tokenizer: callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_length: padding / truncation length forwarded to the tokenizer.
        num_labels: width of the placeholder label vectors used when
            ``labels`` is ``None``.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128, num_labels=6):
        self.texts = texts
        if labels is not None:
            self.labels = torch.tensor(labels, dtype=torch.float32)
        else:
            # Unlabeled data: keep the item shape uniform with zero labels.
            self.labels = torch.zeros((len(texts), num_labels), dtype=torch.float32)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, label)`` for item ``idx``."""
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch axis. A bare squeeze() would also
        # collapse the sequence axis when max_length == 1.
        # assumes the tokenizer returns tensors with a leading batch axis of
        # size 1 for a single text — TODO confirm for non-HF tokenizers
        return (
            encoding['input_ids'].squeeze(0),
            encoding['attention_mask'].squeeze(0),
            self.labels[idx]
        )

    def __len__(self):
        """Return the number of label rows (one per text)."""
        return len(self.labels)



In [None]:

# ============ Main script ============
print("🚀 启动 BERT多标签分类训练")
# Data directory
data_dir = 'toxic-comment'
# Load the data
print("📊 加载真实Kaggle数据...")
# Cap the number of training rows to keep training fast (adjust as needed)
train_texts, train_labels, train_ids = read_toxic_comments_real(
    data_dir, max_samples=2000, is_train=True
)
# Build a validation set: the last 20% of the loaded training rows
val_split = int(len(train_texts) * 0.8)
val_texts = train_texts[val_split:]
val_labels = train_labels[val_split:]
train_texts = train_texts[:val_split]
train_labels = train_labels[:val_split]
# Read the test data
test_texts, _, test_ids = read_toxic_comments_real(
    data_dir, max_samples=None, is_train=False
)
# 1. First, initialize the BERT tokenizer
# Author's note: DistilBERT and BERT use the same tokenizer, because DistilBERT was distilled from BERT and keeps the same vocabulary and tokenization scheme.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_bert_sentiment-analysis_english-base')
print(f"\n📊 数据统计:")
print(f"训练数据: {len(train_texts)} 条")
print(f"验证数据: {len(val_texts)} 条")
print(f"测试数据: {len(test_texts)} 条")
# Inspect one sample of the data
print(f"\n📝 数据样例:")
print(f"文本长度: {len(train_texts[0])}")
print(f"前100字符: {train_texts[0][:100]}")
print(f"标签: {train_labels[0]}")
# Sanity-check the tokenizer (uses max_length=128 here; the datasets below use num_steps)
sample_encoding = tokenizer(train_texts[0], max_length=128, truncation=True, padding='max_length', return_tensors='pt')
print(f"\n🔧 Tokenizer测试:")
print(f"input_ids shape: {sample_encoding['input_ids'].shape}")
print(f"attention_mask shape: {sample_encoding['attention_mask'].shape}")
print(f"实际token数: {sample_encoding['attention_mask'].sum()}")
# Model and training hyperparameters
num_classes = 6  # six labels: toxic, severe_toxic, obscene, threat, insult, identity_hate
dropout = 0.1
batch_size = 16  # sized for the hosted (online) environment
num_steps = 32  # sequence length, passed as max_length to the datasets below
lr = 1e-4  # learning rate
num_epochs = 3  # raised to 3 epochs for better results
 # Original note: "use a BERT model that actually exists on ModelScope" (the model built below is DistilBERTSentimentClassifier)

# Build the model
net = DistilBERTSentimentClassifier(
    num_classes=num_classes,
    dropout=dropout
)

# Build the datasets and data loaders
train_dataset = BERTDataset(train_texts, train_labels, tokenizer, max_length=num_steps)
val_dataset = BERTDataset(val_texts, val_labels, tokenizer, max_length=num_steps)
test_dataset = BERTDataset(test_texts, None, tokenizer, max_length=num_steps)
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  num_workers=2, pin_memory=False)
val_iter = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=False)
test_iter = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=False)
# Optimizer - two parameter groups: the classifier at 10x the base learning rate,
# and the remaining parameters that still have requires_grad=True at the base rate
trainer = AdamW([
    {'params': [p for name, p in net.named_parameters() if 'classifier' in name], 'lr': lr * 10},  # classifier head
    {'params': [p for name, p in net.named_parameters() if 'classifier' not in name and p.requires_grad], 'lr': lr}  # BERT layers
], weight_decay=0.01)
# Loss - BCEWithLogitsLoss for multi-label classification
loss = nn.BCEWithLogitsLoss(reduction="none")  # unreduced: one loss value per sample per label
# Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🔥 使用设备: {device}")
# Start training (validation data is passed as the test iterator; no scheduler)
train_bert_ch13(net, train_iter, val_iter, loss, trainer, num_epochs, device, None)
print("\n" + "="*60)
print("🎉 ModelScope BERT训练完成!")
print("="*60)
# print(f"✅ 模型:{model_name}")
print(f"✅ 训练样本: {len(train_texts)}")
print(f"✅ 验证样本: {len(val_texts)}")
print(f"✅ 测试样本: {len(test_texts)}")
print("✅ 支持多标签分类")
# NOTE(review): the next message reports mixed-precision training, but the last
# train_bert_epoch defined in this file uses no autocast/GradScaler — confirm
# which definition is in effect.
print("✅ 混合精度训练")
print("="*60)