<a href="https://colab.research.google.com/github/AlvinScrp/d2l-zh-pytorch-colab/blob/main/homework/toxic-comment-classification-challenge_gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import urllib.request
from pathlib import Path

# === 1. Global configuration ===
# Base URL that the dataset CSV files are downloaded from (one request per file name).
URLPrefix = "https://pro-5gu0t2os8cdd45f2-1251420592.tcloudbaseapp.com/toxic-comment-classification"
# Local directory the CSV files are stored in.
DATA_DIR = Path("toxic-comment")
# File names fetched by prepare_csv_list().
FILENAMES = ["train.csv","test.csv","test_labels.csv","sample_submission.csv"]

# NOTE(review): the three constants below are not referenced by any code visible in this
# notebook (the training cell hard-codes its own batch size and worker counts) —
# confirm whether they are still needed.
BATCH_SIZE = 8
RANDOM_STATE = 123
NUM_WORKERS = 2


# === 2. 数据准备 ===
def prepare_csv_list():
    """Download every file listed in ``FILENAMES`` into ``DATA_DIR`` unless it already exists.

    Each file is streamed into a temporary ``<name>.part`` file and only renamed to its
    final name after the download has finished. An interrupted download therefore never
    leaves a truncated CSV behind that a later run would report as "already exists".

    Errors raised by ``urllib.request.urlopen`` or by the file operations propagate to
    the caller; the partial file is removed before they do.
    """
    # mkdir(exist_ok=True) is already idempotent, so no separate existence check is needed.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Copy in 1 MiB chunks instead of holding a whole CSV in memory.
    chunk_size = 1 << 20

    for fileName in FILENAMES:
        URL = f"{URLPrefix}/{fileName}"
        DATA_FILE = DATA_DIR / fileName
        if DATA_FILE.exists():
            print(f"✅ already exists: {fileName} ")
            continue

        print(f"⬇️ Downloading {fileName}...")
        partial_file = DATA_DIR / (fileName + ".part")
        try:
            with urllib.request.urlopen(URL) as r, open(partial_file, "wb") as f:
                for chunk in iter(lambda: r.read(chunk_size), b""):
                    f.write(chunk)
            # Only a completely written file is moved to its final name.
            partial_file.replace(DATA_FILE)
        finally:
            # After a successful rename nothing is left; after a failure drop the leftovers.
            if partial_file.exists():
                partial_file.unlink()

# Fetch the dataset files when this cell runs; files already present in DATA_DIR are skipped.
prepare_csv_list()

✅ Dataset already exists.
✅ Dataset already exists.
✅ Dataset already exists.
✅ Dataset already exists.


## 模型：GPT2ClassificationModel

In [6]:
import torch
import torch.nn as nn
from transformers import GPT2Tokenizer, GPT2Model
class GPT2ClassificationModel(nn.Module):
  """Pretrained GPT-2 backbone followed by a linear multi-label classification head.

  Args:
    num_labels: number of logits produced per sequence (one per label). Defaults to 6.
  """

  def __init__(self,num_labels = 6):
    super().__init__()
    self.gpt2 = GPT2Model.from_pretrained('gpt2')
    config = self.gpt2.config
    self.classifier = nn.Linear(config.hidden_size, num_labels, bias=True)

  def forward(self,input_ids,attention_mask):
    """Return the raw (pre-sigmoid) logits, one row per sequence.

    The sequence representation is the hidden state of the last *attended* token
    (the highest position whose ``attention_mask`` entry is non-zero). Taking the
    hidden state at position ``-1`` would read a padding position for every
    sequence that is shorter than the padded length.

    Args:
      input_ids: integer tensor indexed as (batch, sequence).
      attention_mask: tensor of the same shape, non-zero at real tokens and 0 at padding.

    Returns:
      Tensor with one row of ``num_labels`` logits per input sequence. A row whose
      mask is entirely zero falls back to position 0.
    """
    hidden = self.gpt2(input_ids, attention_mask=attention_mask).last_hidden_state

    # position * mask is largest at the last attended token; padding contributes 0.
    positions = torch.arange(hidden.size(1), device=hidden.device)
    last_token_idx = (positions * attention_mask.to(positions.dtype)).argmax(dim=1)

    # Pick one hidden vector per batch row.
    batch_idx = torch.arange(hidden.size(0), device=hidden.device)
    pooled = hidden[batch_idx, last_token_idx]
    return self.classifier(pooled)

In [7]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Kaggle Toxic Comment Classification - gpt2 版本 多标签文本分类
"""

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
from transformers import GPT2Tokenizer, GPT2Model
from collections import Counter
import re
import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Print the library versions next to the recorded outputs.
print(f"PyTorch版本: {torch.__version__}")
print(f"NumPy: {np.__version__}")

# Device selection: CUDA when available, otherwise CPU. `device` is read by the training code below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'🔥 使用设备: {device}')

# 工具类
class Accumulator:
    """Running totals for a fixed number of quantities.

    ``data`` holds one float per tracked quantity. ``add`` pairs the stored totals
    with the given values position by position (``zip`` semantics, so anything
    beyond the shorter of the two is dropped).
    """

    def __init__(self, n):
        # One zeroed slot per tracked quantity.
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional value, converted to float, to the slot with the same index."""
        totals = []
        for current, increment in zip(self.data, args):
            totals.append(current + float(increment))
        self.data = totals

    def reset(self):
        """Set every slot back to 0.0 while keeping the current number of slots."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the running total stored at ``idx``."""
        return self.data[idx]

class Timer:
    """Wall-clock stopwatch that starts when the instance is created."""

    def __init__(self):
        # The time module is imported locally and kept on the instance for stop().
        import time as time_module
        self.time = time_module
        self.start_time = time_module.time()

    def stop(self):
        """Return the seconds elapsed since construction; the start time is not reset."""
        now = self.time.time()
        return now - self.start_time

def try_all_gpus():
    """Return a single ``torch.device``: ``cuda`` when CUDA is available, else ``cpu``.

    Despite the plural name, exactly one device object is returned, not a list.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')



class TextPreprocessor:
    """Text cleaner plus word-level vocabulary that maps text to fixed-length id lists.

    Index 0 is reserved for ``<PAD>`` and index 1 for ``<UNK>``.

    NOTE(review): in this notebook the ids produced here are passed to
    ``GPT2Model.from_pretrained('gpt2')``. They come from this class's own
    vocabulary, not from ``GPT2Tokenizer`` — confirm that this is intended.
    """

    def __init__(self, max_vocab_size=50000, max_seq_length=128, min_freq=2):
        # Limits applied when building the vocabulary and encoding text.
        self.max_vocab_size = max_vocab_size
        self.max_seq_length = max_seq_length
        self.min_freq = min_freq
        # Populated by build_vocab().
        self.vocab = None
        self.word_to_idx = None
        self.idx_to_word = None

    def clean_text(self, text):
        """Lower-case ``text``, blank out unsupported characters and collapse whitespace.

        Missing values (as detected by ``pd.isna``) become the empty string.
        """
        if pd.isna(text):
            return ""

        lowered = str(text).lower()
        # Everything except letters, digits, whitespace and . , ! ? ; : becomes a space.
        filtered = re.sub(r'[^a-zA-Z0-9\s\.\,\!\?\;\:]', ' ', lowered)
        # split()/join collapses runs of whitespace and trims both ends.
        return ' '.join(filtered.split())

    def build_vocab(self, texts):
        """Count whitespace-separated tokens in ``texts`` and build the vocabulary.

        At most ``max_vocab_size - 2`` most frequent tokens are considered and, of
        those, only tokens seen at least ``min_freq`` times are kept. Returns the
        vocabulary list, which starts with ``<PAD>`` and ``<UNK>``.
        """
        print("📝 构建词汇表...")

        counts = Counter()
        for raw_text in tqdm(texts, desc="处理文本"):
            counts.update(self.clean_text(raw_text).split())

        kept_tokens = []
        for token, freq in counts.most_common(self.max_vocab_size-2):
            if freq >= self.min_freq:
                kept_tokens.append(token)

        # Special tokens come first so that PAD is 0 and UNK is 1.
        self.vocab = ['<PAD>', '<UNK>', *kept_tokens]
        self.word_to_idx = dict(zip(self.vocab, range(len(self.vocab))))
        self.idx_to_word = {position: token for token, position in self.word_to_idx.items()}

        print(f"词汇表大小: {len(self.vocab)}")
        return self.vocab

    def text_to_sequence(self, text):
        """Encode ``text`` as a list of ``max_seq_length`` vocabulary indices.

        Tokens beyond ``max_seq_length`` are dropped, unknown tokens map to 1
        (``<UNK>``) and short inputs are right-padded with 0 (``<PAD>``).
        """
        tokens = self.clean_text(text).split()[:self.max_seq_length]
        lookup = self.word_to_idx
        encoded = [lookup.get(token, 1) for token in tokens]  # 1 is the <UNK> index

        missing = self.max_seq_length - len(encoded)
        if missing > 0:
            encoded += [0] * missing

        return encoded[:self.max_seq_length]

class ToxicCommentDataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, labels)`` tensors for one comment.

    Args:
        texts: indexable collection of raw comment strings.
        labels: indexable collection with one label vector per text, or ``None``
            for unlabeled data; all-zero placeholder rows are created in that case.
        preprocessor: object exposing ``text_to_sequence(text)`` that returns a list
            of token ids in which 0 marks padding.
        num_labels: width of the placeholder rows used when ``labels`` is ``None``.
            Defaults to 6, the value that used to be hard-coded.
    """

    def __init__(self, texts, labels, preprocessor, num_labels=6):
        self.texts = texts
        if labels is None:
            # Build an independent row per sample; `[[0] * k] * n` would alias one
            # shared list n times, so mutating a row would change all of them.
            labels = [[0] * num_labels for _ in range(len(texts))]
        self.labels = labels
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Encode the text at ``idx`` and return (ids, mask, labels) as tensors."""
        sequence = self.preprocessor.text_to_sequence(self.texts[idx])
        input_ids = torch.tensor(sequence, dtype=torch.long)

        # Attention mask: 1 at every non-padding (non-zero) id, 0 elsewhere.
        attention_mask = (input_ids != 0).long()

        return (
            input_ids,
            attention_mask,
            torch.tensor(self.labels[idx], dtype=torch.float)
        )

def multilabel_accuracy(y_hat, y):
    """Label-wise accuracy for multi-label logits.

    A label counts as predicted when ``sigmoid(logit) > 0.5``. The result is the
    fraction of all (sample, label) entries whose prediction equals ``y`` cast to
    bool, returned as a Python float.
    """
    probabilities = torch.sigmoid(y_hat)
    predicted = probabilities > 0.5
    matches = predicted == y.bool()
    return matches.float().mean().item()

def train_gpt2_epoch(net, train_iter, loss, updater, device, scheduler=None):
    """Train ``net`` for one pass over ``train_iter``.

    Mixed precision (autocast + gradient scaling) is used when ``device.type`` is
    ``'cuda'``; on any other device the same code path runs in full precision because
    both ``torch.autocast`` and ``GradScaler`` are created with ``enabled=False``.
    Gradients are clipped to a global norm of 1.0 before every optimizer step.

    Args:
        net: model called as ``net(input_ids, attention_mask)``.
        train_iter: iterable of ``(input_ids, attention_mask, labels)`` batches.
        loss: loss callable; every element of its output is summed before backward.
        updater: optimizer.
        device: ``torch.device`` the batches are moved to.
        scheduler: optional LR scheduler stepped once per batch.

    Returns:
        ``(loss_per_sample, accuracy)``: the summed loss divided by the number of
        samples, and the sample-weighted mean of ``multilabel_accuracy``.
    """
    net.train()
    metric = Accumulator(3)  # summed loss, accuracy * batch size, number of samples

    # torch.cuda.amp.autocast / torch.cuda.amp.GradScaler are deprecated in favour of
    # the device-agnostic torch.autocast / torch.amp.GradScaler used here.
    use_amp = device.type == 'cuda'
    scaler = torch.amp.GradScaler('cuda', enabled=use_amp)

    for input_ids, attention_mask, labels in train_iter:
        input_ids = input_ids.to(device, non_blocking=True)
        attention_mask = attention_mask.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        # Forward pass (mixed precision only when enabled).
        with torch.autocast(device_type=device.type, enabled=use_amp):
            y_hat = net(input_ids, attention_mask)
            l = loss(y_hat, labels)

        updater.zero_grad()

        # With a disabled scaler, scale()/unscale_()/update() are no-ops and
        # step() simply calls updater.step().
        scaler.scale(l.sum()).backward()
        # Unscale first so that clipping sees the true gradient magnitudes.
        scaler.unscale_(updater)
        torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=1.0)
        scaler.step(updater)
        scaler.update()

        # Per-batch scheduler step (required by OneCycleLR).
        if scheduler is not None:
            scheduler.step()

        with torch.no_grad():
            acc = multilabel_accuracy(y_hat, labels)
            metric.add(l.sum(), acc * labels.shape[0], labels.shape[0])

    return metric[0] / metric[2], metric[1] / metric[2]

def evaluate_gpt2_accuracy(net, data_iter, device):
    """Return the sample-weighted mean ``multilabel_accuracy`` of ``net`` on ``data_iter``.

    The model is switched to eval mode and gradients are disabled. Autocast is only
    enabled when ``device.type`` is ``'cuda'``.

    Args:
        net: model called as ``net(input_ids, attention_mask)``.
        data_iter: iterable of ``(input_ids, attention_mask, labels)`` batches.
        device: ``torch.device`` the batches are moved to.
    """
    net.eval()
    metric = Accumulator(2)  # accuracy * batch size, number of samples

    # torch.cuda.amp.autocast is deprecated; torch.autocast with enabled=False is a
    # no-op, so a single code path serves both CPU and GPU.
    use_amp = device.type == 'cuda'
    with torch.no_grad():
        for input_ids, attention_mask, labels in data_iter:
            input_ids = input_ids.to(device, non_blocking=True)
            attention_mask = attention_mask.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with torch.autocast(device_type=device.type, enabled=use_amp):
                y_hat = net(input_ids, attention_mask)

            acc = multilabel_accuracy(y_hat, labels)
            metric.add(acc * labels.shape[0], labels.shape[0])
    return metric[0] / metric[1]

def train_gpt2_model(net, train_iter, test_iter, loss, trainer, num_epochs, devices, scheduler=None):
    """Run the full training loop and print per-epoch metrics.

    Args:
        net: model to train.
        train_iter: training batches, passed to ``train_gpt2_epoch``.
        test_iter: evaluation batches, passed to ``evaluate_gpt2_accuracy``.
        loss: loss callable.
        trainer: optimizer; its first param group's learning rate is printed each epoch.
        num_epochs: number of passes over ``train_iter``.
        devices: a single device, or a list of devices. With more than one entry the
            model is wrapped in ``nn.DataParallel``; the first entry is the main device.
        scheduler: optional LR scheduler forwarded to ``train_gpt2_epoch``.
    """
    print('training on', devices)

    if isinstance(devices, list) and len(devices) > 1:
        # Multi-GPU
        net = nn.DataParallel(net, device_ids=devices)

    device = devices[0] if isinstance(devices, list) else devices
    net = net.to(device)

    timer = Timer()

    # Defined up front so that the final summary does not raise UnboundLocalError
    # when num_epochs is 0 and the loop body never runs.
    train_acc = test_acc = float('nan')

    for epoch in range(num_epochs):
        # Train for one epoch, forwarding the scheduler.
        train_loss, train_acc = train_gpt2_epoch(
            net, train_iter, loss, trainer, device, scheduler
        )

        # Evaluate on the held-out iterator.
        test_acc = evaluate_gpt2_accuracy(net, test_iter, device)

        print(f'Epoch {epoch + 1}: '
              f'loss {train_loss:.3f}, '
              f'train acc {train_acc:.3f}, '
              f'test acc {test_acc:.3f}, '
              f'lr {trainer.param_groups[0]["lr"]:.6f}')

    print(f'Training completed in {timer.stop():.1f} sec')
    print(f'Final: train acc {train_acc:.3f}, test acc {test_acc:.3f}')

def read_toxic_comments_real(data_dir, max_samples=None, is_train=True):
    """Read the Kaggle Toxic Comment Classification CSV for one split.

    Args:
        data_dir: directory containing ``train.csv`` and ``test.csv``.
        max_samples: keep only the first ``max_samples`` rows; ``None`` keeps all rows.
            ``0`` now yields an empty result instead of being treated as "no limit".
        is_train: read ``train.csv`` (with labels) when true, ``test.csv`` otherwise.

    Returns:
        ``(texts, labels, ids)``. ``labels`` is a list with one 6-element list per row
        for the training split and ``None`` for the test split.
    """
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    # The two splits differ only in file name, log wording and whether labels exist.
    file_name, split_name = ('train.csv', '训练') if is_train else ('test.csv', '测试')
    csv_path = os.path.join(data_dir, file_name)
    print(f"读取{split_name}数据: {csv_path}")

    df = pd.read_csv(csv_path)
    # Explicit None check: a plain truthiness test would ignore max_samples=0.
    if max_samples is not None:
        df = df.head(max_samples)

    texts = df['comment_text'].tolist()
    ids = df['id'].tolist()
    print(f"加载{split_name}数据: {len(texts)} 条")

    if not is_train:
        return texts, None, ids

    labels = df[label_columns].values.tolist()
    print(f"标签分布: {dict(zip(label_columns, df[label_columns].sum().tolist()))}")
    return texts, labels, ids

def generate_submission(model, test_loader, device, test_ids, output_path):
    """Predict label probabilities for the test set and write a Kaggle submission CSV.

    Args:
        model: model called as ``model(input_ids, attention_mask)`` returning logits.
        test_loader: iterable of ``(input_ids, attention_mask, labels)`` batches; the
            labels are ignored.
        device: ``torch.device`` the batches are moved to.
        test_ids: ids written to the ``id`` column, one per predicted row.
        output_path: destination path of the CSV file.

    Returns:
        The submission ``DataFrame`` (``id`` plus one float32 probability column per label).
    """
    model.eval()
    label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    batch_probs = []

    print("🔮 生成预测结果...")
    use_amp = device.type == 'cuda'
    with torch.no_grad():
        for input_ids, attention_mask, _ in test_loader:
            input_ids = input_ids.to(device, non_blocking=True)
            attention_mask = attention_mask.to(device, non_blocking=True)

            # Mixed-precision inference on CUDA only; enabled=False makes this a no-op.
            with torch.autocast(device_type=device.type, enabled=use_amp):
                logits = model(input_ids, attention_mask)

            # Cast to float32 before the sigmoid: half-precision logits would otherwise
            # end up as float16 probabilities in the CSV and in the statistics below.
            batch_probs.append(torch.sigmoid(logits.float()).cpu().numpy())

    # One (rows, labels) array instead of a Python list of per-row arrays.
    if batch_probs:
        predictions = np.concatenate(batch_probs, axis=0)
    else:
        predictions = np.empty((0, len(label_columns)), dtype=np.float32)

    # Build the submission DataFrame.
    submission_df = pd.DataFrame({
        'id': test_ids,
        **{col: predictions[:, i] for i, col in enumerate(label_columns)}
    })

    # Save the submission file.
    submission_df.to_csv(output_path, index=False)
    print(f"💾 提交文件已保存: {output_path}")
    print(f"📊 预测统计:")
    # Skip the averages when nothing was predicted instead of dividing by zero.
    if len(predictions):
        column_means = predictions.mean(axis=0, dtype=np.float64)
        for col, avg_prob in zip(label_columns, column_means):
            print(f"  {col}: 平均概率 {avg_prob:.4f}")

    return submission_df

# ============ Main script ============
print("🚀 启动双向GPT2多标签分类训练")

# Data directory; prepare_csv_list() (defined in the first cell) downloads missing files.
data_dir = 'toxic-comment'
prepare_csv_list()

# Load the data.
print("📊 加载真实Kaggle数据...")

# max_samples=None loads every row; pass a number here to limit the rows for quick experiments.
train_texts, train_labels, train_ids = read_toxic_comments_real(
    data_dir, max_samples=None, is_train=True
)

# Hold out the last 20% of the training rows as the validation set (no shuffling).
# NOTE(review): train_ids is not sliced here and still covers all rows; it is not used below.
val_split = int(len(train_texts) * 0.8)
val_texts = train_texts[val_split:]
val_labels = train_labels[val_split:]
train_texts = train_texts[:val_split]
train_labels = train_labels[:val_split]

# Load the test split (no labels).
test_texts, _, test_ids = read_toxic_comments_real(
    data_dir, max_samples=None, is_train=False
)

print(f"\n📊 数据统计:")
print(f"训练数据: {len(train_texts)} 条")
print(f"验证数据: {len(val_texts)} 条")
print(f"测试数据: {len(test_texts)} 条")

# Sanity check: show the first training example.
print(f"\n📝 数据样例:")
print(f"文本长度: {len(train_texts[0])}")
print(f"前100字符: {train_texts[0][:100]}")
print(f"标签: {train_labels[0]}")

# Create the text preprocessor and build its vocabulary from the training texts only.
# The limits below are reduced from the class defaults to speed things up.
preprocessor = TextPreprocessor(
    max_vocab_size=20000,  # reduced from the default 50000
    max_seq_length=64,     # reduced from the default 128
    min_freq=3             # stricter than the default minimum frequency of 2
)
preprocessor.build_vocab(train_texts)

# Smoke test: encode the first training text and report how many positions are not padding.
print(f"\n🔧 预处理器测试:")
sample_sequence = preprocessor.text_to_sequence(train_texts[0])
print(f"序列长度: {len(sample_sequence)}")
print(f"非零token数: {sum(1 for x in sample_sequence if x != 0)}")

# Model and training hyper-parameters
num_classes = 6  # 6 labels: toxic, severe_toxic, obscene, threat, insult, identity_hate
dropout = 0.3    # NOTE(review): not consumed by any code visible in this notebook
# Use a larger batch when a GPU is available.
batch_size = 32 if torch.cuda.is_available() else 16
# Sequence length actually produced by the preprocessor (previously a stale literal 128).
num_steps = preprocessor.max_seq_length
# Base learning rate. The scheduler below peaks at 5x this value; the previous 2e-3
# (peak 1e-2) is orders of magnitude above the usual range for fine-tuning a pretrained
# transformer and is likely to diverge. NOTE(review): tune as needed.
lr = 2e-5
num_epochs = 3   # number of training epochs

# Build the GPT-2 classifier; the head size follows num_classes instead of the class default.
net = GPT2ClassificationModel(num_labels=num_classes)

print(f"模型参数数量: {sum(p.numel() for p in net.parameters()):,}")

# Build the datasets. The test split has no labels, so None is passed and the dataset
# substitutes all-zero placeholder labels.
train_dataset = ToxicCommentDataset(train_texts, train_labels, preprocessor)
val_dataset = ToxicCommentDataset(val_texts, val_labels, preprocessor)
test_dataset = ToxicCommentDataset(test_texts, None, preprocessor)

# Only the training loader shuffles; validation and test keep the original row order
# (the test order has to line up with test_ids when the submission is written).
# NOTE(review): worker counts are hard-coded here rather than taken from NUM_WORKERS.
train_iter = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                                          num_workers=4, pin_memory=True, persistent_workers=True)
val_iter = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                                       num_workers=2, pin_memory=True, persistent_workers=True)
test_iter = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                                        num_workers=2, pin_memory=True, persistent_workers=True)

# Optimizer. AdamW applies decoupled weight decay; Adam's `weight_decay` argument is an
# L2 penalty folded into the gradient (and therefore rescaled by the adaptive step),
# which is not what the 0.01 setting is meant to do.
trainer = optim.AdamW(net.parameters(), lr=lr, weight_decay=0.01)

# One-cycle learning-rate schedule, stepped once per batch by the training loop.
# OneCycleLR sets the learning rate itself: it peaks at max_lr after the warm-up phase.
scheduler = optim.lr_scheduler.OneCycleLR(
    trainer,
    max_lr=lr * 5,  # peak learning rate
    steps_per_epoch=len(train_iter),
    epochs=num_epochs,
    pct_start=0.3  # first 30% of the steps are warm-up
)

# Multi-label loss: one independent sigmoid/BCE term per sample and label. With
# reduction="none" the training loop sums the elements itself.
loss = nn.BCEWithLogitsLoss(reduction="none")

# Training
print(f"🔥 使用设备: {device}")

# Run the training loop; the validation loader is passed as the evaluation iterator and
# the scheduler is stepped per batch inside train_gpt2_epoch.
train_gpt2_model(net, train_iter, val_iter, loss, trainer, num_epochs, device, scheduler)

# Final summary of the run.
print("\n" + "="*60)
print("🎉 Huggingface GPT2训练完成!")
print("="*60)
print(f"✅ 训练样本: {len(train_texts)}")
print(f"✅ 验证样本: {len(val_texts)}")
print(f"✅ 测试样本: {len(test_texts)}")
print("✅ 支持多标签分类")
print("✅ 词汇表大小: {:,}".format(len(preprocessor.vocab)))
print("="*60)

PyTorch版本: 2.8.0+cu126
NumPy: 2.0.2
🔥 使用设备: cpu
🚀 启动双向GPT2多标签分类训练
✅ Dataset already exists.
✅ Dataset already exists.
✅ Dataset already exists.
✅ Dataset already exists.
📊 加载真实Kaggle数据...
读取训练数据: toxic-comment/train.csv
加载训练数据: 159571 条
标签分布: {'toxic': 15294, 'severe_toxic': 1595, 'obscene': 8449, 'threat': 478, 'insult': 7877, 'identity_hate': 1405}
读取测试数据: toxic-comment/test.csv
加载测试数据: 153164 条

📊 数据统计:
训练数据: 127656 条
验证数据: 31915 条
测试数据: 153164 条

📝 数据样例:
文本长度: 264
前100字符: Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't 
标签: [0, 0, 0, 0, 0, 0]
📝 构建词汇表...


处理文本: 100%|██████████| 127656/127656 [00:04<00:00, 28647.83it/s]


词汇表大小: 20000

🔧 预处理器测试:
序列长度: 64
非零token数: 46


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

模型参数数量: 124,444,422
🔥 使用设备: cpu
training on cpu


KeyboardInterrupt: 

In [None]:
# Generate the submission file next to the downloaded data.
# Depends on net, test_iter, device, test_ids and data_dir from the training cell above.
# NOTE(review): the recorded output of that cell ends in KeyboardInterrupt, so `net` may
# be only partially trained when this cell runs — confirm before submitting.
submission_path = os.path.join(data_dir, 'submission.csv')
submission_df = generate_submission(net, test_iter, device, test_ids, submission_path)

print("\n🎉 训练和预测完成!")
print(f"✅ 提交文件: {submission_path}")