使用翻譯後的英文文章 fine-tune BERT, RoBERTa,和 POLITICS

# 01. 環境安裝（Kaggle GPU, CUDA 12.1 相容）

In [1]:
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers datasets scikit-learn accelerate evaluate torchmetrics
!pip install -q sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m780.5/780.5 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.6/121.6 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.5/56.5 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━

# 02. 套件匯入與全域參數

In [2]:
import os, json, random, math, time
import numpy as np
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, field
from typing import List, Dict, Optional

import torch, torch.nn as nn
from torch.utils.data import Dataset
from torch.optim import AdamW
from torch.nn.utils.rnn import pad_sequence
from torchmetrics.classification import MulticlassF1Score, MulticlassAccuracy, MulticlassPrecision, MulticlassRecall

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding,
    logging as hf_logging
)
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import confusion_matrix, roc_auc_score

from accelerate import Accelerator, DistributedDataParallelKwargs
hf_logging.set_verbosity_error()

SEED = 42
torch.manual_seed(SEED)
random.seed(SEED)
np.random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on {device}")
OUTPUT_DIR = Path("/kaggle/working")
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

2025-05-31 05:09:13.771470: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748668153.952048      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748668154.007630      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Running on cuda


# 03. 公用函式

In [3]:
def set_seed(seed: int = 42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

def metric_dict(num_classes: int = 3):
    return {
        "acc": MulticlassAccuracy(num_classes=num_classes, average="micro").to(device),
        "f1_macro": MulticlassF1Score(num_classes=num_classes, average="macro").to(device),
        "f1_weighted": MulticlassF1Score(num_classes=num_classes, average="weighted").to(device),
        "prec_macro": MulticlassPrecision(num_classes=num_classes, average="macro").to(device),
        "recall_macro": MulticlassRecall(num_classes=num_classes, average="macro").to(device),
    }

def update_metrics(metrics, preds, labels):
    for m in metrics.values():
        m.update(preds, labels)

def compute_metrics(metrics):
    return {k: float(m.compute().cpu()) for k, m in metrics.items()}

def reset_metrics(metrics):
    for m in metrics.values():
        m.reset()

# 04. Dataset 與 Batch 組裝

In [4]:
class NewsDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256, use_content=False):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.use_content = use_content

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        if self.use_content:
            text = f"{row['title_en']} [SEP] {row['content_en']}"
        else:
            text = row["title_en"]
        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_len,
            add_special_tokens=True,
        )
        item = {k: torch.tensor(v) for k, v in encoding.items()}
        item["labels"] = torch.tensor(row["label_encoded"], dtype=torch.long)
        return item

# 05. 模型定義：BERT 

In [5]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification

class BertClassifier(nn.Module):
    def __init__(self, model_name: str, num_labels: int = 3, dropout: float = 0.1):
        """
        A plain BERT-based classifier *without* triplet loss.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,   # 仍然保留，方便處理各種 BERT family
        labels: torch.Tensor = None,
        **kwargs,              # 其餘欄位丟進來也 OK，不會報錯
    ):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        pooled = outputs.last_hidden_state[:, 0]          # CLS
        logits = self.classifier(self.dropout(pooled))

        loss = None
        if labels is not None:
            loss = nn.functional.cross_entropy(logits, labels)

        return {"logits": logits, "loss": loss}

# 06. 訓練／驗證迴圈（單 fold、單設定）

In [6]:
def train_one_fold(
    fold_id: int,
    train_df: pd.DataFrame,
    val_df: pd.DataFrame,
    cfg: Dict,
    class_weights: np.ndarray,
):
    set_seed(SEED + fold_id)
    tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
    train_ds = NewsDataset(train_df, tokenizer, max_len=cfg["max_len"], use_content=cfg["use_content"])
    val_ds   = NewsDataset(val_df, tokenizer, max_len=cfg["max_len"], use_content=cfg["use_content"])

    collator = DataCollatorWithPadding(tokenizer, return_tensors="pt")
    train_loader = torch.utils.data.DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True, collate_fn=collator)
    val_loader   = torch.utils.data.DataLoader(val_ds,   batch_size=cfg["batch_size"], shuffle=False, collate_fn=collator)

    model = AutoModelForSequenceClassification.from_pretrained(
        cfg["model_name"],
        num_labels=3
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
    total_steps = cfg["epochs"] * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(cfg["warmup_ratio"] * total_steps),
        num_training_steps=total_steps,
    )

    class_w = torch.tensor(class_weights, dtype=torch.float).to(device)
    loss_fn = nn.CrossEntropyLoss(weight=class_w)

    accelerator = Accelerator(
        gradient_accumulation_steps=1,
        kwargs_handlers=[DistributedDataParallelKwargs(find_unused_parameters=True)],
    )
    model, optimizer, train_loader, val_loader, scheduler = accelerator.prepare(
        model, optimizer, train_loader, val_loader, scheduler
    )

    best_f1 = -1
    best_state = None
    patience_cnt = 0
    metrics_obj = metric_dict()
    for epoch in range(cfg["epochs"]):
        # --- training ---
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(**batch)
            loss = outputs["loss"]
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()

        # --- validation ---
        model.eval()
        reset_metrics(metrics_obj)
        val_losses = []
        for batch in val_loader:
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs["logits"]
            val_losses.append(outputs["loss"].item())
            preds = torch.argmax(logits, dim=-1)
            update_metrics(metrics_obj, preds, batch["labels"])

        val_scores = compute_metrics(metrics_obj)
        val_scores["loss"] = float(np.mean(val_losses))
        if val_scores["f1_macro"] > best_f1:
            best_f1 = val_scores["f1_macro"]
            best_state = accelerator.get_state_dict(model)
            patience_cnt = 0
        else:
            patience_cnt += 1
            if patience_cnt >= cfg["patience"]:
                break  # early stop

    # 儲存 best checkpoint
    ckpt_path = OUTPUT_DIR / f"fold{fold_id}_{cfg['name']}.pt"
    torch.save(best_state, ckpt_path)
    return best_f1, val_scores

# 07. 主要流程：讀檔、設定 4 組試驗、5-fold CV


In [7]:
DATA_PATH = '/kaggle/input/taiwan-political-news-dataset/news_training_with_translations.csv'
df = pd.read_csv(DATA_PATH)
print(f"CSV loaded: {df.shape}")

# --- 配置列表 ---
cfg_list = []
model_names = {
    "bert-base":   "google-bert/bert-base-uncased",
    "roberta-base":"roberta-base",
    "POLITICS":"launch/POLITICS",
}
for mkey, mname in model_names.items():
    for use_content in [False, True]:
        tag = f"{mkey}_{'title+content' if use_content else 'title'}"
        cfg_list.append(
            dict(
                name=tag,
                model_name=mname,
                use_content=use_content,
                max_len=512 if use_content else 128,
                batch_size=16,
                epochs=5,
                lr=2e-5,
                weight_decay=0.01,
                warmup_ratio=0.1,
                dropout=0.1,
                patience=2,
            )
        )
results = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for fold_id, (train_idx, val_idx) in enumerate(skf.split(df, df["label_encoded"])):
    train_df, val_df = df.iloc[train_idx], df.iloc[val_idx]
    class_w = compute_class_weight("balanced", classes=[0,1,2], y=train_df["label_encoded"])
    for cfg in cfg_list:
        start = time.time()
        best_f1, val_scores = train_one_fold(fold_id, train_df, val_df, cfg, class_w)
        duration = time.time() - start
        record = {
            "fold": fold_id,
            "exp": cfg["name"],
            "best_macro_f1": best_f1,
            "time_sec": round(duration, 1),
            **val_scores,
        }
        results.append(record)
        print(f"[Fold {fold_id}] {cfg['name']}: F1={best_f1:.4f} ({duration/60:.1f} min)")

# 保存所有結果
res_df = pd.DataFrame(results)
res_path = OUTPUT_DIR / "results.csv"
res_df.to_csv(res_path, index=False)
print(f"\nAll results saved to {res_path}")

CSV loaded: (3166, 8)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[Fold 0] bert-base_title: F1=0.6447 (1.4 min)
[Fold 0] bert-base_title+content: F1=0.7343 (12.9 min)


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

[Fold 0] roberta-base_title: F1=0.6470 (1.6 min)
[Fold 0] roberta-base_title+content: F1=0.7727 (12.9 min)


tokenizer_config.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/672 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

[Fold 0] POLITICS_title: F1=0.6585 (1.6 min)
[Fold 0] POLITICS_title+content: F1=0.7689 (12.9 min)
[Fold 1] bert-base_title: F1=0.6559 (1.4 min)
[Fold 1] bert-base_title+content: F1=0.7644 (12.9 min)
[Fold 1] roberta-base_title: F1=0.6586 (1.5 min)
[Fold 1] roberta-base_title+content: F1=0.7946 (12.9 min)
[Fold 1] POLITICS_title: F1=0.6380 (1.5 min)
[Fold 1] POLITICS_title+content: F1=0.7770 (13.0 min)
[Fold 2] bert-base_title: F1=0.6529 (1.4 min)
[Fold 2] bert-base_title+content: F1=0.7244 (13.0 min)
[Fold 2] roberta-base_title: F1=0.6451 (1.5 min)
[Fold 2] roberta-base_title+content: F1=0.7643 (12.9 min)
[Fold 2] POLITICS_title: F1=0.6360 (1.5 min)
[Fold 2] POLITICS_title+content: F1=0.7779 (13.0 min)
[Fold 3] bert-base_title: F1=0.6352 (1.4 min)
[Fold 3] bert-base_title+content: F1=0.7039 (13.0 min)
[Fold 3] roberta-base_title: F1=0.6684 (1.5 min)
[Fold 3] roberta-base_title+content: F1=0.7391 (13.0 min)
[Fold 3] POLITICS_title: F1=0.6516 (1.6 min)
[Fold 3] POLITICS_title+content: F

# 08. 產生最佳模型摘要

In [8]:
best_row = res_df.sort_values("best_macro_f1", ascending=False).iloc[0]
summary = {
    "best_exp": best_row["exp"],
    "best_macro_f1": best_row["best_macro_f1"],
    "fold": int(best_row["fold"]),
}
with open(OUTPUT_DIR / "best_summary.md", "w") as f:
    f.write(f"# Best Experiment\n\n")
    f.write(json.dumps(summary, indent=2, ensure_ascii=False))
print("=== BEST MODEL SUMMARY ===")
print(summary)

=== BEST MODEL SUMMARY ===
{'best_exp': 'roberta-base_title+content', 'best_macro_f1': 0.7945559024810791, 'fold': 1}
