In [None]:
# =============================
#   0. 安装依赖
# =============================
# https://www.kaggle.com/competitions/nanogpt-fudannlp-cs-30040
!pip install --quiet transformers rouge-score sentencepiece

In [None]:
# =============================
#   1. 导入库
# =============================
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.optim import AdamW
from tqdm import tqdm
from rouge_score import rouge_scorer
import os, random


# Seed the Python and PyTorch RNGs so that shuffling and any other
# torch/random-driven randomness repeat across "Restart & Run All".
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"使用设备: {device}")

In [None]:
# =============================
#   2. Configuration and data loading
# =============================

# --- Run switches ---
run_upload = True   # push the selected checkpoint to Hugging Face
run_train = False   # run the fine-tuning loop
run_infer = False   # generate summaries for the test set

# --- Optimisation hyper-parameters ---
EPOCHS = 1               # number of training epochs
lr = 1e-5                # learning rate
batch_size = 8           # kept small because of GPU memory limits
accumulation_steps = 4   # batches accumulated before each optimizer update

# --- Model locations ---
base_model = "google-t5/t5-base"
upload_model = "EthanCao/fudannlp-t5-base"  # Hugging Face model name
local_model = "/kaggle/working/out-summarization/t5"
os.makedirs(local_model, exist_ok=True)

# Checkpoint every later cell loads from: one of base_model / local_model / upload_model
run_model = local_model

# --- Competition data ---
train_path = "/kaggle/input/nanogpt-fudannlp-cs-30040/train.csv"
test_path = "/kaggle/input/nanogpt-fudannlp-cs-30040/test.csv"

df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Report the shortest, longest and mean length of the training summaries,
# counted in whitespace-separated words.
summary_lengths = df["summary"].map(lambda text: len(text.split()))
shortest, longest, average = (
    summary_lengths.min(),
    summary_lengths.max(),
    summary_lengths.mean(),
)
print(f"摘要最短长度: {shortest} 个词")
print(f"摘要最长长度: {longest} 个词")
print(f"摘要平均长度: {average:.2f} 个词")

In [None]:
# =============================
#   3. Train / validation split (90/10)
# =============================
val_ratio = 0.1
val_size = int(len(df) * val_ratio)
train_size = len(df) - val_size

# Use a dedicated, seeded generator so the same rows land in the validation
# set on every run. Without it, re-running the notebook (e.g. to resume from a
# saved checkpoint) reshuffles the split and leaks validation rows into training.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
shuffled_positions = torch.randperm(len(df), generator=split_generator).tolist()

# First `train_size` shuffled positions -> training rows, the rest -> validation rows.
train_df = df.iloc[shuffled_positions[:train_size]]
val_df = df.iloc[shuffled_positions[train_size:]]

print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(val_df)}")

In [None]:
# =============================
#   4. Define the Dataset
# =============================
# Tokenizer is loaded from the `base_model` checkpoint name.
tokenizer = T5Tokenizer.from_pretrained(base_model)


class SummarizationDataset(Dataset):
    """Map-style dataset turning (dialogue, summary) rows into T5 training tensors.

    Args:
        df: DataFrame with a "dialogue" and a "summary" column.
        tok: Tokenizer to use; defaults to the module-level `tokenizer`.
        max_input_len: Length every encoded dialogue is padded/truncated to.
        max_target_len: Length every encoded summary is padded/truncated to.
    """

    def __init__(self, df, tok=None, max_input_len=512, max_target_len=128):
        self.df = df
        self.tok = tok if tok is not None else tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return `input_ids`, `attention_mask` and `labels` for row `idx`."""
        row = self.df.iloc[idx]
        inp = "summarize: " + row["dialogue"]
        tgt = row["summary"]

        enc = self.tok(
            inp,
            truncation=True,
            padding="max_length",
            max_length=self.max_input_len,
            return_tensors="pt",
        )
        dec = self.tok(
            tgt,
            truncation=True,
            padding="max_length",
            max_length=self.max_target_len,
            return_tensors="pt",
        )

        # squeeze(0) drops only the batch axis added by return_tensors="pt".
        labels = dec["input_ids"].squeeze(0).clone()
        # Padding positions must be -100 so the cross-entropy loss ignores them;
        # leaving the pad id in place trains the model on padding.
        labels[labels == self.tok.pad_token_id] = -100

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "labels": labels,
        }


# Wrap each split in a Dataset, then batch it; only the training loader shuffles.
train_ds, val_ds = (SummarizationDataset(split) for split in (train_df, val_df))

loader_kwargs = {"batch_size": batch_size}
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, **loader_kwargs)

In [None]:
# =============================
#   5. Upload the model to Hugging Face
# =============================

if run_upload:
    print(f"Uploading model to Hugging Face: {run_model} -> {upload_model}")
    model = T5ForConditionalGeneration.from_pretrained(run_model).to(device)
    model.push_to_hub(upload_model)
    # Push the tokenizer files as well so the Hub repo can be loaded on its own,
    # without falling back to the base checkpoint for tokenization.
    tokenizer.push_to_hub(upload_model)

In [None]:
scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)


def print_sample_rouge(n_samples=10):
    """Print mean ROUGE-1/2/L F-measures on a random sample of validation rows.

    Uses the module-level `model`, `tokenizer`, `val_df`, `scorer` and `device`.

    Args:
        n_samples: Number of validation rows to score. Clamped to the size of
            `val_df` so a small validation set does not make `sample` raise.
    """
    n = min(n_samples, len(val_df))
    if n <= 0:
        print("[ROUGE] skipped: no validation rows to sample")
        return

    sample = val_df.sample(n)
    r1 = r2 = rL = 0

    for _, row in sample.iterrows():
        text = "summarize: " + row["dialogue"]
        ref = row["summary"]

        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
        pred_ids = model.generate(**inputs, max_length=128)
        pred = tokenizer.decode(pred_ids[0], skip_special_tokens=True)

        s = scorer.score(ref, pred)
        r1 += s["rouge1"].fmeasure
        r2 += s["rouge2"].fmeasure
        rL += s["rougeL"].fmeasure

    # Average over the rows actually scored; "Score" is the mean of the three metrics.
    print(f"[ROUGE] R1={r1/n:.4f}  R2={r2/n:.4f}  RL={rL/n:.4f}  Score={(r1+r2+rL)/(3*n):.4f}")

In [None]:
# =============================
#   6. Train the model
# =============================
def _mean_val_loss(net, loader):
    """Return the mean per-batch loss of `net` over `loader`, computed without gradients."""
    running = 0.0
    with torch.no_grad():
        for vb in loader:
            vb = {k: v.to(device) for k, v in vb.items()}
            running += net(**vb).loss.item()
    return running / len(loader)


if run_train:
    print(f"Training model from: {run_model}")
    model = T5ForConditionalGeneration.from_pretrained(run_model).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    best_val_loss = float("inf")
    log_interval = 100  # run validation every this many batches
    global_step = 0  # number of batches processed so far, across epochs

    for epoch in range(EPOCHS):
        print(f"===== Epoch {epoch+1} =====")
        model.train()

        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        num_batches = len(train_loader)

        optimizer.zero_grad()  # start every epoch with clean gradients

        for i, batch in enumerate(pbar, start=1):
            global_step += 1

            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            step_loss = loss.item()  # unscaled loss, used for display only

            pbar.set_postfix({"loss": f"{step_loss:.4f}", "step": global_step})

            # Scale the loss so the accumulated gradient is an average over the window
            (loss / accumulation_steps).backward()

            # Update once per full accumulation window. Also flush on the last batch
            # of the epoch, otherwise the gradients of a trailing partial window
            # would be thrown away by the next zero_grad().
            is_last_batch = i == num_batches
            if i % accumulation_steps == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()

            # Validate periodically and at the end of every epoch, so short runs
            # still produce a checkpoint and the final weights are evaluated.
            if global_step % log_interval == 0 or is_last_batch:
                model.eval()
                val_loss = _mean_val_loss(model, val_loader)

                pbar.set_postfix({"loss": f"{step_loss:.4f}", "val_loss": f"{val_loss:.4f}", "step": global_step})

                print_sample_rouge()

                # Keep only the checkpoint with the lowest validation loss
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    model.save_pretrained(local_model)
                    print(f"✨ Saved best model (val_loss = {best_val_loss:.4f})\n")

                model.train()

        pbar.close()

In [None]:
# =============================
#   7. Test-set inference
# =============================
if run_infer:
    print(f"Running inference with model: {run_model}")
    model = T5ForConditionalGeneration.from_pretrained(run_model).to(device)
    model.eval()

    # Decoding settings shared by every test example
    gen_kwargs = dict(
        max_length=100,  # cap the summary length
        min_length=10,  # avoid overly short summaries
        num_beams=6,  # beam search
        early_stopping=True,  # stop once enough finished beams exist
        no_repeat_ngram_size=3,  # forbid repeated 3-grams
        length_penalty=1.2,  # length penalty applied during beam search
        repetition_penalty=1.1,  # discourage repeated tokens
        do_sample=False,  # deterministic decoding
    )

    summaries = []

    for dialogue in tqdm(test_df["dialogue"], total=len(test_df)):
        encoded = tokenizer("summarize: " + dialogue, return_tensors="pt", truncation=True).to(device)
        generated = model.generate(**encoded, **gen_kwargs)
        summaries.append(tokenizer.decode(generated[0], skip_special_tokens=True))

    # Submission table: one generated summary per test id
    out = pd.DataFrame({"id": test_df["id"], "summary": summaries})

    out_path = f"{local_model}/submission.csv"
    out.to_csv(out_path, index=False)
    print(f"Saved submission to {out_path}")