## 安裝

In [1]:
import json

file_path = "T5_fine_tuning_data_augmentation_github.ipynb"  # 你的 notebook 路徑

with open(file_path, "r", encoding="utf-8") as f:
    notebook = json.load(f)

# 修正 widgets metadata
if "widgets" in notebook["metadata"]:
    if "state" not in notebook["metadata"]["widgets"]:
        notebook["metadata"]["widgets"]["state"] = {}

with open(file_path, "w", encoding="utf-8") as f:
    json.dump(notebook, f, indent=2)

In [None]:
!pip install transformers datasets evaluate scikit-learn 

In [None]:
!pip install peft rouge_score bert_score

In [None]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback
)
import random
import json
import gc
import torch
import os
import evaluate
import numpy as np

seed = 188
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [None]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
train_path = "/content/drive/MyDrive/data/train.json"
test_path = "/content/drive/MyDrive/data/test.json"

## T5 Full Fine Tuning with Data Augmentation

In [None]:
# === 載入 paraphrasing 模型 ===
paraphrase_model_id = "Vamsi/T5_Paraphrase_Paws"
paraphrase_tokenizer = AutoTokenizer.from_pretrained(paraphrase_model_id)
paraphrase_model = AutoModelForSeq2SeqLM.from_pretrained(paraphrase_model_id).to("cuda")

# === 載入原始資料 ===
with open(train_path, "r", encoding="utf-8") as f:
    full_data = [json.loads(line) for line in f]

dataset = Dataset.from_list(full_data)

# === 產生 paraphrased version ===
def paraphrase_batch(batch):
    inputs = ["paraphrase: " + text + " </s>" for text in batch["introduction"]]
    tokenized = paraphrase_tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to("cuda")
    outputs = paraphrase_model.generate(**tokenized, max_length=600, num_return_sequences=1, do_sample=True, top_k=120, top_p=0.95)
    decoded = paraphrase_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return {"augmented_introduction": decoded}

augmented_dataset = dataset.map(paraphrase_batch, batched=True, batch_size=8)

# === 隨機選取 50% paraphrased 資料，並保留原始 introduction ===
random.seed(42)
random_augmented_data = []
for original, paraphrased in zip(full_data, augmented_dataset["augmented_introduction"]):
    if random.random() < 0.5:
        new_example = original.copy()
        new_example["original_intro"] = original["introduction"]
        new_example["introduction"] = paraphrased
        random_augmented_data.append(new_example)

print(f"隨機選取的 paraphrased 資料數量：{len(random_augmented_data)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

隨機選取的 paraphrased 資料數量：202


In [None]:
# === BERTScore 過濾隨機選出的 paraphrased 結果 ===
from bert_score import score

metric_bertscore = evaluate.load("bertscore")
originals = [ex["original_intro"] for ex in random_augmented_data]
paraphrased = [ex["introduction"] for ex in random_augmented_data]

P, R, F1 = score(paraphrased, originals, lang="en", verbose=True)

# 設定門檻
threshold = 0.85
bert_filtered_data = []
for ex, f1_score in zip(random_augmented_data, F1):
    if f1_score.item() > threshold:
        ex.pop("original_intro")
        bert_filtered_data.append(ex)

print(f"BERTScore > {threshold} 的 paraphrased 資料數量：{len(bert_filtered_data)}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/7 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 6.16 seconds, 32.77 sentences/sec
BERTScore > 0.85 的 paraphrased 資料數量：197


In [None]:
full_data_augmented = full_data + bert_filtered_data
print(f"full_data_augmented的筆數：{len(full_data_augmented)}")

train_data, val_data = train_test_split(full_data_augmented, test_size=0.1, random_state=42)

dataset = DatasetDict({
    "train": Dataset.from_list(train_data),
    "validation": Dataset.from_list(val_data)
})

for i in range(3):
    print(f"\n[{i + 1}] paper_id: {dataset['train'][i]['paper_id']}")
    print("Introduction:", dataset['train'][i]['introduction'][:300], "...")
    print("Abstract:", dataset['train'][i]['abstract'])

full_data_augmented的筆數：605

[1] paper_id: 196
Introduction: Graph neural networks (GNNs), as generalizations of neural networks in analyzing graphs, have attracted considerable research attention.GNNs have been widely applied to various applications such as social recommendation (Ma et al., 2019), physical simulation (Kipf et al., 2018), and protein interact ...
Abstract: In the ever-evolving landscape of machine learning, Graph Neural Networks (GNNs) have emerged as powerful tools for understanding the intricate web of relationships within graphs. At the heart of their prowess lies a remarkable trait: the ability to generate node representations that are permutation-equivariant. This property, while advantageous for certain applications, presents a conundrum. It inadvertently hinders GNNs from grasping the nuances of proximity—those vital connections that reflect the closeness between nodes based on their paths through the graph.   While some GNN variants have attempted to address thi

In [None]:
print(f"train_data的筆數：{len(train_data)}")

train_data的筆數：544


In [None]:
# === 參數設定 ===
model_name = "google/flan-t5-base"

max_input_length = 2048
max_target_length = 600

In [None]:
# === 載入 tokenizer 與模型 ===
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.gradient_checkpointing_enable()

# === 預處理函式 ===
def preprocess(example):
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
    )
    inputs = prompt + example["introduction"]
    targets = example["abstract"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")["input_ids"]
        labels = [t if t != tokenizer.pad_token_id else -100 for t in labels]
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=False)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/544 [00:00<?, ? examples/s]



Map:   0%|          | 0/61 [00:00<?, ? examples/s]

In [None]:
print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"], skip_special_tokens=True))

You are a professional academic summarizer. Write a precise and objective abstract for the following research introduction. Do not include poetic or exaggerated language. Only describe the main objectives, methods, and key findings of the paper. If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. Do not add personal opinions or restate this prompt. Use a formal academic tone. Introduction: Graph neural networks (GNNs), as generalizations of neural networks in analyzing graphs, have attracted considerable research attention.GNNs have been widely applied to various applications such as social recommendation (Ma et al., 2019), physical simulation (Kipf et al., 2018), and protein interaction prediction (Zitnik & Leskovec, 2017).One key property of most existing GNNs is permutation-equivariance, i.e., if we randomly permutate the IDs of nodes while maintaining the graph structure, the representations of nodes in GNNs are permuta

In [None]:
from functools import reduce
from operator import mul

total_params = sum(p.numel() for p in model.parameters())
total_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)

print(f"🧠 Total parameters: {total_params:,}")
print(f"💾 Approx. model size: {total_size:.2f} MB")

🧠 Total parameters: 247,577,856
💾 Approx. model size: 944.43 MB


In [None]:
# === 訓練參數設定 ===
# === 訓練參數設定（已優化記憶體） ===
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_17",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2.5e-5,
    num_train_epochs=20,
    predict_with_generate=False,
    fp16=False,
    bf16=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    label_smoothing_factor = 0.1,
    warmup_steps=500,
    lr_scheduler_type="linear",
    greater_is_better=False,
    report_to="none",
    seed = seed
)


# === 建立 Data Collator（避免 loss 為 nan） ===
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding=True
)

# === 開始訓練 ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)
trainer.train()

  trainer = Seq2SeqTrainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,4.0796,3.892121
2,3.9577,3.711105
3,3.6272,3.622095
4,3.6034,3.563588
5,3.5402,3.523759
6,3.6807,3.490047
7,3.4561,3.46842
8,3.2884,3.447311
9,3.3974,3.432079
10,3.3798,3.418773


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=10880, training_loss=3.460408087513026, metrics={'train_runtime': 3504.3784, 'train_samples_per_second': 3.105, 'train_steps_per_second': 3.105, 'total_flos': 2.980063997853696e+16, 'train_loss': 3.460408087513026, 'epoch': 20.0})

In [None]:
# 儲存本地備份
trainer.save_model("t5_17_final")
tokenizer.save_pretrained("t5_17_final")

('t5_17_final/tokenizer_config.json',
 't5_17_final/special_tokens_map.json',
 't5_17_final/spiece.model',
 't5_17_final/added_tokens.json',
 't5_17_final/tokenizer.json')

In [None]:
# === 上傳模型到 Hugging Face Hub ===
from huggingface_hub import login
login("hf_xxxxxxx")

model.push_to_hub("xxx/xxxlora171")
tokenizer.push_to_hub("xxx/xxxlora171")

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NCCUTAT/T5_nolora171/commit/da7a245017e47419409692dece8a6b74bcec850e', commit_message='Upload tokenizer', commit_description='', oid='da7a245017e47419409692dece8a6b74bcec850e', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NCCUTAT/T5_nolora171', endpoint='https://huggingface.co', repo_type='model', repo_id='NCCUTAT/T5_nolora171'), pr_revision=None, pr_num=None)

In [None]:
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

### 驗證

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("xxx/xxxlora171")
tokenizer = AutoTokenizer.from_pretrained("xxx/xxxlora171")

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/t5_17_final")
tokenizer = AutoTokenizer.from_pretrained("/content/t5_17_final")

In [None]:
from functools import reduce
from operator import mul

total_params = sum(p.numel() for p in model.parameters())
total_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)

print(f"Total parameters: {total_params:,}")
print(f"Approx. model size: {total_size:.2f} MB")

Total parameters: 247,577,856
Approx. model size: 944.43 MB


In [None]:
# === 推理函式 ===
def generate_summary(text):
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
    )

    input_text = prompt + text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_length).to(model.device)
    outputs = model.generate(
    **inputs,
    max_new_tokens=max_target_length,
    min_length=200,
    num_beams=4,
    early_stopping=True,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
from tqdm import tqdm

print("\n=== 驗證集摘要預測（前 3 筆） ===\n")
predictions = []
references = []
prompts = []

for sample in tqdm(dataset["validation"], desc="Generating summaries"):
    article = sample["introduction"]
    gt_abstract = sample.get("abstract", "")

    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: " + article
    )

    summary = generate_summary(article)
    predictions.append(summary.strip())
    prompts.append(prompt.strip())
    references.append(gt_abstract.strip())

for i in range(3):
    print("------------------------------------------------------")
    print(f"[Sample {i + 1}]")
    print("\n▶ Prompt：\n", prompts[i])
    print("\n▶ Ground Truth 摘要：\n", references[i])
    print("\n▶ 模型生成摘要：\n", predictions[i])
    print("------------------------------------------------------\n")


=== 驗證集摘要預測（前 3 筆） ===



Generating summaries: 100%|██████████| 61/61 [17:19<00:00, 17.04s/it]

------------------------------------------------------
[Sample 1]

▶ Prompt：
 You are a professional academic summarizer. Write a precise and objective abstract for the following research introduction. Do not include poetic or exaggerated language. Only describe the main objectives, methods, and key findings of the paper. If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. Do not add personal opinions or restate this prompt. Use a formal academic tone.

Introduction: In multiple machine learning and computer vision tasks (e.g., image hashing (Chen et al., 2017; Carreira-Perpinán & Raziperchikolaei, 2016) , descriptor learning (Zhang et al., 2017) , metric learning (Mishchuk et al., 2017) and video summarization (Kulesza et al., 2012;Liu et al., 2012

▶ Ground Truth 摘要：
 In the realm of machine learning and computer vision, determinantal point processes (DPPs) stand as a beacon of diversity, wielding the power to enhance a m




In [None]:
metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bertscore = evaluate.load("bertscore")
ground_truths = references
rouge = metric_rouge.compute(predictions=predictions, references=ground_truths, use_stemmer=True)
bertscore = metric_bertscore.compute(predictions=predictions, references=ground_truths, lang="en")
print("=== 評估結果===\n")
print("🔹 ROUGE Scores:")
print(f"  ROUGE-1: {rouge['rouge1']:.4f}")
print(f"  ROUGE-2: {rouge['rouge2']:.4f}")
print(f"  ROUGE-L: {rouge['rougeL']:.4f}")

print("\n🔹 BERTScore (Average):")
print(f"  Precision:  {sum(bertscore['precision']) / len(bertscore['precision']):.4f}")
print(f"  Recall:     {sum(bertscore['recall']) / len(bertscore['recall']):.4f}")
print(f"  F1 Score:   {sum(bertscore['f1']) / len(bertscore['f1']):.4f}")


Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== 評估結果===

🔹 ROUGE Scores:
  ROUGE-1: 0.4509
  ROUGE-2: 0.1455
  ROUGE-L: 0.2321

🔹 BERTScore (Average):
  Precision:  0.8741
  Recall:     0.8537
  F1 Score:   0.8637
