## 安裝

In [1]:
import json

file_path = "T5_fullfinetuning_github.ipynb"  # 你的 notebook 路徑

with open(file_path, "r", encoding="utf-8") as f:
    notebook = json.load(f)

# 修正 widgets metadata
if "widgets" in notebook["metadata"]:
    if "state" not in notebook["metadata"]["widgets"]:
        notebook["metadata"]["widgets"]["state"] = {}

with open(file_path, "w", encoding="utf-8") as f:
    json.dump(notebook, f, indent=2)


In [None]:
!pip install transformers datasets evaluate scikit-learn

In [None]:
!pip install peft rouge_score bert_score

In [3]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [4]:
from google.colab import drive

# 掛載 Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# 設定資料夾路徑
train_path = "/content/drive/MyDrive/data/train.json"
test_path = "/content/drive/MyDrive/data/test.json"

In [6]:
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback
)
from google.colab import files
from tqdm import tqdm

import random
import numpy as np
import torch
import json
import shutil
import os
import evaluate
import gc

In [7]:
seed = 18888888

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

## Full fine tuning google/flan-t5-base

In [8]:
# === 參數設定 ===
model_name = "google/flan-t5-base"

max_input_length = 2048
max_target_length = 600

# === 載入並切分資料 ===
with open(train_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
train_ds = Dataset.from_list(train_data)
val_ds = Dataset.from_list(val_data)
dataset = DatasetDict({"train": train_ds, "validation": val_ds})

In [9]:
# === 載入 tokenizer 與模型 ===
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.gradient_checkpointing_enable()

# === 預處理函式 ===
def preprocess(example):
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
    )
    inputs = prompt + example["introduction"]
    targets = example["abstract"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding="max_length")["input_ids"]
        labels = [t if t != tokenizer.pad_token_id else -100 for t in labels]
    model_inputs["labels"] = labels
    return model_inputs

tokenized_dataset = dataset.map(preprocess, batched=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]



Map:   0%|          | 0/41 [00:00<?, ? examples/s]

In [10]:
print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"], skip_special_tokens=True))

You are a professional academic summarizer. Write a precise and objective abstract for the following research introduction. Do not include poetic or exaggerated language. Only describe the main objectives, methods, and key findings of the paper. If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. Do not add personal opinions or restate this prompt. Use a formal academic tone. Introduction: With the rapid development of deep vision detection technology in artificial intelligence, detecting anomalies/defects on the surface of industrial products has received unprecedented attention.Changeover in manufacturing refers to converting a line or machine from processing one product to another.Since the equipment has not been completely fine-tuned after the start of the production line, changeover frequently results in unsatisfactory anomaly detection (AD) performance.How to achieve rapid training of industrial product models in the 

In [11]:
from functools import reduce
from operator import mul

total_params = sum(p.numel() for p in model.parameters())
total_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)

print(f"Total parameters: {total_params:,}")
print(f"Approx. model size: {total_size:.2f} MB")

Total parameters: 247,577,856
Approx. model size: 944.43 MB


In [12]:
# === 訓練參數設定 ===
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_501",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=2.5e-5,
    num_train_epochs=20,
    predict_with_generate=False,
    fp16=False,
    bf16=True,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    label_smoothing_factor=0.1,
    warmup_steps=500,
    lr_scheduler_type="linear",
    greater_is_better=False,
    report_to="none",
    seed=seed
)

# === Data collator（避免 loss 為 nan）===
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding=True
)

# === 訓練器 Trainer ===
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

trainer.train()

  trainer = Seq2SeqTrainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
1,4.0933,3.888575
2,3.7537,3.686254
3,3.7535,3.600255
4,3.5665,3.549841
5,3.5258,3.519753
6,3.4793,3.498767
7,3.4827,3.474402
8,3.4796,3.459363
9,3.2644,3.451548
10,3.3466,3.445214


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=7340, training_loss=3.425368244836376, metrics={'train_runtime': 2416.1696, 'train_samples_per_second': 3.038, 'train_steps_per_second': 3.038, 'total_flos': 2.010447586787328e+16, 'train_loss': 3.425368244836376, 'epoch': 20.0})

In [13]:
# 儲存本地備份
trainer.save_model("t5_504_final")
tokenizer.save_pretrained("t5_504_final")

('t5_504_final/tokenizer_config.json',
 't5_504_final/special_tokens_map.json',
 't5_504_final/spiece.model',
 't5_504_final/added_tokens.json',
 't5_504_final/tokenizer.json')

In [None]:
# === 上傳模型到 Hugging Face Hub ===
#from huggingface_hub import login
#login("hf_xxxx")

#model.push_to_hub("xxxx/T504")
#tokenizer.push_to_hub("xxxx/T504")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/NCCUTAT/T5_nolora504/commit/53cc6f9dcd39b7cb5cbe7be58310342cf0878718', commit_message='Upload tokenizer', commit_description='', oid='53cc6f9dcd39b7cb5cbe7be58310342cf0878718', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NCCUTAT/T5_nolora504', endpoint='https://huggingface.co', repo_type='model', repo_id='NCCUTAT/T5_nolora504'), pr_revision=None, pr_num=None)

In [14]:
checkpoint_dir = "./t5_504"

checkpoints = [d for d in os.listdir(checkpoint_dir) if d.startswith("checkpoint-")]

if not checkpoints:
    print("沒有找到任何 checkpoint，請確認 save_steps 設定！")
else:
    last_checkpoint = sorted(checkpoints, key=lambda x: int(x.split("-")[-1]))[-1]
    last_checkpoint_path = os.path.join(checkpoint_dir, last_checkpoint)
    print(f"找到最後一個 checkpoint: {last_checkpoint_path}")

    zip_filename = f"{last_checkpoint}.zip"
    shutil.make_archive(last_checkpoint, 'zip', last_checkpoint_path)

    files.download(zip_filename)

找到最後一個 checkpoint: ./t5_504/checkpoint-7340


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
zip_path = "./checkpoint-7340.zip"
extract_dir = "./checkpoint-7340.zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

checkpoint_path = extract_dir
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

In [None]:
del model
del tokenizer
gc.collect()
torch.cuda.empty_cache()

### 驗證

In [None]:
from huggingface_hub import login
login("hf_xxxx")

model = AutoModelForSeq2SeqLM.from_pretrained("xxxx/T504")
tokenizer = AutoTokenizer.from_pretrained("xxxx/T504")

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained("/content/t5_504_final")
tokenizer = AutoTokenizer.from_pretrained("/content/t5_504_final")

In [None]:
from functools import reduce
from operator import mul

total_params = sum(p.numel() for p in model.parameters())
total_size = sum(p.numel() * p.element_size() for p in model.parameters()) / (1024 ** 2)

print(f"Total parameters: {total_params:,}")
print(f"Approx. model size: {total_size:.2f} MB")

Total parameters: 247,577,856
Approx. model size: 944.43 MB


In [None]:
# === 推理函式 ===
def generate_summary(text):
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
    )
    input_text = prompt + text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_length).to(model.device)
    outputs = model.generate(
    **inputs,
    max_new_tokens=max_target_length,
    min_length=200,
    num_beams=4,
    early_stopping=True,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print("\n=== 驗證集摘要預測（前 3 筆） ===\n")
predictions = []
references = []
prompts = []

for sample in tqdm(dataset["validation"], desc="Generating summaries"):
    article = sample["introduction"]
    gt_abstract = sample.get("abstract", "")
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: " + article
    )

    summary = generate_summary(article)
    predictions.append(summary.strip())
    prompts.append(prompt.strip())
    references.append(gt_abstract.strip())

for i in range(3):
    print("------------------------------------------------------")
    print(f"[Sample {i + 1}]")
    print("\n▶ Prompt：\n", prompts[i])
    print("\n▶ Ground Truth 摘要：\n", references[i])
    print("\n▶ 模型生成摘要：\n", predictions[i])
    print("------------------------------------------------------\n")


=== 驗證集摘要預測（前 3 筆） ===



Generating summaries: 100%|██████████| 41/41 [13:13<00:00, 19.35s/it]

------------------------------------------------------
[Sample 1]

▶ Prompt：
 You are a professional academic summarizer. Write a precise and objective abstract for the following research introduction. Do not include poetic or exaggerated language. Only describe the main objectives, methods, and key findings of the paper. If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. Do not add personal opinions or restate this prompt. Use a formal academic tone.

Introduction: The channel configuration (a.k.a.. filter numbers or channel numbers) of a neural network plays a critical role in its affordability on resource constrained platforms, such as mobile phones, wearables and Internet of Things (IoT) devices.The most common constraints (Liu et al., 2017b;Huang et al., 2017;Wang et al., 2017;Han et al., 2015a), i.e., latency, FLOPs and runtime memory footprint, are all bound to the number of channels.For example, in a single convolu




In [None]:
metric_rouge = evaluate.load("rouge", rouge_types=["rouge1", "rouge2", "rougeL"])
metric_bertscore = evaluate.load("bertscore")

ground_truths = references
rouge = metric_rouge.compute(predictions=predictions, references=ground_truths, use_stemmer=True)
bertscore = metric_bertscore.compute(predictions=predictions, references=ground_truths, lang="en")

print("=== 評估結果===\n")
print("🔹 ROUGE Scores:")
print(f"  ROUGE-1: {rouge['rouge1']:.4f}")
print(f"  ROUGE-2: {rouge['rouge2']:.4f}")
print(f"  ROUGE-L: {rouge['rougeL']:.4f}")

print("\n🔹 BERTScore (Average):")
print(f"  Precision:  {sum(bertscore['precision']) / len(bertscore['precision']):.4f}")
print(f"  Recall:     {sum(bertscore['recall']) / len(bertscore['recall']):.4f}")
print(f"  F1 Score:   {sum(bertscore['f1']) / len(bertscore['f1']):.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== 評估結果===

🔹 ROUGE Scores:
  ROUGE-1: 0.4861
  ROUGE-2: 0.1613
  ROUGE-L: 0.2471

🔹 BERTScore (Average):
  Precision:  0.8795
  Recall:     0.8596
  F1 Score:   0.8694


### 預測

In [None]:
# === 載入並切分資料 ===
with open(test_path, "r", encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f]

test_ds = Dataset.from_list(test_data)
test_dataset = DatasetDict({"test":test_ds})

In [None]:
print("Test data size:", len(test_dataset["test"]))

Test data size: 103


In [None]:
# === 推理函式 ===
def generate_summary(text):
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
    )
    input_text = prompt + text
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=max_input_length).to(model.device)
    outputs = model.generate(
    **inputs,
    max_new_tokens=max_target_length,
    min_length=200,
    num_beams=4,
    early_stopping=True,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
results = []
for sample in tqdm(test_data, desc="Generating summaries"):
    paper_id = sample["paper_id"]
    intro = sample["introduction"]
    summary = generate_summary(intro)
    results.append({
        "paper_id": paper_id,
        "abstract": summary.strip()
    })

Generating summaries: 100%|██████████| 103/103 [12:35<00:00,  7.34s/it]


In [None]:
from google.colab import files

output_path = "generated_abstractsT5_504.jsonl"

with open(output_path, "w", encoding="utf-8") as f:
    for item in results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

files.download(output_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>