## 安裝

In [None]:
!pip install transformers datasets evaluate scikit-learn

In [None]:
!pip install peft rouge_score bert_score

In [None]:
!pip install -U bitsandbytes

In [None]:
import json
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM,
    T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq, EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType, PeftModel
import random
import numpy as np
import torch
import os
import gc
import torch
import evaluate

# Seed every random number generator the notebook touches, in a fixed order,
# so that repeated runs start from the same state.
seed = 2024

for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(seed)

In [None]:
# Configure PyTorch's CUDA allocator through its environment variable.
# NOTE(review): presumably this has to be set before the first CUDA allocation to take effect — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive; the dataset paths used below live under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Dataset locations on the mounted Google Drive.
# The training file is later read line by line, one JSON object per line.
train_path = "/content/drive/MyDrive/data/train.json"
test_path = "/content/drive/MyDrive/data/test.json"

## T5_lora

In [None]:
# === Hyperparameters ===
model_name = "google/flan-t5-xl"

# Token budgets for the encoder input and the generated abstract.
max_input_length = 3096
max_target_length = 600

# === Load the training file (one JSON object per line) and split it 90/10 ===
with open(train_path, "r", encoding="utf-8") as f:
    data = list(map(json.loads, f))

train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)
train_ds, val_ds = (Dataset.from_list(rows) for rows in (train_data, val_data))
dataset = DatasetDict({"train": train_ds, "validation": val_ds})

In [None]:
# === Load tokenizer and model ===
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Gradient checkpointing recomputes activations during the backward pass to lower memory use.
model.gradient_checkpointing_enable()

# === Preprocessing ===
def preprocess(example):
    """Tokenize one dataset row into encoder inputs and decoder labels.

    Args:
        example: A row with string fields ``"introduction"`` and ``"abstract"``.

    Returns:
        The tokenizer output for the instruction-prefixed introduction, truncated
        and padded to ``max_input_length``, plus a ``"labels"`` list of length
        ``max_target_length`` in which pad positions are replaced by ``-100``.
    """
    prompt = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
    )
    inputs = prompt + example["introduction"]
    targets = example["abstract"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )["input_ids"]
    # Mask pad positions with -100 (the same label pad id the data collator uses)
    # so that padding does not contribute to the loss.
    model_inputs["labels"] = [t if t != tokenizer.pad_token_id else -100 for t in labels]
    return model_inputs

# Tokenize both splits one example at a time (preprocess expects a single row, not a batch).
tokenized_dataset = dataset.map(preprocess, batched=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/367 [00:00<?, ? examples/s]



Map:   0%|          | 0/41 [00:00<?, ? examples/s]

In [None]:
# Sanity check: decode the first tokenized training example back into text.
print(tokenizer.decode(tokenized_dataset["train"][0]["input_ids"], skip_special_tokens=True))

You are a professional academic summarizer. Write a precise and objective abstract for the following research introduction. Do not include poetic or exaggerated language. Only describe the main objectives, methods, and key findings of the paper. If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. Do not add personal opinions or restate this prompt. Use a formal academic tone. Introduction: With the rapid development of deep vision detection technology in artificial intelligence, detecting anomalies/defects on the surface of industrial products has received unprecedented attention.Changeover in manufacturing refers to converting a line or machine from processing one product to another.Since the equipment has not been completely fine-tuned after the start of the production line, changeover frequently results in unsatisfactory anomaly detection (AD) performance.How to achieve rapid training of industrial product models in the 

In [None]:
# === LoRA ===
# Attach low-rank adapters to the q / v / k / o modules; only the adapter
# weights end up trainable (see the printed parameter summary).
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v", "k", "o"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 18,874,368 || all params: 2,868,631,552 || trainable%: 0.6580


In [None]:
from functools import reduce
from operator import mul

# Walk the parameters once, accumulating the element count and the byte footprint.
total_params = 0
_total_bytes = 0
for _param in model.parameters():
    total_params += _param.numel()
    _total_bytes += _param.numel() * _param.element_size()
total_size = _total_bytes / (1024 ** 2)  # bytes -> MiB

print(f"🧠 Total parameters: {total_params:,}")
print(f"💾 Approx. model size: {total_size:.2f} MB")

🧠 Total parameters: 2,868,631,552
💾 Approx. model size: 10942.96 MB


In [None]:
# === Training configuration ===
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; switch once the installed version is pinned — confirm.
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5_43",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=5e-5,
    num_train_epochs=3,
    warmup_ratio=0.1,
    predict_with_generate=True,
    fp16=False,
    bf16=True,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    label_smoothing_factor=0.1,
    greater_is_better=False,
    optim="adafactor",
    lr_scheduler_type="cosine",
    report_to="none",
    seed=seed,
    # The PEFT wrapper hides the base model's forward() arguments, so Trainer
    # cannot infer the label column by itself; name it explicitly instead of
    # letting it fall back to an empty list.
    label_names=["labels"],
)

# === Data collator (pads labels with -100 so padding never produces a NaN loss) ===
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding=True
)

# === Train ===
# NOTE(review): evaluation runs once per epoch for 3 epochs, so a patience of 3
# can never be exhausted; early stopping is effectively inactive here.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)
trainer.train()


  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss
1,3.5881,3.47811
2,3.3715,3.32772


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


TrainOutput(global_step=549, training_loss=3.5816610437056187, metrics={'train_runtime': 2697.0451, 'train_samples_per_second': 0.408, 'train_steps_per_second': 0.204, 'total_flos': 5.706367329632256e+16, 'train_loss': 3.5816610437056187, 'epoch': 2.9863760217983653})

In [None]:
# Reload a fresh base model and attach the LoRA adapter saved at the final training step (549).
# NOTE(review): this is the last checkpoint, which is not necessarily the one with
# the lowest validation loss — confirm this is the intended adapter.
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = PeftModel.from_pretrained(base_model, "/content/t5_43/checkpoint-549", is_trainable=True)

model.print_trainable_parameters()

# Count the parameters introduced by LoRA (every parameter whose name contains "lora_").
lora_params = sum(p.numel() for n, p in model.named_parameters() if "lora_" in n)
print(f"LoRA 導入參數數量：{lora_params:,}")

# Fold the adapter weights into the base weights; `model` is a plain transformers model afterwards.
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 18,874,368 || all params: 2,868,631,552 || trainable%: 0.6580
LoRA 導入參數數量：18,874,368


In [None]:
# Show which class the merged model is.
print(type(model))

<class 'transformers.models.t5.modeling_t5.T5ForConditionalGeneration'>


In [None]:
# Save the merged model and its tokenizer into the same local directory.
model.save_pretrained("t5_43_final")
tokenizer.save_pretrained("t5_43_final")

('t5_43_final/tokenizer_config.json',
 't5_43_final/special_tokens_map.json',
 't5_43_final/spiece.model',
 't5_43_final/added_tokens.json',
 't5_43_final/tokenizer.json')

In [None]:
import os

from huggingface_hub import login

# Never hard-code an access token in the notebook: it leaks with every share or commit.
# Read it from the HF_TOKEN environment variable; when that is unset, `login`
# receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Upload the merged model and its tokenizer to the same Hub repository.
model.push_to_hub("xxx/lora43")
tokenizer.push_to_hub("xxx/lora43")

model-00002-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NCCUTAT/T5_lora43/commit/8eee9d2e404682cf17df37570132bb75875d0eac', commit_message='Upload tokenizer', commit_description='', oid='8eee9d2e404682cf17df37570132bb75875d0eac', pr_url=None, repo_url=RepoUrl('https://huggingface.co/NCCUTAT/T5_lora43', endpoint='https://huggingface.co', repo_type='model', repo_id='NCCUTAT/T5_lora43'), pr_revision=None, pr_num=None)

In [None]:
# Drop every name that keeps model weights alive, then release cached GPU memory.
# `trainer` still references the LoRA training model and `base_model` refers to the
# weights that were merged into `model`, so deleting only `model` would free nothing.
# Popping from globals() (instead of `del`) keeps the cell safe to re-run.
for _name in ("model", "tokenizer", "trainer", "base_model"):
    globals().pop(_name, None)
gc.collect()
torch.cuda.empty_cache()

### 驗證

In [None]:
import os

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import login

# Never hard-code an access token in the notebook: read it from the HF_TOKEN
# environment variable; when that is unset, `login` receives None and falls
# back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

# Download the merged model and tokenizer that were pushed to the Hub.
model = AutoModelForSeq2SeqLM.from_pretrained("xxx/lora43")
tokenizer = AutoTokenizer.from_pretrained("xxx/lora43")

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

In [None]:
import torch

# Load the merged model from disk. `from_pretrained` leaves the weights on the CPU,
# which makes beam-search generation extremely slow, so move the model to the GPU
# when one is available. `generate_summary` sends its inputs to `model.device`.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModelForSeq2SeqLM.from_pretrained("/content/t5_43_final").to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained("/content/t5_43_final")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
from functools import reduce
from operator import mul

# Collect (element count, bytes per element) once per parameter tensor,
# then derive both totals from that list.
_param_stats = [(p.numel(), p.element_size()) for p in model.parameters()]
total_params = sum(count for count, _width in _param_stats)
total_size = sum(count * width for count, width in _param_stats) / (1024 ** 2)  # bytes -> MiB

print(f"Total parameters: {total_params:,}")
print(f"Approx. model size: {total_size:.2f} MB")

Total parameters: 2,849,757,184
Approx. model size: 10870.96 MB


In [None]:
# === Inference helper ===
def generate_summary(text):
    """Generate an abstract for one introduction with the loaded model.

    Args:
        text: The introduction text; the summarization instruction is prepended here.

    Returns:
        The decoded first beam-search output, with special tokens removed.
    """
    instruction = (
        "You are a professional academic summarizer. "
        "Write a precise and objective abstract for the following research introduction. "
        "Do not include poetic or exaggerated language. "
        "Only describe the main objectives, methods, and key findings of the paper. "
        "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
        "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
        "Introduction: "
    )
    # Tokenize on the CPU, then move the tensors to wherever the model lives.
    encoded = tokenizer(
        instruction + text,
        return_tensors="pt",
        truncation=True,
        max_length=max_input_length,
    ).to(model.device)
    generation_options = dict(
        max_new_tokens=max_target_length,
        min_length=200,
        num_beams=4,
        early_stopping=True,
        repetition_penalty=1.2,
        no_repeat_ngram_size=3,
    )
    generated = model.generate(**encoded, **generation_options)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

In [None]:
from tqdm import tqdm

print("\n=== 驗證集摘要預測（前 2 筆） ===\n")

# Same instruction text that generate_summary prepends; rebuilt here only for display.
instruction = (
    "You are a professional academic summarizer. "
    "Write a precise and objective abstract for the following research introduction. "
    "Do not include poetic or exaggerated language. "
    "Only describe the main objectives, methods, and key findings of the paper. "
    "If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. "
    "Do not add personal opinions or restate this prompt. Use a formal academic tone.\n\n"
    "Introduction: "
)

predictions, references, prompts = [], [], []

# Generate summaries for the first four validation rows.
preview_rows = dataset["validation"].select(range(4))
for sample in tqdm(preview_rows, desc="Generating summaries"):
    article = sample["introduction"]
    prompts.append((instruction + article).strip())
    predictions.append(generate_summary(article).strip())
    references.append(sample.get("abstract", "").strip())

# Print the first two samples side by side: prompt, reference abstract, model output.
separator = "------------------------------------------------------"
shown = zip(prompts[:2], references[:2], predictions[:2])
for number, (shown_prompt, truth, generated) in enumerate(shown, start=1):
    print(separator)
    print(f"[Sample {number}]")
    print("\n▶ Prompt：\n", shown_prompt)
    print("\n▶ Ground Truth 摘要：\n", truth)
    print("\n▶ 模型生成摘要：\n", generated)
    print(separator + "\n")


=== 驗證集摘要預測（前 3 筆） ===



Generating summaries: 100%|██████████| 4/4 [12:19<00:00, 184.87s/it]

------------------------------------------------------
[Sample 1]

▶ Prompt：
 You are a professional academic summarizer. Write a precise and objective abstract for the following research introduction. Do not include poetic or exaggerated language. Only describe the main objectives, methods, and key findings of the paper. If the text contains formulas, mathematical notations, or specific numerical results, retain them in the abstract. Do not add personal opinions or restate this prompt. Use a formal academic tone.

Introduction: The channel configuration (a.k.a.. filter numbers or channel numbers) of a neural network plays a critical role in its affordability on resource constrained platforms, such as mobile phones, wearables and Internet of Things (IoT) devices.The most common constraints (Liu et al., 2017b;Huang et al., 2017;Wang et al., 2017;Han et al., 2015a), i.e., latency, FLOPs and runtime memory footprint, are all bound to the number of channels.For example, in a single convolu




In [None]:
metric_rouge = evaluate.load("rouge")
metric_bertscore = evaluate.load("bertscore")

ground_truths = references

# Compute ROUGE and BERTScore.
# `rouge_types` is an argument of `compute`, not of `evaluate.load`; it is passed
# here so that it actually selects which variants are scored.
rouge = metric_rouge.compute(
    predictions=predictions,
    references=ground_truths,
    rouge_types=["rouge1", "rouge2", "rougeL"],
    use_stemmer=True,
)
bertscore = metric_bertscore.compute(predictions=predictions, references=ground_truths, lang="en")

print("=== 評估結果===\n")
print("🔹 ROUGE Scores:")
print(f"  ROUGE-1: {rouge['rouge1']:.4f}")
print(f"  ROUGE-2: {rouge['rouge2']:.4f}")
print(f"  ROUGE-L: {rouge['rougeL']:.4f}")

# BERTScore returns one value per sample; report the mean of each.
print("\n🔹 BERTScore (Average):")
print(f"  Precision:  {sum(bertscore['precision']) / len(bertscore['precision']):.4f}")
print(f"  Recall:     {sum(bertscore['recall']) / len(bertscore['recall']):.4f}")
print(f"  F1 Score:   {sum(bertscore['f1']) / len(bertscore['f1']):.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


=== 評估結果===

🔹 ROUGE Scores:
  ROUGE-1: 0.4381
  ROUGE-2: 0.1284
  ROUGE-L: 0.2112

🔹 BERTScore (Average):
  Precision:  0.8477
  Recall:     0.8307
  F1 Score:   0.8391
