In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

file_path = '/content/drive/MyDrive/nlp_project_02/data/totto_preprocessed_train.json'
df = pd.read_json(file_path)

display(df.head())

Unnamed: 0,id,input,target
0,1762238357686640128,[PAGE] List of 8/9 PM telenovelas of Rede Glob...,A Favorita is the telenovela aired in the 9 pm...
1,7906730525723842560,[PAGE] List of Chicago Bears first-round draft...,The Chicago Bears recent first round selection...
2,6196487034766761984,[PAGE] Brian Ebersole [SEC] Mixed martial arts...,Ebersole made his UFC debut against Chris Lytl...
3,5254211070576122880,[PAGE] 78th United States Congress [SEC] Senat...,William Warren Barbour (R) served as Senate un...
4,-5206051586137920512,[PAGE] Elagabalus [SEC] External links [TEXT] ...,Elagabalus (204 – 11 March 222) was Roman empe...


In [4]:
import json
from dataclasses import dataclass
from typing import List, Dict

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [3]:
def load_json_or_jsonl(path: str) -> List[Dict]:
    if path.endswith(".jsonl"):
        data = []
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    data.append(json.loads(line))
        return data
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


@dataclass
class T5Example:
    input: str
    target: str


class T5Dataset(Dataset):
    def __init__(self, examples: List[T5Example], tokenizer, max_input_len=512, max_target_len=128):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        ex = self.examples[idx]

        model_inputs = self.tokenizer(
            ex.input,
            max_length=self.max_input_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        labels = self.tokenizer(
            ex.target,
            max_length=self.max_target_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        label_ids = labels["input_ids"].squeeze(0)

        model_inputs = {k: v.squeeze(0) for k, v in model_inputs.items()}
        model_inputs["labels"] = label_ids

        return model_inputs


def build_examples(data: List[Dict]) -> List[T5Example]:
    examples = []
    for row in data:
        inp = row.get("input", "")
        tgt = row.get("target", "")

        # ✅ 반드시 strip 체크
        if inp and tgt and inp.strip() and tgt.strip():
            examples.append(T5Example(input=inp, target=tgt))

    return examples


In [13]:
train_input_path = "/content/drive/MyDrive/nlp_project_02/data/totto_preprocessed_train.json"
val_input_path = "/content/drive/MyDrive/nlp_project_02/data/totto_preprocessed_dev.json"
model_name = "t5-base"
output_dir = "/content/drive/MyDrive/t5_totto_ckpt"

train_data = load_json_or_jsonl(train_input_path)
train_examples = build_examples(train_data)
val_data = load_json_or_jsonl(val_input_path)
val_examples = build_examples(val_data)

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

train_dataset = T5Dataset(train_examples, tokenizer)
val_dataset = T5Dataset(val_examples, tokenizer)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# def count_valid_labels(dataset):
#     bad = 0
#     for i in range(len(dataset)):
#         labels = dataset[i]["labels"]
#         if (labels != -100).sum() == 0:
#             bad += 1
#     return bad

# print("bad samples:", count_valid_labels(train_dataset))
# print("bad samples (val):", count_valid_labels(val_dataset))

In [None]:
print(len(val_dataset))

22293


In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,      # ✅ eval 배치도 지정 (권장)
    gradient_accumulation_steps=4,

    learning_rate=3e-4,
    num_train_epochs=3,

    logging_strategy="steps",     # 기본값이긴 하지만 명시 추천
    logging_steps=100,             # n step마다 train loss 출력
    logging_first_step=True,      # 첫 step에서도 출력 (디버깅에 좋음)

    eval_strategy="steps",        # ✅ step마다 평가
    eval_steps=500,                     # ✅ n step마다 eval_loss 출력
    save_steps=500,                     # ✅ eval_steps와 맞추는 게 보통 좋음
    save_total_limit=2,

    load_best_model_at_end=True,        # ✅ eval_loss 기준 best ckpt 로드
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    fp16=False,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

train_result = trainer.train()

# 마지막에 한 번 더 eval loss 확인(선택)
eval_metrics = trainer.evaluate()
print("Final eval metrics:", eval_metrics)

trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
500,0.2227,0.196931
1000,0.2025,0.187464
1500,0.1918,0.183484
2000,0.1782,0.180961
2500,0.1814,0.179776


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


Final eval metrics: {'eval_loss': 0.17977598309516907, 'eval_runtime': 266.1344, 'eval_samples_per_second': 83.766, 'eval_steps_per_second': 2.619, 'epoch': 3.0}


('/content/drive/MyDrive/t5_totto_ckpt/tokenizer_config.json',
 '/content/drive/MyDrive/t5_totto_ckpt/special_tokens_map.json',
 '/content/drive/MyDrive/t5_totto_ckpt/spiece.model',
 '/content/drive/MyDrive/t5_totto_ckpt/added_tokens.json',
 '/content/drive/MyDrive/t5_totto_ckpt/tokenizer.json')

추가 epoch

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

ckpt_path = "/content/drive/MyDrive/t5_totto_ckpt/checkpoint-3000"

tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/t5_totto_ckpt",

    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,      # ✅ eval 배치도 지정 (권장)
    gradient_accumulation_steps=4,

    learning_rate=3e-4,
    num_train_epochs=5,

    logging_strategy="steps",     # 기본값이긴 하지만 명시 추천
    logging_steps=100,             # n step마다 train loss 출력
    logging_first_step=True,      # 첫 step에서도 출력 (디버깅에 좋음)

    eval_strategy="steps",        # ✅ step마다 평가
    eval_steps=500,                     # ✅ n step마다 eval_loss 출력
    save_steps=500,                     # ✅ eval_steps와 맞추는 게 보통 좋음
    save_total_limit=2,

    load_best_model_at_end=True,        # ✅ eval_loss 기준 best ckpt 로드
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    fp16=False,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train(resume_from_checkpoint=ckpt_path)

  trainer = Seq2SeqTrainer(
There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss
3500,0.1773,0.181815
4000,0.1746,0.179877
4500,0.1719,0.179299


Step,Training Loss,Validation Loss
3500,0.1773,0.181815
4000,0.1746,0.179877
4500,0.1719,0.179299


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=4695, training_loss=0.06321965462356584, metrics={'train_runtime': 8239.8518, 'train_samples_per_second': 72.859, 'train_steps_per_second': 0.57, 'total_flos': 3.548866525868851e+17, 'train_loss': 0.06321965462356584, 'epoch': 5.0})

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

ckpt_path = "/content/drive/MyDrive/t5_totto_ckpt"

tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/t5_totto_ckpt_eval",
    per_device_eval_batch_size=8,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [15]:
import numpy as np
from tqdm import tqdm

ckpt_path = "/content/drive/MyDrive/t5_totto_ckpt_decoder/checkpoint-2000"

tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

eval_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/t5_totto_ckpt_eval",
    per_device_eval_batch_size=8,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=eval_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


def generate_predictions(trainer, dataset, tokenizer, max_gen_len=128):
    trainer.model.eval()
    preds = []
    refs = []

    dataloader = trainer.get_eval_dataloader(dataset)

    for batch in tqdm(dataloader):
        batch = {k: v.to(trainer.model.device) for k, v in batch.items()}

        with torch.no_grad():
            generated_ids = trainer.model.generate(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=max_gen_len,
                num_beams=4,
            )

        decoded_preds = tokenizer.batch_decode(
            generated_ids, skip_special_tokens=True
        )
        decoded_refs = tokenizer.batch_decode(
            batch["labels"].masked_fill(batch["labels"] == -100, tokenizer.pad_token_id),
            skip_special_tokens=True,
        )

        preds.extend([p.strip() for p in decoded_preds])
        refs.extend([r.strip() for r in decoded_refs])

    return preds, refs


preds, refs = generate_predictions(trainer, val_dataset, tokenizer)

  trainer = Seq2SeqTrainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)
100%|██████████| 2787/2787 [43:00<00:00,  1.08it/s]


In [16]:
import json

def save_predictions_to_json(preds, refs, save_path):
    assert len(preds) == len(refs)

    outputs = []
    for p, r in zip(preds, refs):
        outputs.append({
            "prediction": p,
            "reference": r
        })

    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(outputs, f, ensure_ascii=False, indent=2)

In [16]:
!pip install sacrebleu



In [18]:
import sacrebleu

bleu = sacrebleu.corpus_bleu(preds, [refs])
print("BLEU score:", bleu.score)

TypeError: BLEU: `refs` should be a sequence of sequence of strings.

In [32]:
save_path = "/content/drive/MyDrive/nlp_project_02/t5_val_predictions_decoder.json"
save_predictions_to_json(preds, refs, save_path)

print(f"Saved predictions to {save_path}")

Saved predictions to /content/drive/MyDrive/nlp_project_02/t5_val_predictions_decoder.json


parent_metric 함수

In [33]:
# parent_metric.py 내용 그대로
import collections
from collections import Counter

def get_ngrams(segment, max_order):
    """텍스트에서 1~max_order까지의 n-gram을 추출하여 카운트합니다."""
    ngram_counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(0, len(segment) - order + 1):
            ngram = tuple(segment[i:i+order])
            ngram_counts[ngram] += 1
    return ngram_counts

def parent_score(predictions, references, tables, lambda_weight=0.5):
    """
    PARENT 지표를 계산합니다.
    Args:
        predictions: 생성된 문장 리스트 (List[str])
        references: 정답 문장 리스트의 리스트 (List[List[str]])
        tables: 테이블 데이터 리스트 (List[List[str]]) - 각 테이블은 셀 값(문자열)들의 리스트
        lambda_weight: Precision 계산 시 테이블과 레퍼런스 비중 (기본 0.5)
    Returns:
        precision, recall, f1 (float, float, float)
    """
    total_precision = 0.0
    total_recall = 0.0
    total_f1 = 0.0

    max_order = 4
    smoothing = 1e-13

    for pred_text, ref_texts, table_texts in zip(predictions, references, tables):
        pred_tokens = pred_text.strip().split()
        ref_tokens_list = [ref.strip().split() for ref in ref_texts]

        table_tokens = []
        for cell_value in table_texts:
            table_tokens.extend(str(cell_value).split())

        pred_ngrams = get_ngrams(pred_tokens, max_order)
        ref_ngrams_list = [get_ngrams(ref, max_order) for ref in ref_tokens_list]
        table_ngrams = get_ngrams(table_tokens, max_order)

        numerator_prec = 0.0
        denominator_prec = sum(pred_ngrams.values()) + smoothing

        for ngram, count in pred_ngrams.items():
            prob_in_table = 1.0 if ngram in table_ngrams else 0.0
            prob_in_ref = 0.0
            for ref_ngrams in ref_ngrams_list:
                prob_in_ref = max(prob_in_ref, min(1.0, ref_ngrams.get(ngram, 0) / count))

            w_prob = prob_in_table + prob_in_ref * (1.0 - prob_in_table)
            numerator_prec += count * w_prob

        precision = numerator_prec / denominator_prec

        numerator_rec = 0.0
        denominator_rec = 0.0 + smoothing

        best_recall = 0.0

        for ref_ngrams in ref_ngrams_list:
            curr_num = 0.0
            curr_denom = 0.0 + smoothing

            for ngram, count in ref_ngrams.items():
                if ngram in table_ngrams:
                    curr_denom += count
                    if ngram in pred_ngrams:
                        curr_num += min(count, pred_ngrams[ngram])

            if curr_denom > smoothing:
                best_recall = max(best_recall, curr_num / curr_denom)

        recall = best_recall

        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        total_precision += precision
        total_recall += recall
        total_f1 += f1

    n = len(predictions)
    return (total_precision / n), (total_recall / n), (total_f1 / n)


overall parent 결과 확인

In [34]:
import json

pred_path = "/content/drive/MyDrive/nlp_project_02/t5_val_predictions_decoder.json"  # 예: JSON 배열
dev_path = "/content/drive/MyDrive/nlp_project_02/data/totto_dev_data.jsonl"

# 1) 예측 로드 (JSON 배열)
with open(pred_path, "r", encoding="utf-8") as f:
    preds_data = json.load(f)

# 2) dev 펼치기 (reference 문장 기준으로 flatten)
flat_dev = []
with open(dev_path, "r", encoding="utf-8") as f:
    for line in f:
        ex = json.loads(line)
        annos = ex.get("sentence_annotations", [])
        for anno in annos:
            ref = anno.get("final_sentence", "")
            if ref:
                flat_dev.append({
                    "reference": ref,
                    "table": ex.get("table", []),
                })

print("preds:", len(preds_data))
print("flat_dev:", len(flat_dev))

# 3) 길이 맞는지 확인
min_len = min(len(preds_data), len(flat_dev))

preds = []
refs = []
tables = []

for i in range(min_len):
    pred_ex = preds_data[i]
    dev_ex = flat_dev[i]

    preds.append(pred_ex["prediction"])
    refs.append([dev_ex["reference"]])

    raw_table = dev_ex["table"]
    table_values = []
    for row in raw_table:
        for cell in row:
            val = cell.get("value", "")
            if val:
                table_values.append(str(val))
    tables.append(table_values)


# 4) PARENT 계산
precision, recall, f1 = parent_score(preds, refs, tables)
print(f"PARENT Precision: {precision*100:.2f}")
print(f"PARENT Recall:    {recall*100:.2f}")
print(f"PARENT F1:        {f1*100:.2f}")

preds: 22293
flat_dev: 22293
PARENT Precision: 43.22
PARENT Recall:    76.72
PARENT F1:        51.29


prediction+reference+table을 json 파일 저장

In [40]:
out_path = "/content/drive/MyDrive/nlp_project_02/t5_val_predictions_with_table_decoder.jsonl"

# 예측 로드
with open(pred_path, "r", encoding="utf-8") as f:
    preds_data = json.load(f)

# dev 펼치기
flat_dev = []
with open(dev_path, "r", encoding="utf-8") as f:
    for line in f:
        ex = json.loads(line)
        annos = ex.get("sentence_annotations", [])
        for anno in annos:
            ref = anno.get("final_sentence", "")
            if ref:
                flat_dev.append({
                    "reference": ref,
                    "table": ex.get("table", []),
                })

assert len(preds_data) == len(flat_dev), "길이가 다릅니다."

# 합쳐서 저장
with open(out_path, "w", encoding="utf-8") as fout:
    for pred_ex, dev_ex in zip(preds_data, flat_dev):
        merged = {
            "prediction": pred_ex["prediction"],
            "reference": dev_ex["reference"],
            "table": dev_ex["table"],
        }
        fout.write(json.dumps(merged, ensure_ascii=False) + "\n")

print("saved:", out_path)

saved: /content/drive/MyDrive/nlp_project_02/t5_val_predictions_with_table_decoder.jsonl


overlap/nonoverlap 분리해서 parent 결과 확인

In [41]:
import json

pred_path = "/content/drive/MyDrive/nlp_project_02/t5_val_predictions_2.json"
dev_path = "/content/drive/MyDrive/nlp_project_02/data/totto_dev_data.jsonl"
out_path = "/content/drive/MyDrive/nlp_project_02/t5_val_predictions_with_table_with_flag.jsonl"

# 1) 예측 로드 (JSON 배열)
with open(pred_path, "r", encoding="utf-8") as f:
    preds_data = json.load(f)

# 2) dev 펼치기 (+ overlap_subset 포함)
flat_dev = []
with open(dev_path, "r", encoding="utf-8") as f:
    for line in f:
        ex = json.loads(line)
        annos = ex.get("sentence_annotations", [])
        flag = ex.get("overlap_subset", "unknown")
        for anno in annos:
            ref = anno.get("final_sentence", "")
            if ref:
                flat_dev.append({
                    "reference": ref,
                    "table": ex.get("table", []),
                    "overlap_subset": flag,
                })

assert len(preds_data) == len(flat_dev), "길이가 다릅니다."

# 3) 합쳐서 저장
with open(out_path, "w", encoding="utf-8") as fout:
    for pred_ex, dev_ex in zip(preds_data, flat_dev):
        merged = {
            "prediction": pred_ex["prediction"],
            "reference": dev_ex["reference"],
            "table": dev_ex["table"],
            "overlap_subset": dev_ex["overlap_subset"],
        }
        fout.write(json.dumps(merged, ensure_ascii=False) + "\n")

print("saved:", out_path)

# 4) overlap / nonoverlap 평가
def eval_parent_by_flag(records, flag_value):
    preds, refs, tables = [], [], []
    for r in records:
        if r.get("overlap_subset") != flag_value:
            continue
        preds.append(r["prediction"])
        refs.append([r["reference"]])

        table_values = []
        for row in r["table"]:
            for cell in row:
                val = cell.get("value", "")
                if val:
                    table_values.append(str(val))
        tables.append(table_values)

    if not preds:
        return None
    return parent_score(preds, refs, tables)

# merged 로드
records = []
with open(out_path, "r", encoding="utf-8") as f:
    for line in f:
        records.append(json.loads(line))

# flag 값 확인 (예: "overlap", "nonoverlap")
flags = sorted(set(r.get("overlap_subset") for r in records))
print("flags:", flags)

for flag in flags:
    result = eval_parent_by_flag(records, flag)
    if result is None:
        print(f"{flag}: no samples")
        continue
    p, r, f1 = result
    print(f"{flag} -> PARENT P/R/F1: {p*100:.2f} / {r*100:.2f} / {f1*100:.2f}")


saved: /content/drive/MyDrive/nlp_project_02/t5_val_predictions_with_table_with_flag.jsonl
flags: [False, True]
False -> PARENT P/R/F1: 37.79 / 74.78 / 46.42
True -> PARENT P/R/F1: 48.63 / 79.98 / 56.54


In [24]:
import sacrebleu

def eval_metrics_by_flag(records, flag_value):
    preds, refs, tables = [], [], []
    for r in records:
        if r.get("overlap_subset") != flag_value:
            continue
        preds.append(r["prediction"])
        refs.append([r["reference"]])  # list of lists

        table_values = []
        for row in r["table"]:
            for cell in row:
                val = cell.get("value", "")
                if val:
                    table_values.append(str(val))
        tables.append(table_values)

    if not preds:
        return None

    # BLEU
    bleu = sacrebleu.corpus_bleu(preds, [ [r[0] for r in refs] ])

    # PARENT
    precision, recall, f1 = parent_score(preds, refs, tables)

    return bleu.score, precision, recall, f1

for flag in flags:
    result = eval_metrics_by_flag(records, flag)
    if result is None:
        print(f"{flag}: no samples")
        continue
    bleu, p, r, f1 = result
    print(f"{flag} -> BLEU: {bleu:.2f} | PARENT P/R/F1: {p*100:.2f} / {r*100:.2f} / {f1*100:.2f}")


False -> BLEU: 30.46 | PARENT P/R/F1: 37.79 / 74.78 / 46.42
True -> BLEU: 44.50 | PARENT P/R/F1: 48.63 / 79.98 / 56.54


decoder만 파인튜닝하기

In [29]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

def set_trainable(module, flag):
    for p in module.parameters():
        p.requires_grad = flag

# 1) 전부 freeze
set_trainable(model, False)

# 2) 디코더만 학습
set_trainable(model.decoder, True)

# (선택) shared embedding도 함께 학습하고 싶으면 켬
# set_trainable(model.shared, True)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/t5_totto_ckpt_decoder",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=1,
    logging_steps=100,
    eval_strategy="steps",        # ✅ step마다 평가
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    fp16=False,
    report_to="none",
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
500,0.1739,0.181762
1000,0.1804,0.179978
1500,0.1865,0.180805
2000,0.1776,0.179137
2500,0.1762,0.179935
3000,0.1698,0.17991
3500,0.1738,0.180006


KeyboardInterrupt: 