In [5]:
# ============================================================
# 0) Install (Colab)
# ============================================================
!pip -q install "transformers>=4.35.0" accelerate sentencepiece sacrebleu

import os, re, random, inspect, pickle
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from tqdm import tqdm

from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    T5Config,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    TrainerCallback,
)

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# ============================================================
# 1) TrainingArguments compatibility helper
# ============================================================
def build_seq2seq_args(**kwargs):
    sig = inspect.signature(Seq2SeqTrainingArguments.__init__)
    allowed = set(sig.parameters.keys())

    # evaluation_strategy <-> eval_strategy
    if "evaluation_strategy" in kwargs and "evaluation_strategy" not in allowed and "eval_strategy" in allowed:
        kwargs["eval_strategy"] = kwargs.pop("evaluation_strategy")

    clean = {k: v for k, v in kwargs.items() if k in allowed}
    dropped = [k for k in kwargs.keys() if k not in allowed]
    if dropped:
        print("[Info] Dropped unsupported TrainingArguments:", dropped)

    return Seq2SeqTrainingArguments(**clean)

# ============================================================
# 2) ToTTo parser (NO header tokens in input text)
# ============================================================
def parse_totto_string(input_str):
    if input_str is None or (isinstance(input_str, float) and pd.isna(input_str)):
        input_str = ""
    elif not isinstance(input_str, str):
        input_str = str(input_str)

    page_match = re.search(r'\[PAGE\](.*?)(?=\[SEC\]|\[TEXT\]|\[CELL\]|$)', input_str)
    sec_match  = re.search(r'\[SEC\](.*?)(?=\[TEXT\]|\[CELL\]|$)', input_str)
    text_match = re.search(r'\[TEXT\](.*?)(?=\[CELL\]|$)', input_str)

    page_title    = page_match.group(1).strip() if page_match else ""
    section_title = sec_match.group(1).strip() if sec_match else ""
    free_text     = text_match.group(1).strip() if text_match else ""
    if free_text.lower() == "none":
        free_text = ""

    cell_pattern = re.compile(
        r'\[CELL\](.*?)\[TYPE\].*?\[R_HEAD\](.*?)\[C_HEAD\](.*?)(?=\[CELL\]|$)',
        flags=re.DOTALL
    )

    cells = []
    for m in cell_pattern.finditer(input_str):
        val = (m.group(1) or "").strip()
        row = (m.group(2) or "").strip()
        col = (m.group(3) or "").strip()

        if row == "" or row.lower() == "none":
            row = "None"
        if col == "" or col.lower() == "none":
            col = "None"

        cells.append({"val": val, "row": row, "col": col})

    return {"page": page_title, "section": section_title, "text": free_text, "cells": cells}

# ============================================================
# 3) Header semantic init with CACHE (speed!)
# ============================================================
def prepare_semantic_headers_cached(df, cache_path, base_model_name="t5-base"):
    if os.path.exists(cache_path):
        print(f">>> Loading header cache: {cache_path}")
        with open(cache_path, "rb") as f:
            obj = pickle.load(f)
        return obj["header2id"], obj["init_matrix"]

    print(">>> Scanning headers from data...")
    unique_headers = set(["None"])
    for text in tqdm(df["input"].astype(str).tolist()):
        parsed = parse_totto_string(text)
        for c in parsed["cells"]:
            unique_headers.add(c["row"])
            unique_headers.add(c["col"])

    sorted_headers = ["None"] + sorted([h for h in unique_headers if h != "None"])
    header2id = {h: i for i, h in enumerate(sorted_headers)}

    print(">>> Building Embedding Matrix (first time only)...")
    tok = AutoTokenizer.from_pretrained(base_model_name)
    temp_model = T5ForConditionalGeneration.from_pretrained(base_model_name)
    wte = temp_model.get_input_embeddings()

    init_matrix = torch.zeros((len(sorted_headers), temp_model.config.d_model))
    for i, h in enumerate(tqdm(sorted_headers)):
        if i == 0:
            continue
        ids = tok.encode(h, add_special_tokens=False)
        if not ids:
            continue
        with torch.no_grad():
            vecs = wte(torch.tensor([ids]))
            init_matrix[i] = vecs.mean(dim=1).squeeze(0)

    with open(cache_path, "wb") as f:
        pickle.dump({"header2id": header2id, "init_matrix": init_matrix}, f)
    print(f">>> Saved header cache: {cache_path}")

    return header2id, init_matrix

# ============================================================
# 4) Dataset (ONLY <hl> cells) + labels ÏïàÏ†ïÌôî (NaN Î∞©ÏßÄ ÌïµÏã¨)
# ============================================================
class StructureT5Dataset(Dataset):
    def __init__(self, df, tokenizer, header2id, max_len=512, tgt_max_len=128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.header2id = header2id
        self.max_len = max_len
        self.tgt_max_len = tgt_max_len

        self.hl_start = tokenizer.convert_tokens_to_ids("<hl>")
        self.hl_end   = tokenizer.convert_tokens_to_ids("</hl>")
        if self.hl_start in [-1, None] or self.hl_end in [-1, None]:
            raise ValueError("TokenizerÏóê <hl>, </hl>Ïù¥ Îì±Î°ùÎêòÏßÄ ÏïäÏïòÏäµÎãàÎã§.")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = self.df.iloc[idx]
        parsed = parse_totto_string(item["input"])

        context = f"Page: {parsed['page']} | Section: {parsed['section']} | Text: {parsed.get('text','')} "

        cell_texts = []
        map_list = []
        for c in parsed["cells"]:
            val = (c["val"] or "").strip()
            if val == "" or val.lower() == "none":
                continue
            cell_texts.append(f"<hl> {val} </hl>")
            map_list.append((self.header2id.get(c["row"], 0), self.header2id.get(c["col"], 0)))

        if len(cell_texts) == 0:
            cell_texts = ["<hl> None </hl>"]
            map_list = [(0, 0)]

        full_text = context + " ".join(cell_texts)

        tok = self.tokenizer(
            full_text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        input_ids = tok["input_ids"].squeeze(0)
        attention_mask = tok["attention_mask"].squeeze(0)

        row_ids = torch.zeros_like(input_ids)
        col_ids = torch.zeros_like(input_ids)

        cell_idx = 0
        in_hl = False
        ids_list = input_ids.tolist()
        for i, tid in enumerate(ids_list):
            if tid == self.hl_start:
                in_hl = True
                continue
            if tid == self.hl_end:
                in_hl = False
                cell_idx += 1
                continue
            if in_hl and cell_idx < len(map_list):
                row_ids[i] = map_list[cell_idx][0]
                col_ids[i] = map_list[cell_idx][1]

        # ---- labels: Îπà/NaN ÌÉÄÍ≤ü Î∞©ÏßÄ ----
        target = item["target"]
        if target is None or (isinstance(target, float) and pd.isna(target)):
            target = "."
        elif not isinstance(target, str):
            target = str(target)
        target = target.strip()
        if len(target) == 0 or target.lower() == "nan":
            target = "."

        try:
            labels = self.tokenizer(
                text_target=target,
                max_length=self.tgt_max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )["input_ids"].squeeze(0)
        except TypeError:
            labels = self.tokenizer(
                target,
                max_length=self.tgt_max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )["input_ids"].squeeze(0)

        # ÌòπÏãúÎùºÎèÑ Ï†ÑÎ∂Ä padÎ©¥ EOS Ìïú ÌÜ†ÌÅ∞ Í∞ïÏ†ú
        if (labels != self.tokenizer.pad_token_id).sum().item() == 0:
            eos = self.tokenizer.eos_token_id if self.tokenizer.eos_token_id is not None else 1
            labels[0] = eos

        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids.long(),
            "attention_mask": attention_mask.long(),
            "row_ids": row_ids.long(),
            "col_ids": col_ids.long(),
            "labels": labels.long(),
        }

# ============================================================
# 5) Model: word + delta  (NO custom from_pretrained)
# ============================================================
class SafeStructureT5(T5ForConditionalGeneration):
    def __init__(self, config, header_init_matrix=None, max_scale=0.05, delta_l2=5e-4):
        super().__init__(config)
        self.max_scale = float(max_scale)
        self.delta_l2 = float(delta_l2)

        if header_init_matrix is not None:
            self.row_header_embeddings = nn.Embedding.from_pretrained(header_init_matrix, freeze=False)
            self.col_header_embeddings = nn.Embedding.from_pretrained(header_init_matrix, freeze=False)
        else:
            self.row_header_embeddings = nn.Embedding(4096, config.d_model)
            self.col_header_embeddings = nn.Embedding(4096, config.d_model)

        self.row_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.col_proj = nn.Linear(config.d_model, config.d_model, bias=False)
        self.delta_ln = nn.LayerNorm(config.d_model)
        self.alpha = nn.Parameter(torch.tensor(0.0))

        nn.init.zeros_(self.row_proj.weight)
        nn.init.zeros_(self.col_proj.weight)

    def forward(self, input_ids=None, attention_mask=None, labels=None, row_ids=None, col_ids=None, **kwargs):
        kwargs.pop("num_items_in_batch", None)

        word_embeds = self.shared(input_ids)

        if row_ids is not None and col_ids is not None:
            row_embeds = self.row_header_embeddings(row_ids)
            col_embeds = self.col_header_embeddings(col_ids)

            row_embeds = row_embeds.masked_fill((row_ids == 0).unsqueeze(-1), 0.0)
            col_embeds = col_embeds.masked_fill((col_ids == 0).unsqueeze(-1), 0.0)

            delta = self.row_proj(row_embeds) + self.col_proj(col_embeds)
            delta = self.delta_ln(delta)

            scale = torch.tanh(self.alpha) * self.max_scale
            inputs_embeds = word_embeds + scale * delta
        else:
            delta = None
            scale = None
            inputs_embeds = word_embeds

        outputs = super().forward(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )

        # delta Í∑úÏ†ú
        if (self.delta_l2 > 0.0) and (delta is not None) and (labels is not None):
            mask = ((row_ids != 0) | (col_ids != 0)).float().unsqueeze(-1)
            if mask.sum() > 0:
                reg = ((scale * delta) ** 2 * mask).sum() / mask.sum()
                outputs.loss = outputs.loss + self.delta_l2 * reg

        return outputs

# ============================================================
# 6) NaN guard callback (NaN ÎÇòÏò§Î©¥ Î∞îÎ°ú Ï§ëÎã®)
# ============================================================
class NanGuardCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        # TrainerÍ∞Ä ÎÑòÍ≤®Ï£ºÎäî lossÎ•º ÏßÅÏ†ë Î∞õÍ∏∞Îäî Ïñ¥Î†§ÏõåÏÑú,
        # log Ïù¥Î≤§Ìä∏ÏóêÏÑú Ï≤¥ÌÅ¨ÌïòÎäî Í≤å Í∞ÄÏû• ÍπîÎÅîÌï®
        return control

    def on_log(self, args, state, control, logs=None, **kwargs):
        if not logs:
            return
        for k, v in logs.items():
            if isinstance(v, float) and (np.isnan(v) or np.isinf(v)):
                raise RuntimeError(f"[NaN/Inf detected] step={state.global_step} {k}={v}")
        return control

# ============================================================
# 7) Paths (EDIT)
# ============================================================
TRAIN_PATH = "/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/totto_preprocessed_train.json"
VAL_PATH   = "/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/totto_preprocessed_dev.json"
TEST_PATH  = "/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/totto_preprocessed_test.json"

OUTPUT_DIR = "/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/safe_structure_t5_word_delta_BIGVAL"
os.makedirs(OUTPUT_DIR, exist_ok=True)
HEADER_CACHE = os.path.join(OUTPUT_DIR, "header_cache.pkl")

# ============================================================
# 8) Load + Clean (NaN/loss Íπ®Ïßê Î∞©ÏßÄ)
# ============================================================
print(">>> Loading data...")
train_df = pd.read_json(TRAIN_PATH).dropna(subset=["input", "target"]).copy()
val_df   = pd.read_json(VAL_PATH).dropna(subset=["input", "target"]).copy()

train_df = train_df[train_df["target"].astype(str).str.strip().str.len() > 0].copy()
val_df   = val_df[val_df["target"].astype(str).str.strip().str.len() > 0].copy()

train_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

print("Train:", len(train_df), "Val:", len(val_df))

# ============================================================
# 9) Tokenizer (+ <hl> tokens)
# ============================================================
tokenizer = AutoTokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens({"additional_special_tokens": ["<hl>", "</hl>"]})

# ============================================================
# 10) Header cache (ÌÅ∞ Îç∞Ïù¥ÌÑ∞Î©¥ ÌïÑÏàòÎ°ú Ï∫êÏãú!)
# ============================================================
header2id, init_matrix = prepare_semantic_headers_cached(train_df, HEADER_CACHE, base_model_name="t5-base")

# ============================================================
# 11) Dataset
# ============================================================
train_dataset = StructureT5Dataset(train_df, tokenizer, header2id)
val_dataset   = StructureT5Dataset(val_df, tokenizer, header2id)

# ============================================================
# 12) Model build + load base T5 weights safely
# ============================================================
config = T5Config.from_pretrained("t5-base")
model = SafeStructureT5(config, header_init_matrix=init_matrix, max_scale=0.05, delta_l2=5e-4)

base = T5ForConditionalGeneration.from_pretrained("t5-base")
model.load_state_dict(base.state_dict(), strict=False)
model.resize_token_embeddings(len(tokenizer))

# speed options
model.gradient_checkpointing_enable()
model.config.use_cache = False
model.to(device)

# ============================================================
# 13) TrainingArguments (ÎåÄÍ∑úÎ™® VALÏóê ÎßûÍ≤å: "epoch ÌèâÍ∞Ä" + print ÏµúÏÜåÌôî)
# ============================================================
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8

training_args = build_seq2seq_args(
    output_dir=OUTPUT_DIR,

    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,

    learning_rate=1e-4,
    warmup_ratio=0.06,
    lr_scheduler_type="linear",
    weight_decay=0.01,
    max_grad_norm=1.0,
    label_smoothing_factor=0.1,

    num_train_epochs=5,

    # ‚úÖ ÎäêÎ¶∞ evalÏùÑ ÏûêÏ£º ÌïòÏßÄ ÏïäÎèÑÎ°ù epoch Îã®ÏúÑÎ°ú!
    evaluation_strategy="epoch",
    save_strategy="epoch",

    # ‚úÖ ÌîÑÎ¶∞Ìä∏/Î°úÍ∑∏ ÏµúÏÜåÌôî
    logging_strategy="epoch",   # (ÏßÄÏõê Î≤ÑÏ†ÑÏù¥Î©¥) epochÏóê Ìïú Î≤àÎßå Î°úÍ∑∏
    # logging_steps=999999,      # logging_strategy ÎØ∏ÏßÄÏõêÏù¥Î©¥ Ïù¥ Ï§ÑÏùÑ ÏÇ¨Ïö©

    # ‚úÖ ÌïôÏäµ Ï§ëÏóêÎäî ÏÉùÏÑ±/metric Í≥ÑÏÇ∞ ÏïàÌï® (ÏÜçÎèÑ ÌïµÏã¨)
    predict_with_generate=False,

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    remove_unused_columns=False,
    report_to="none",

    fp16=not use_bf16,
    bf16=use_bf16,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

# ============================================================
# 14) Trainer (compute_metrics=None: ÌïôÏäµ Ï§ë metrics Í≥ÑÏÇ∞ Ï†úÍ±∞)
# ============================================================
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=None,              # ‚úÖ ÌïôÏäµ Ï§ë metric Í≥ÑÏÇ∞ Ï†úÍ±∞
    callbacks=[NanGuardCallback()],    # ‚úÖ NaN ÎÇòÏò§Î©¥ Ï¶âÏãú Ï§ëÎã®
)

print(">>> Starting Training...")
trainer.train()

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(">>> Training done. Best checkpoint loaded (by eval_loss).")

# ============================================================
# 15) Optional: Test generation (120kÎäî Ïò§Îûò Í±∏Î¶¨ÎØÄÎ°ú ÌååÏùºÎ°ú Ï†ÄÏû•)
# ============================================================
if os.path.exists(TEST_PATH):
    test_df = pd.read_json(TEST_PATH).dropna(subset=["input"]).copy()
    test_df.reset_index(drop=True, inplace=True)

    test_dataset = StructureT5Dataset(
        # testÏóê targetÏù¥ ÏóÜÎã§Î©¥ ÎçîÎØ∏Î°ú ÎßåÎì§Ïñ¥ÎèÑ ÎêòÎäîÎç∞,
        # ÌòÑÏû¨ classÎäî targetÏùÑ Ï∞∏Ï°∞ÌïòÎØÄÎ°ú testÏóêÎèÑ target Ïª¨ÎüºÏù¥ ÌïÑÏöîÌï©ÎãàÎã§.
        # ÎßåÏïΩ test jsonÏóê targetÏù¥ ÏóÜÎã§Î©¥ ÏïÑÎûòÏ≤òÎüº ÎçîÎØ∏Î•º Ï∂îÍ∞ÄÌïòÏÑ∏Ïöî:
        test_df.assign(target="."),
        tokenizer,
        header2id
    )

    # ÏÉùÏÑ± ÏÑ§Ï†ï (ÌïÑÏöîÌïòÎ©¥ beam Ï§ÑÏù¥Î©¥ ÏÜçÎèÑ‚Üë)
    model.generation_config.num_beams = 4
    model.generation_config.max_length = 128
    model.generation_config.no_repeat_ngram_size = 3

    # 120k Ï†ÑÎ∂Ä ÏÉùÏÑ±ÏùÄ Ïò§Îûò Í±∏Î¶º. Í∑∏ÎûòÎèÑ ÌïòÎ†§Î©¥ ÏïÑÎûò Ïã§Ìñâ.
    print(">>> Generating TEST predictions (this can be slow for 120k)...")
    preds = trainer.predict(test_dataset, max_length=128, num_beams=4)
    test_preds = tokenizer.batch_decode(preds.predictions, skip_special_tokens=True)
    test_preds = [p.strip() for p in test_preds]

    out_path = os.path.join(OUTPUT_DIR, "test_predictions.txt")
    with open(out_path, "w", encoding="utf-8") as f:
        for p in test_preds:
            f.write(p + "\n")
    print("Saved:", out_path)
else:
    print("[Info] TEST_PATH not found. Skip test generation.")


Device: cuda
>>> Loading data...
Train: 120069 Val: 22293


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

>>> Scanning headers from data...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 120069/120069 [00:01<00:00, 62663.18it/s]


>>> Building Embedding Matrix (first time only)...


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 37333/37333 [00:07<00:00, 5296.87it/s]


>>> Saved header cache: /content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/safe_structure_t5_word_delta_BIGVAL/header_cache.pkl
>>> Starting Training...


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Epoch,Training Loss,Validation Loss
1,5.5026,2.439723
2,4.8921,2.405067
3,4.7446,2.376581
4,4.6555,2.35152
5,4.6005,2.357511


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


>>> Training done. Best checkpoint loaded (by eval_loss).
[Info] TEST_PATH not found. Skip test generation.


In [7]:
import os
import torch

# 1. Í≤ΩÎ°ú Ïû¨ÏÑ§Ï†ï (Ïò§ÌÉÄ Î∞©ÏßÄ)
# Î∞ïÏÇ¨ÎãòÏù¥ ÏÑ§Ï†ïÌïòÏã† Í≤ΩÎ°ú Í∑∏ÎåÄÎ°úÏûÖÎãàÎã§.
SAVE_PATH = "/content/drive/MyDrive/·ÑÜ·Ö•·Ü∫·Ñâ·Ö° NLP ·Ñå·Ö°·Ñã·Öß·Ü´·Ñã·Ö•·Ñé·Ö•·ÑÖ·Öµ/·Ñê·Öµ·Ü∑·Ñë·Ö≥·ÑÖ·Ö©·Ñå·Ö¶·Ü®·Ñê·Ö≥_2"

print(f">>> Ï†ÄÏû• Í≤ΩÎ°ú ÌôïÏù∏: {SAVE_PATH}")
os.makedirs(SAVE_PATH, exist_ok=True)

# 2. Î©îÎ™®Î¶¨Ïóê Î™®Îç∏Ïù¥ ÏûàÎäîÏßÄ ÌôïÏù∏
try:
    if 'trainer' in locals():
        print("‚úÖ TrainerÍ∞Ä Î©îÎ™®Î¶¨Ïóê ÏûàÏäµÎãàÎã§. Î™®Îç∏ÏùÑ Ï†ÄÏû•Ìï©ÎãàÎã§...")
        trainer.save_model(SAVE_PATH)
        tokenizer.save_pretrained(SAVE_PATH)
        print(">>> [ÏÑ±Í≥µ] TrainerÎ•º ÌÜµÌï¥ Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Ï†ÄÏû• ÏôÑÎ£å!")

    elif 'model' in locals() and 'tokenizer' in locals():
        print("‚úÖ TrainerÎäî ÏóÜÏßÄÎßå model Î≥ÄÏàòÍ∞Ä ÏûàÏäµÎãàÎã§. ÏàòÎèô Ï†ÄÏû•Ìï©ÎãàÎã§...")
        model.save_pretrained(SAVE_PATH)
        tokenizer.save_pretrained(SAVE_PATH)
        print(">>> [ÏÑ±Í≥µ] model.save_pretrained()Î°ú Ï†ÄÏû• ÏôÑÎ£å!")

    else:
        print("‚ùå Î©îÎ™®Î¶¨Ïóê Î™®Îç∏ Î≥ÄÏàò(trainer, model)Í∞Ä ÏóÜÏäµÎãàÎã§. Îü∞ÌÉÄÏûÑÏù¥ Ï¥àÍ∏∞ÌôîÎêú Í≤É Í∞ôÏäµÎãàÎã§.")

except Exception as e:
    print(f"‚ùå Ï†ÄÏû• Ï§ë ÏóêÎü¨ Î∞úÏÉù: {e}")

# 3. ÌååÏùºÏù¥ ÏßÑÏßú ÏÉùÍ≤ºÎäîÏßÄ ÎààÏúºÎ°ú ÌôïÏù∏
print("\n>>> üìÇ Ï†ÄÏû•Îêú ÌååÏùº Î™©Î°ù ÌôïÏù∏:")
if os.path.exists(SAVE_PATH):
    files = os.listdir(SAVE_PATH)
    for f in files:
        print(f" - {f}")

    if "config.json" in files and "model.safetensors" in files: # ÎòêÎäî pytorch_model.bin
        print("\nüéâ ÏïàÏã¨ÌïòÏÑ∏Ïöî! Î™®Îç∏ ÌååÏùºÏù¥ Ï†ïÏÉÅÏ†ÅÏúºÎ°ú Î≥¥ÏûÖÎãàÎã§.")
    else:
        print("\n‚ö†Ô∏è Í≤ΩÍ≥†: ÌïµÏã¨ ÌååÏùº(config.json, model.safetensors)Ïù¥ Ïïà Î≥¥ÏûÖÎãàÎã§.")
else:
    print("‚ùå Ìè¥ÎçîÍ∞Ä Ïó¨Ï†ÑÌûà ÎπÑÏñ¥ÏûàÏäµÎãàÎã§.")

>>> Ï†ÄÏû• Í≤ΩÎ°ú ÌôïÏù∏: /content/drive/MyDrive/·ÑÜ·Ö•·Ü∫·Ñâ·Ö° NLP ·Ñå·Ö°·Ñã·Öß·Ü´·Ñã·Ö•·Ñé·Ö•·ÑÖ·Öµ/·Ñê·Öµ·Ü∑·Ñë·Ö≥·ÑÖ·Ö©·Ñå·Ö¶·Ü®·Ñê·Ö≥_2
‚úÖ TrainerÍ∞Ä Î©îÎ™®Î¶¨Ïóê ÏûàÏäµÎãàÎã§. Î™®Îç∏ÏùÑ Ï†ÄÏû•Ìï©ÎãàÎã§...
>>> [ÏÑ±Í≥µ] TrainerÎ•º ÌÜµÌï¥ Î™®Îç∏Í≥º ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Ï†ÄÏû• ÏôÑÎ£å!

>>> üìÇ Ï†ÄÏû•Îêú ÌååÏùº Î™©Î°ù ÌôïÏù∏:
 - totto_data
 - ToTTo_·ÑÇ·Ö©·Ü´·ÑÜ·ÖÆ·Ü´.pdf
 - ToTTo Model Architecture IDEA.gdoc
 - totto_preprocessed_dev.json
 - totto_preprocessed_train.json
 - safe_structure_t5_final
 - safe_structure_t5_sampled
 - safe_structure_t5_word_delta_BIGVAL
 - config.json
 - generation_config.json
 - model.safetensors
 - tokenizer_config.json
 - special_tokens_map.json
 - added_tokens.json
 - spiece.model
 - tokenizer.json
 - training_args.bin

üéâ ÏïàÏã¨ÌïòÏÑ∏Ïöî! Î™®Îç∏ ÌååÏùºÏù¥ Ï†ïÏÉÅÏ†ÅÏúºÎ°ú Î≥¥ÏûÖÎãàÎã§.


In [15]:
from collections import Counter
import numpy as np
import sacrebleu

# =========================================================
# üõ†Ô∏è [ÏàòÏ†ï ÏôÑÎ£å] get_ngrams Ìï®Ïàò (Î≥ÄÏàòÎ™Ö Ï∂©Îèå Ìï¥Í≤∞)
# =========================================================
def get_ngrams(text, max_n=4):
    tokens = text.split()
    ngrams = []
    # k: ÌòÑÏû¨ n-gram Í∏∏Ïù¥ (1~4)
    for k in range(1, max_n + 1):
        # i: ÏãúÏûë ÏúÑÏπò
        for i in range(len(tokens) - k + 1):
            ngrams.append(tuple(tokens[i : i + k]))
    return Counter(ngrams)

# =========================================================
# Ï†êÏàò Í≥ÑÏÇ∞ (predictions, referencesÎäî Ïù¥ÎØ∏ Î©îÎ™®Î¶¨Ïóê ÏûàÏùå)
# =========================================================
print(">>> Ï†êÏàò Í≥ÑÏÇ∞ Ïû¨ÏãúÎèÑ...")

if 'predictions' not in locals() or 'references' not in locals():
    print("‚ùå [Ïò§Î•ò] predictions Î≥ÄÏàòÍ∞Ä ÏóÜÏäµÎãàÎã§. ÏúÑÏ™ΩÏùò ÏÉùÏÑ± ÏΩîÎìúÎ•º Î®ºÏ†Ä Ïã§ÌñâÌñàÎäîÏßÄ ÌôïÏù∏Ìï¥Ï£ºÏÑ∏Ïöî.")
else:
    # 1. BLEU Score
    bleu = sacrebleu.corpus_bleu(predictions, [[r] for r in references])

    # 2. PARENT Score
    parent_scores = []

    # Îç∞Ïù¥ÌÑ∞ Í∞úÏàò ÎßûÏ∂îÍ∏∞ (ÌòπÏãú Î™®Î•º Ïù∏Îç±Ïä§ ÏóêÎü¨ Î∞©ÏßÄ)
    limit = min(len(predictions), len(references), len(val_df))

    # DataFrame ÏàúÌöå
    for i in range(limit):
        pred = predictions[i]
        ref = references[i]
        row = val_df.iloc[i]

        # ÌÖåÏù¥Î∏î ÌÖçÏä§Ìä∏ Ïû¨Íµ¨ÏÑ±
        parsed = parse_totto_string(row['input'])
        table_text = f"{parsed['page']} {parsed['section']} " + " ".join([c['val'] for c in parsed['cells']])

        # N-gram Ï∂îÏ∂ú
        pred_ngrams = get_ngrams(pred)
        ref_ngrams = get_ngrams(ref)
        table_ngrams = get_ngrams(table_text)

        # Precision
        overlap = sum((count for ngram, count in pred_ngrams.items() if ngram in ref_ngrams or ngram in table_ngrams))
        prec = overlap / (sum(pred_ngrams.values()) + 1e-9)

        # Recall
        overlap_ref = sum((count for ngram, count in ref_ngrams.items() if ngram in pred_ngrams))
        rec = overlap_ref / (sum(ref_ngrams.values()) + 1e-9)

        # F1 Score
        f1 = 2 * prec * rec / (prec + rec + 1e-9)
        parent_scores.append(f1)

    parent_f1 = np.mean(parent_scores) * 100

    # ---------------------------------------------------------
    # ÏµúÏ¢Ö Í≤∞Í≥º Ï∂úÎ†•
    # ---------------------------------------------------------
    print("\n" + "="*40)
    print(f"üìä ÏµúÏ¢Ö ÌèâÍ∞Ä Í≤∞Í≥º (Delta Injection Ï†ÅÏö©Îê®)")
    print(f"‚úÖ BLEU Score: {bleu.score:.2f}")
    print(f"‚úÖ PARENT F1:  {parent_f1:.2f}")
    print("="*40)

>>> Ï†êÏàò Í≥ÑÏÇ∞ Ïû¨ÏãúÎèÑ...

üìä ÏµúÏ¢Ö ÌèâÍ∞Ä Í≤∞Í≥º (Delta Injection Ï†ÅÏö©Îê®)
‚úÖ BLEU Score: 29.15
‚úÖ PARENT F1:  36.37
