In [None]:
!pip install evaluate sacrebleu
import json
import re
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
from tqdm import tqdm
import sacrebleu

from transformers import (
    AutoTokenizer,
    T5ForConditionalGeneration,
    T5Config,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)

# ==========================================
# 0. Environment setup and seed fixing
# ==========================================
def set_seed(seed):
    """Seed the Python, NumPy and torch random number generators.

    Args:
        seed: Integer seed passed unchanged to every generator.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing more to do on CPU-only machines.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed the Python, NumPy and torch RNGs with a fixed value.
set_seed(42)
# Use the GPU when torch reports one as available; otherwise run on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f">>> ÌôòÍ≤Ω ÏÑ§Ï†ï ÏôÑÎ£å. Device: {device}")

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m100.8/100.8 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl 

In [None]:
# ==========================================
# 1. Evaluation metric functions (PARENT & BLEU)
# ==========================================
def get_ngrams(segment, max_order):
    """Count all n-grams of order 1..max_order in a token sequence.

    Args:
        segment: Sliceable sequence of tokens.
        max_order: Largest n-gram length to extract.

    Returns:
        A ``Counter`` mapping each n-gram (as a tuple) to its number of
        occurrences. Orders longer than the sequence contribute nothing.
    """
    return Counter(
        tuple(segment[start:start + size])
        for size in range(1, max_order + 1)
        for start in range(len(segment) - size + 1)
    )

def parent_score(predictions, references, tables):
    """Compute a PARENT-style precision / recall / F1 averaged over examples.

    Args:
        predictions: List of predicted sentences (strings).
        references: List (one entry per prediction) of lists of reference
            sentences (strings).
        tables: List (one entry per prediction) of iterables of table tokens;
            each token is converted with ``str``.

    Returns:
        Tuple ``(precision, recall, f1)``, each the mean over
        ``len(predictions)`` examples. Returns ``(0.0, 0.0, 0.0)`` when
        ``predictions`` is empty instead of dividing by zero.

    Per example, using whitespace tokens and n-grams of order 1..4:
      * precision: each predicted n-gram is weighted 1 if it also occurs as an
        n-gram of the table token list, otherwise by the best
        ``min(1, ref_count / pred_count)`` over the references.
      * recall: for each reference, the clipped count of its n-grams that
        occur in both the table and the prediction, divided by the total
        number of reference n-grams; the best reference is kept.
      * f1: harmonic mean of the two (0 when both are 0).
    """
    n = len(predictions)
    if n == 0:
        # Nothing to score; avoid ZeroDivisionError in the final averaging.
        return 0.0, 0.0, 0.0

    total_precision, total_recall, total_f1 = 0.0, 0.0, 0.0
    max_order = 4
    smoothing = 1e-13  # keeps denominators non-zero for empty token lists

    for pred, refs, table in zip(predictions, references, tables):
        pred_tokens = pred.strip().split()
        ref_tokens_list = [r.strip().split() for r in refs]
        table_tokens = [str(t) for t in table]

        pred_ngrams = get_ngrams(pred_tokens, max_order)
        ref_ngrams_list = [get_ngrams(ref, max_order) for ref in ref_tokens_list]
        table_ngrams = get_ngrams(table_tokens, max_order)

        # Precision
        numerator_prec = 0.0
        denominator_prec = sum(pred_ngrams.values()) + smoothing
        for ngram, count in pred_ngrams.items():
            prob_in_table = 1.0 if ngram in table_ngrams else 0.0
            prob_in_ref = 0.0
            for ref_ngrams in ref_ngrams_list:
                prob_in_ref = max(prob_in_ref, min(1.0, ref_ngrams.get(ngram, 0) / count))
            # Table support wins outright; otherwise fall back to reference support.
            w_prob = prob_in_table + prob_in_ref * (1.0 - prob_in_table)
            numerator_prec += count * w_prob
        precision = numerator_prec / denominator_prec

        # Recall (best over the available references)
        best_recall = 0.0
        for ref_ngrams in ref_ngrams_list:
            numerator_rec = 0.0
            denominator_rec = sum(ref_ngrams.values()) + smoothing
            for ngram, count in ref_ngrams.items():
                # Only n-grams grounded in the table AND reproduced by the prediction count.
                if ngram in table_ngrams and ngram in pred_ngrams:
                    numerator_rec += min(count, pred_ngrams[ngram])
            best_recall = max(best_recall, numerator_rec / denominator_rec)
        recall = best_recall

        f1 = 0.0
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)

        total_precision += precision
        total_recall += recall
        total_f1 += f1

    return total_precision / n, total_recall / n, total_f1 / n

def compute_metrics(eval_preds):
    """Decode generated ids and labels and report corpus-level BLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays as passed by
            the trainer; ``preds`` may be a tuple whose first item holds the ids.

    Returns:
        ``{"bleu": <sacrebleu corpus BLEU score>}``.

    Relies on the module-level ``tokenizer`` for decoding.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 cannot be decoded. It marks ignored label positions, and generated
    # ids can also carry it as padding, so map it to the pad token in both arrays.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = [text.strip() for text in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [text.strip() for text in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    # During training only the quick BLEU check is reported.
    # sacrebleu takes references as a list of reference *streams*, each stream
    # aligned with the hypotheses: one reference per sample -> [all_references].
    # Passing [[ref_1], [ref_2], ...] instead does not line each reference up
    # with its own hypothesis.
    result = sacrebleu.corpus_bleu(decoded_preds, [decoded_labels])
    return {"bleu": result.score}

In [None]:
# ==========================================
# 2. Model definition: SafeStructureT5 (Concat + Identity Init)
# ==========================================
class SafeStructureT5(T5ForConditionalGeneration):
    """T5 whose encoder input is the word embedding fused with header embeddings.

    For every input token the word embedding is concatenated with a row-header
    and a column-header embedding, projected back to ``d_model`` by a bias-free
    linear layer and layer-normalized before being fed to the encoder.

    The same fusion is applied in ``forward`` (training / evaluation loss) and
    in ``generate`` (inference), so the encoder sees identical inputs in both.
    """

    # Header-table size used only when neither an init matrix nor a size stored
    # in the config is available.
    DEFAULT_NUM_HEADERS = 2048

    def __init__(self, config, header_init_matrix=None):
        """
        Args:
            config: T5 configuration.
            header_init_matrix: Optional ``(num_headers, d_model)`` float tensor
                used to initialize both header embedding tables. Row 0 is the
                padding / "no header" entry.
        """
        super().__init__(config)

        # 1. Header embeddings (padding_idx=0 handles the "None" header).
        if header_init_matrix is not None:
            # Clone per table: from_pretrained wraps the given tensor directly,
            # so without clones both tables would share one storage.
            self.row_header_embeddings = nn.Embedding.from_pretrained(
                header_init_matrix.clone(), freeze=False, padding_idx=0
            )
            self.col_header_embeddings = nn.Embedding.from_pretrained(
                header_init_matrix.clone(), freeze=False, padding_idx=0
            )
            # Remember the table size in the config so a saved checkpoint can be
            # re-created with matching shapes without the init matrix.
            config.num_structure_headers = int(header_init_matrix.size(0))
        else:
            num_headers = getattr(config, "num_structure_headers", self.DEFAULT_NUM_HEADERS)
            self.row_header_embeddings = nn.Embedding(num_headers, config.d_model, padding_idx=0)
            self.col_header_embeddings = nn.Embedding(num_headers, config.d_model, padding_idx=0)

        # 2. Dimension reduction (Concat: Word + Row + Col -> original dim).
        self.feature_projection = nn.Linear(config.d_model * 3, config.d_model, bias=False)
        self.norm = nn.LayerNorm(config.d_model)  # normalization for training stability

        # 3. Identity initialization: structure information starts switched off.
        # NOTE(review): from_pretrained reports these modules as "newly
        # initialized"; confirm after loading that the identity/zero pattern
        # below is still in place for the installed transformers version.
        self._init_projection_weights()

    def _init_projection_weights(self):
        """Make the projection pass word embeddings through and ignore headers."""
        hidden_dim = self.config.d_model
        with torch.no_grad():
            # Word block: identity matrix (word information passes unchanged).
            self.feature_projection.weight[:, :hidden_dim] = torch.eye(hidden_dim)
            # Row/column blocks: zero (structure is learned gradually).
            self.feature_projection.weight[:, hidden_dim:] = 0.0

    def _fuse_embeddings(self, input_ids, row_ids=None, col_ids=None):
        """Return the projected and normalized [word | row | col] embeddings.

        If either ``row_ids`` or ``col_ids`` is missing, both header parts are
        replaced by zero vectors.
        """
        word_embeddings = self.shared(input_ids)

        if row_ids is not None and col_ids is not None:
            row_embeddings = self.row_header_embeddings(row_ids)
            col_embeddings = self.col_header_embeddings(col_ids)
        else:
            row_embeddings = torch.zeros_like(word_embeddings)
            col_embeddings = torch.zeros_like(word_embeddings)

        fused_embeddings = torch.cat([word_embeddings, row_embeddings, col_embeddings], dim=-1)
        return self.norm(self.feature_projection(fused_embeddings))

    def forward(self, input_ids=None, attention_mask=None, labels=None, row_ids=None, col_ids=None, **kwargs):
        """Run T5 on fused embeddings; falls back to the stock path without ``input_ids``."""
        # [Safe Guard 1] Drop the extra argument newer trainers pass along.
        kwargs.pop("num_items_in_batch", None)

        # [Safe Guard 2] No input_ids (e.g. decoding steps that reuse encoder outputs).
        if input_ids is None:
            return super().forward(input_ids=None, attention_mask=attention_mask, labels=labels, **kwargs)

        # [Safe Guard 3] Missing row/col ids are handled inside _fuse_embeddings.
        fused_embeddings = self._fuse_embeddings(input_ids, row_ids, col_ids)

        return super().forward(
            inputs_embeds=fused_embeddings,  # embeddings are passed instead of input_ids
            attention_mask=attention_mask,
            labels=labels,
            **kwargs
        )

    @torch.no_grad()
    def generate(self, inputs=None, *args, row_ids=None, col_ids=None, **kwargs):
        """Generate with the same fused encoder inputs that ``forward`` uses.

        The stock ``generate`` encodes ``input_ids`` by calling the encoder
        directly, which would skip the header fusion, projection and layer
        norm. Here the encoder is run on the fused embeddings first and the
        result is handed to the parent ``generate`` as ``encoder_outputs``.

        Args:
            inputs / ``input_ids`` (keyword): source token ids.
            row_ids, col_ids: optional header ids aligned with the source tokens.
            *args, **kwargs: forwarded to the parent ``generate``.
        """
        input_ids = kwargs.get("input_ids")
        if input_ids is None:
            input_ids = inputs

        already_encoded = (
            kwargs.get("encoder_outputs") is not None
            or kwargs.get("inputs_embeds") is not None
        )
        if input_ids is None or already_encoded:
            # Nothing to fuse (or the caller did the encoding); use the stock path.
            return super().generate(inputs, *args, **kwargs)

        kwargs.pop("input_ids", None)
        kwargs.pop("inputs_embeds", None)  # only reachable when it is None
        fused_embeddings = self._fuse_embeddings(input_ids, row_ids, col_ids)
        kwargs["encoder_outputs"] = self.encoder(
            inputs_embeds=fused_embeddings,
            attention_mask=kwargs.get("attention_mask"),
            return_dict=True,
        )
        return super().generate(None, *args, **kwargs)

In [None]:
# ==========================================
# 3. Data processing utilities
# ==========================================
def parse_totto_string(input_str):
    """Extract page title, section title and highlighted cells from a tagged string.

    Args:
        input_str: Text using ``[PAGE]``, ``[SEC]``, ``[TEXT]``, ``[CELL]``,
            ``[TYPE]``, ``[R_HEAD]`` and ``[C_HEAD]`` markers.

    Returns:
        ``{'page': str, 'section': str, 'cells': [{'val', 'row', 'col'}, ...]}``
        with every value stripped of surrounding whitespace. Missing page or
        section markers yield empty strings.
    """
    def _first_group(pattern):
        # Text captured by the first match of `pattern`, or "" when absent.
        found = re.search(pattern, input_str)
        return found.group(1).strip() if found else ""

    # Metadata runs from its marker up to the next known marker (or end of string).
    page_title = _first_group(r'\[PAGE\](.*?)(?=\[SEC\]|\[TEXT\]|\[CELL\]|$)')
    section_title = _first_group(r'\[SEC\](.*?)(?=\[TEXT\]|\[CELL\]|$)')

    # Cell layout: [CELL] value [TYPE] ... [R_HEAD] value [C_HEAD] value
    cell_regex = r'\[CELL\](.*?)\[TYPE\].*?\[R_HEAD\](.*?)\[C_HEAD\](.*?)(?=\[CELL\]|$)'
    cells = [
        {'val': value.strip(), 'row': row_head.strip(), 'col': col_head.strip()}
        for value, row_head, col_head in re.findall(cell_regex, input_str)
    ]
    return {'page': page_title, 'section': section_title, 'cells': cells}

def prepare_semantic_headers(df, base_model_name="t5-base"):
    """Build the header vocabulary and an embedding init matrix for it.

    Args:
        df: DataFrame with an ``'input'`` column of strings accepted by
            ``parse_totto_string``.
        base_model_name: Checkpoint whose tokenizer and input embeddings are
            used to embed the header texts.

    Returns:
        ``(header2id, init_matrix)``: a dict mapping header text to row index
        (``"None"`` is always index 0) and a ``(num_headers, d_model)`` tensor
        whose row ``i`` is the mean input embedding of header ``i``'s tokens.
        Row 0 and headers that tokenize to nothing stay all-zero.
    """
    print(">>> Ìó§Îçî Ï†ïÎ≥¥ Ïä§Ï∫î Î∞è ÏûÑÎ≤†Îî© Îß§Ìä∏Î¶≠Ïä§ ÏÉùÏÑ± Ï§ë...")

    # Gather every distinct row/column header seen in the data.
    seen_headers = {"None"}
    for raw_input in tqdm(df['input']):
        for cell in parse_totto_string(raw_input)['cells']:
            seen_headers.update((cell['row'], cell['col']))

    # "None" is pinned to id 0; the rest follow in sorted order.
    ordered_headers = ["None"]
    ordered_headers.extend(sorted(seen_headers - {"None"}))
    header2id = dict(zip(ordered_headers, range(len(ordered_headers))))

    # Embed header texts with the base model's input embeddings (semantic init).
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    temp_model = T5ForConditionalGeneration.from_pretrained(base_model_name)
    wte = temp_model.get_input_embeddings()

    init_matrix = torch.zeros((len(ordered_headers), temp_model.config.d_model))

    print(">>> ÏùòÎØ∏Î°†Ï†Å Ìó§Îçî ÏûÑÎ≤†Îî© Ï¥àÍ∏∞Ìôî Ï§ë...")
    with torch.no_grad():
        for row_index, header_text in enumerate(tqdm(ordered_headers)):
            if row_index == 0:
                # "None" keeps the zero vector.
                continue
            token_ids = tokenizer.encode(header_text, add_special_tokens=False)
            if not token_ids:
                continue
            token_vectors = wte(torch.tensor([token_ids]))
            init_matrix[row_index] = token_vectors.mean(dim=1).squeeze(0)

    return header2id, init_matrix

class StructureT5Dataset(Dataset):
    """Dataset producing tokenized inputs plus per-token row/column header ids.

    Each item is a dict with ``input_ids``, ``attention_mask``, ``row_ids``,
    ``col_ids`` (all of length ``max_len``) and ``labels`` (length
    ``max_target_len``, pad positions replaced by -100).
    """

    def __init__(self, dataframe, tokenizer, header2id, max_len=512, max_target_len=128):
        """
        Args:
            dataframe: DataFrame with ``'input'`` and ``'target'`` columns.
            tokenizer: Tokenizer that already contains the ``<hl>`` and
                ``</hl>`` tokens.
            header2id: Mapping from header text to header id; unknown headers
                map to 0.
            max_len: Padded / truncated source length.
            max_target_len: Padded / truncated target length.
        """
        self.data = dataframe
        self.tokenizer = tokenizer
        self.header2id = header2id
        self.max_len = max_len
        self.max_target_len = max_target_len
        self.hl_start = tokenizer.convert_tokens_to_ids("<hl>")
        self.hl_end = tokenizer.convert_tokens_to_ids("</hl>")

    def __len__(self):
        return len(self.data)

    def _build_structure_ids(self, input_ids, header_pairs):
        """Assign (row_id, col_id) to every token inside an ``<hl> ... </hl>`` span.

        Args:
            input_ids: 1-D tensor of source token ids.
            header_pairs: List of ``(row_id, col_id)``, one per highlighted
                cell, in the order the cells appear in the text.

        Returns:
            ``(row_ids, col_ids)`` tensors shaped like ``input_ids``; tokens
            outside highlighted spans (and the marker tokens themselves) get 0.
        """
        row_ids = torch.zeros_like(input_ids)
        col_ids = torch.zeros_like(input_ids)

        cell_idx = 0
        in_hl = False
        # Iterate over plain ints instead of 0-d tensors.
        for pos, token_id in enumerate(input_ids.tolist()):
            if token_id == self.hl_start:
                in_hl = True
                continue
            if token_id == self.hl_end:
                in_hl = False
                cell_idx += 1
                continue

            # Only tokens inside an <hl> span receive header information.
            if in_hl and cell_idx < len(header_pairs):
                row_ids[pos], col_ids[pos] = header_pairs[cell_idx]
        return row_ids, col_ids

    def __getitem__(self, idx):
        item = self.data.iloc[idx]
        parsed = parse_totto_string(item['input'])

        # Source text: context + "<hl> cell value </hl>" for each highlighted cell.
        context = f"Page: {parsed['page']} | Section: {parsed['section']} "
        cell_texts = []
        header_pairs = []

        for c in parsed['cells']:
            cell_texts.append(f"<hl> {c['val']} </hl>")
            # Header text -> id (0 when the header is unknown).
            header_pairs.append((self.header2id.get(c['row'], 0), self.header2id.get(c['col'], 0)))

        full_text = context + " ".join(cell_texts)

        # Tokenize the source.
        tokenized = self.tokenizer(full_text, max_length=self.max_len, padding="max_length", truncation=True, return_tensors="pt")
        input_ids = tokenized['input_ids'].squeeze(0)

        # Map structure information (row/col ids) onto token positions.
        row_ids, col_ids = self._build_structure_ids(input_ids, header_pairs)

        # Tokenize the target; `text_target` replaces the deprecated
        # `as_target_tokenizer()` context manager.
        labels = self.tokenizer(
            text_target=item['target'],
            max_length=self.max_target_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )['input_ids'].squeeze(0)
        # Pad positions must not contribute to the loss.
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": tokenized['attention_mask'].squeeze(0),
            "row_ids": row_ids,
            "col_ids": col_ids,
            "labels": labels
        }

In [None]:
# ==========================================
# 4. Execution pipeline (Main)
# ==========================================

# [Path settings]
TRAIN_PATH = '/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/totto_data/totto_preprocessed_train.json'
VAL_PATH = '/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/totto_data/totto_preprocessed_dev.json'
OUTPUT_DIR = '/content/drive/MyDrive/Î©ãÏÇ¨ NLP ÏûêÏó∞Ïñ¥Ï≤òÎ¶¨/ÌåÄÌîÑÎ°úÏ†ùÌä∏_2/safe_structure_t5_final'

print(">>> 1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è ÏÉòÌîåÎßÅ...")
train_df = pd.read_json(TRAIN_PATH)
val_df = pd.read_json(VAL_PATH)

# Sample at most 120,000 training and 7,700 validation examples.
train_df = train_df.sample(n=min(120000, len(train_df)), random_state=42)
val_df = val_df.sample(n=min(7700, len(val_df)), random_state=42)
print(f"    Train Samples: {len(train_df)}")
print(f"    Val Samples:   {len(val_df)}")

# Build the header vocabulary and its embedding init matrix from the training split.
header2id, init_matrix = prepare_semantic_headers(train_df)

# Tokenizer and model setup
print(">>> 2. Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Ï¥àÍ∏∞Ìôî...")
tokenizer = AutoTokenizer.from_pretrained("t5-base")
tokenizer.add_special_tokens({'additional_special_tokens': ['<hl>', '</hl>']})

config = T5Config.from_pretrained("t5-base")
model = SafeStructureT5.from_pretrained("t5-base", config=config, header_init_matrix=init_matrix)
model.resize_token_embeddings(len(tokenizer))  # account for the added special tokens

# Create the datasets
train_dataset = StructureT5Dataset(train_df, tokenizer, header2id)
val_dataset = StructureT5Dataset(val_df, tokenizer, header2id)

# Training arguments
print(">>> 3. Trainer ÏÑ§Ï†ï...")
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=1e-4,
    num_train_epochs=3,
    logging_steps=500,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    predict_with_generate=True,
    # Without an explicit limit, evaluation generation falls back to the
    # model's default generation length; match the 128-token label length so
    # predictions are not cut short before BLEU / PARENT are computed.
    generation_max_length=128,
    load_best_model_at_end=True,
    metric_for_best_model="bleu",
    greater_is_better=True,
    remove_unused_columns=False,        # [Important] required so row_ids / col_ids reach the model
    # bf16 is recommended on A100; enable it only where the hardware supports it.
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
    report_to="none"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100),
    compute_metrics=compute_metrics
)

# Start training
print(">>> 4. ÌïôÏäµ ÏãúÏûë!")
trainer.train()

# Save the model and tokenizer
print(f">>> 5. Î™®Îç∏ Ï†ÄÏû•: {OUTPUT_DIR}")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

>>> 1. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è ÏÉòÌîåÎßÅ...
    Train Samples: 120000
    Val Samples:   7700
>>> Ìó§Îçî Ï†ïÎ≥¥ Ïä§Ï∫î Î∞è ÏûÑÎ≤†Îî© Îß§Ìä∏Î¶≠Ïä§ ÏÉùÏÑ± Ï§ë...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 120000/120000 [00:01<00:00, 79244.93it/s]


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

>>> ÏùòÎØ∏Î°†Ï†Å Ìó§Îçî ÏûÑÎ≤†Îî© Ï¥àÍ∏∞Ìôî Ï§ë...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 37322/37322 [00:07<00:00, 4818.24it/s]


>>> 2. Î™®Îç∏ Î∞è ÌÜ†ÌÅ¨ÎÇòÏù¥Ï†Ä Ï¥àÍ∏∞Ìôî...


Some weights of SafeStructureT5 were not initialized from the model checkpoint at t5-base and are newly initialized: ['col_header_embeddings.weight', 'feature_projection.weight', 'norm.bias', 'norm.weight', 'row_header_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


>>> 3. Trainer ÏÑ§Ï†ï...
>>> 4. ÌïôÏäµ ÏãúÏûë!


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss,Bleu
500,15.5642,3.513478,11.478744
1000,14.7388,3.43151,11.478744
1500,14.4693,3.384413,9.287529
2000,14.2718,3.351873,11.478744
2500,14.1184,3.324115,11.478744
3000,14.0009,3.302693,11.478744
3500,13.919,3.286271,0.0
4000,13.761,3.272782,9.287529
4500,13.6259,3.261792,9.287529
5000,13.5663,3.249663,9.287529




Step,Training Loss,Validation Loss,Bleu
500,15.5642,3.513478,11.478744
1000,14.7388,3.43151,11.478744
1500,14.4693,3.384413,9.287529
2000,14.2718,3.351873,11.478744
2500,14.1184,3.324115,11.478744
3000,14.0009,3.302693,11.478744
3500,13.919,3.286271,0.0
4000,13.761,3.272782,9.287529
4500,13.6259,3.261792,9.287529
5000,13.5663,3.249663,9.287529




KeyboardInterrupt: 

In [None]:
# ==========================================
# 5. Final evaluation (PARENT score & sample inspection)
# ==========================================
print("\n>>> 6. ÏµúÏ¢Ö ÌèâÍ∞Ä (PARENT Score Í≥ÑÏÇ∞)...")

# Generate predictions
preds_output = trainer.predict(val_dataset)
pred_ids = preds_output.predictions
if isinstance(pred_ids, tuple):
    pred_ids = pred_ids[0]
# Generated ids can contain -100 as padding, which the tokenizer cannot
# decode; map it to the pad token first.
pred_ids = np.where(pred_ids != -100, pred_ids, tokenizer.pad_token_id)
decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
decoded_preds = [p.strip() for p in decoded_preds]

# Prepare references and table tokens for PARENT (same row order as val_dataset).
references = []
tables = []
for _, row in tqdm(val_df.iterrows(), total=len(val_df)):
    references.append([row['target'].strip()])
    parsed = parse_totto_string(row['input'])
    # The "table" is the whitespace tokens of page title, section title and cell values.
    table_tokens = []
    table_tokens.extend(parsed['page'].split())
    table_tokens.extend(parsed['section'].split())
    for cell in parsed['cells']:
        table_tokens.extend(cell['val'].split())
    tables.append(table_tokens)

precision, recall, f1 = parent_score(decoded_preds, references, tables)

print("\n" + "="*40)
print(f"üìä FINAL TEST RESULTS (Sampled Val)")
print(f"   PARENT Precision: {precision:.4f}")
print(f"   PARENT Recall:    {recall:.4f}")
print(f"   PARENT F1:        {f1:.4f}")
print("="*40)

# Inspect a few generated samples (never ask for more samples than exist).
print("\n>>> üé≤ ÏÉùÏÑ± Í≤∞Í≥º ÏÉòÌîå ÌôïÏù∏:")
indices = random.sample(range(len(decoded_preds)), min(3, len(decoded_preds)))
for i, idx in enumerate(indices):
    print(f"\n[Sample {i+1}]")
    print(f"‚ñ∂ Reference:  {references[idx][0]}")
    print(f"‚ñ∂ Prediction: {decoded_preds[idx]}")
    print("-" * 50)

# Save the metrics and the inspected samples
with open(f"{OUTPUT_DIR}/final_results.json", "w") as f:
    json.dump({
        "parent_metrics": {"precision": precision, "recall": recall, "f1": f1},
        "samples": [{"ref": references[i][0], "pred": decoded_preds[i]} for i in indices]
    }, f, indent=2)

print(">>> Î™®Îì† ÏûëÏóÖ ÏôÑÎ£å! ÏàòÍ≥†ÌïòÏÖ®ÏäµÎãàÎã§. üëç")

In [None]:
# Í≤∞Í≥º ÏÉòÌîå ÌôïÏù∏
print("\n>>> üé≤ ÏÉùÏÑ± Í≤∞Í≥º ÏÉòÌîå ÌôïÏù∏:")
indices = random.sample(range(len(decoded_preds)), 3)  # <--- Ïó¨Í∏∞ Ïà´Ïûê '3'