In [1]:
# Colab-only setup: mount Google Drive at /content/drive. The checkpoint and
# data paths used by the later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch transformers sacrebleu tqdm sentencepiece

Collecting sacrebleu
  Downloading sacrebleu-2.6.0-py3-none-any.whl.metadata (39 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.6.0-py3-none-any.whl (100 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.8/100.8 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.2.0-py3-none-any.whl (22 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.2.0 sacrebleu-2.6.0


In [3]:
import json
import torch
import collections
import sacrebleu
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ==========================================
# 1. Metric Utility Functions (PARENT & BLEU Helper)
# ==========================================
def get_ngrams(segment, max_order):
    """Count every n-gram of order 1..max_order in a token sequence.

    Args:
        segment: Sliceable sequence of tokens (e.g. a list of strings).
        max_order: Largest n-gram length to extract (inclusive).

    Returns:
        A ``collections.Counter`` mapping each n-gram (as a tuple of tokens)
        to the number of times it occurs in ``segment``. Orders longer than
        the segment contribute nothing.
    """
    return collections.Counter(
        tuple(segment[start:start + size])
        for size in range(1, max_order + 1)
        for start in range(len(segment) - size + 1)
    )

def parent_score(predictions, references, tables):
    """
    Compute corpus-averaged PARENT-style precision, recall and F1.

    PARENT evaluates table-to-text output by checking how accurately
    (precision) and how completely (recall) the generated text reflects the
    table, taking both the reference sentences and the table values into
    account.

    This implementation works on whitespace tokens and pools n-grams of
    orders 1-4 into a single counter per text. All cell values of a table are
    concatenated into one token stream before n-gram extraction, so table
    n-grams may span cell boundaries.

    NOTE(review): n-gram orders are pooled rather than averaged per order and
    table membership is an exact match, so the numbers are presumably not
    directly comparable to the official PARENT implementation - confirm
    before reporting them as such.

    Args:
        predictions: List of generated sentences (str), one per example.
        references: List (parallel to ``predictions``) of lists of reference
            sentences (str).
        tables: List (parallel to ``predictions``) of lists of table cell
            values; each value is converted with ``str`` and whitespace-split.

    Returns:
        Tuple ``(precision, recall, f1)`` of floats in [0, 1], each averaged
        over ``len(predictions)``. Returns ``(0.0, 0.0, 0.0)`` when
        ``predictions`` is empty.
    """
    n = len(predictions)
    if n == 0:
        # Nothing to score; avoid a ZeroDivisionError in the final averaging.
        return 0.0, 0.0, 0.0

    total_precision, total_recall, total_f1 = 0.0, 0.0, 0.0
    max_order = 4
    smoothing = 1e-13  # keeps denominators non-zero for empty texts

    for pred_text, ref_texts, table_texts in zip(predictions, references, tables):
        pred_tokens = pred_text.strip().split()
        ref_tokens_list = [ref.strip().split() for ref in ref_texts]

        table_tokens = []
        for cell_value in table_texts:
            table_tokens.extend(str(cell_value).split())

        pred_ngrams = get_ngrams(pred_tokens, max_order)
        ref_ngrams_list = [get_ngrams(ref, max_order) for ref in ref_tokens_list]
        table_ngrams = get_ngrams(table_tokens, max_order)

        # Precision: each predicted n-gram is credited fully if it occurs in
        # the table; otherwise it is credited by the (clipped) fraction of its
        # occurrences that the best-matching reference also contains.
        numerator_prec = 0.0
        denominator_prec = sum(pred_ngrams.values()) + smoothing
        for ngram, count in pred_ngrams.items():
            prob_in_table = 1.0 if ngram in table_ngrams else 0.0
            prob_in_ref = 0.0
            for ref_ngrams in ref_ngrams_list:
                prob_in_ref = max(prob_in_ref, min(1.0, ref_ngrams.get(ngram, 0) / count))
            w_prob = prob_in_table + prob_in_ref * (1.0 - prob_in_table)
            numerator_prec += count * w_prob
        precision = numerator_prec / denominator_prec

        # Recall: for each reference, look only at its n-grams that also occur
        # in the table and measure how many of them the prediction reproduces
        # (clipped counts). The best value over all references is kept; it
        # stays 0.0 when no reference shares an n-gram with the table.
        best_recall = 0.0
        for ref_ngrams in ref_ngrams_list:
            curr_num = 0.0
            curr_denom = smoothing
            for ngram, count in ref_ngrams.items():
                if ngram in table_ngrams:
                    curr_denom += count
                    if ngram in pred_ngrams:
                        curr_num += min(count, pred_ngrams[ngram])
            if curr_denom > smoothing:
                best_recall = max(best_recall, curr_num / curr_denom)
        recall = best_recall

        # F1: harmonic mean, defined as 0 when both components are 0.
        if precision + recall > 0:
            f1 = 2 * precision * recall / (precision + recall)
        else:
            f1 = 0.0

        total_precision += precision
        total_recall += recall
        total_f1 += f1

    return (total_precision / n), (total_recall / n), (total_f1 / n)

# ==========================================
# 2. Data Loading & Dataset Class
# ==========================================
class EvaluationDataset(Dataset):
    """Map-style dataset that tokenizes each record's ``"input"`` text on access.

    Args:
        data: Indexable collection of dict records; the ``"input"`` key holds
            the source text (a missing key is treated as an empty string).
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding="max_length", return_tensors="pt")`` and returning a
            mapping of tensors.
        max_input_len: Length every example is truncated/padded to.
    """

    def __init__(self, data, tokenizer, max_input_len=512):
        self.data, self.tokenizer, self.max_input_len = data, tokenizer, max_input_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        record = self.data[idx]
        encoded = self.tokenizer(
            record.get("input", ""),
            max_length=self.max_input_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # squeeze(0) drops the leading dimension of each returned tensor
        # (presumably a batch dimension of 1) so the DataLoader can stack
        # examples itself.
        features = {}
        for name, tensor in encoded.items():
            features[name] = tensor.squeeze(0)
        return features

def load_json_data(path):
    """Load records from a JSON or JSON Lines file (UTF-8).

    Args:
        path: File location as ``str`` or ``pathlib.Path``. A name ending in
            ``.jsonl`` is parsed line by line (blank lines are skipped);
            anything else is parsed as a single JSON document.

    Returns:
        For ``.jsonl`` files, a list with one parsed object per non-blank
        line. Otherwise whatever ``json.load`` returns for the document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # str() lets Path objects through; a bare path.endswith() would fail on them.
    is_jsonl = str(path).endswith(".jsonl")
    with open(path, "r", encoding="utf-8") as f:
        if is_jsonl:
            return [json.loads(line) for line in f if line.strip()]
        return json.load(f)

# ==========================================
# 3. Main Inference Logic
# ==========================================
def run_inference(ckpt_path, data_path, original_dev_path, device="cuda"):
    """Generate predictions with a seq2seq checkpoint and print BLEU / PARENT.

    Steps:
      1. Load tokenizer and model from ``ckpt_path`` and run beam search
         (4 beams, ``max_length=128``, batch size 16) over the preprocessed
         inputs in ``data_path``.
      2. Read the original dev file line by line and pair each example with
         its prediction via the example ID.
      3. Compute corpus BLEU with sacreBLEU and the PARENT-style scores with
         ``parent_score``, then print them.

    Args:
        ckpt_path: Directory passed to ``from_pretrained`` for both the
            tokenizer and the model.
        data_path: Preprocessed input file (.json or .jsonl). Every record
            must have an ``"id"`` key; the model input is read from
            ``"input"``.
        original_dev_path: JSON Lines file with the original examples. The
            keys read are ``"example_id"``, ``"sentence_annotations"``
            (each with ``"final_sentence"``) and ``"table"`` (rows of cells
            with a ``"value"``).
        device: Torch device string the model and batches are moved to.

    Returns:
        None. Results are printed.

    Raises:
        KeyError: If a preprocessed record has no ``"id"``.
        ValueError: If no prediction ID matches any reference example ID.
    """
    print(f"Loading model & tokenizer from: {ckpt_path}")

    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path)
    model.to(device)
    model.eval()

    # 1. Generate predictions (stored together with their IDs)
    print("Loading data & Generating predictions...")
    eval_raw_data = load_json_data(data_path)
    eval_dataset = EvaluationDataset(eval_raw_data, tokenizer)
    # shuffle=False is required: predictions are matched to IDs by position.
    dataloader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

    pred_dict = {}  # {id: prediction_text}

    # IDs in dataset order. They are normalized to str because the reference
    # side below is looked up with str(example_id); without this, integer IDs
    # in the preprocessed file would never match.
    all_ids = [str(item['id']) for item in eval_raw_data]
    current_idx = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )
            decoded = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

            # Store the batch results.
            # NOTE(review): when several input records share an ID, the last
            # one's prediction overwrites the earlier ones - confirm this is
            # intended for the preprocessed file.
            for text in decoded:
                sample_id = all_ids[current_idx]
                pred_dict[sample_id] = text.strip()
                current_idx += 1

    # 2. Load references and align them with predictions by ID
    print("Loading references & Aligning data...")

    aligned_preds = []
    aligned_refs = []
    aligned_tables = []

    with open(original_dev_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines, as load_json_data does for .jsonl input.
                continue
            ex = json.loads(line)
            ex_id = str(ex.get("example_id"))  # ID in the original data

            # Only examples that have a prediction take part in the evaluation.
            if ex_id in pred_dict:
                # 1) Prediction
                aligned_preds.append(pred_dict[ex_id])

                # 2) References
                curr_refs = [anno.get("final_sentence", "") for anno in ex.get("sentence_annotations", [])]
                aligned_refs.append(curr_refs)

                # 3) Table cell values (for PARENT); empty values are dropped
                t_vals = []
                for row in ex.get("table", []):
                    for cell in row:
                        if cell.get("value"): t_vals.append(str(cell.get("value")))
                aligned_tables.append(t_vals)

    print(f"Aligned {len(aligned_preds)} samples successfully.")

    if not aligned_preds:
        # Fail with an actionable message instead of an opaque error from the
        # metric code further down.
        raise ValueError(
            "No predictions could be aligned with the references: the 'id' values in "
            f"{data_path} do not match any 'example_id' in {original_dev_path}."
        )

    # 3. Scores
    # sacreBLEU expects one list per reference position (transposed layout);
    # examples with fewer references are padded with "".
    max_refs = max(len(r) for r in aligned_refs)
    transposed_refs = [[] for _ in range(max_refs)]
    for r_list in aligned_refs:
        for i in range(max_refs):
            transposed_refs[i].append(r_list[i] if i < len(r_list) else "")

    bleu_score = sacrebleu.corpus_bleu(aligned_preds, transposed_refs).score

    # PARENT
    p_prec, p_recall, p_f1 = parent_score(aligned_preds, aligned_refs, aligned_tables)

    print("\n" + "="*30)
    print(" Evaluation Results (Fixed)")
    print("="*30)
    print(f" BLEU Score:       {bleu_score:.2f}")
    print(f" PARENT Precision: {p_prec * 100:.2f}")
    print(f" PARENT Recall:    {p_recall * 100:.2f}")
    print(f" PARENT F1:        {p_f1 * 100:.2f}")
    print("="*30)

# ==========================================
# 4. Execution Block
# ==========================================
# Entry point for the evaluation cell: runs generation over the preprocessed
# dev inputs and prints BLEU / PARENT against the original dev file.
if __name__ == "__main__":
    # Path settings (adjust to your environment)
    CHECKPOINT_PATH = "/content/drive/MyDrive/models_T5L/checkpoint-19000"
    PREPROCESSED_DATA_PATH = "/content/drive/MyDrive/totto_dev_LATTICE.json"
    ORIGINAL_DEV_DATA_PATH = "/content/drive/MyDrive/totto_dev_data.jsonl"

    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    run_inference(CHECKPOINT_PATH, PREPROCESSED_DATA_PATH, ORIGINAL_DEV_DATA_PATH, DEVICE)

Loading model & tokenizer from: /content/drive/MyDrive/models_T5L/checkpoint-19000
Loading data & Generating predictions...


Generating: 100%|██████████| 1394/1394 [24:37<00:00,  1.06s/it]


Loading references & Aligning data...
Aligned 7700 samples successfully.

 Evaluation Results (Fixed)
 BLEU Score:       41.07
 PARENT Precision: 47.64
 PARENT Recall:    76.73
 PARENT F1:        54.71


In [5]:
from transformers import AutoTokenizer
import os

# Sanity check: verify that the tokenizer stored with the checkpoint encodes
# the custom marker "[CLEAN]" as a single token ID.

# 1. Checkpoint path (adjusted to the user's environment)
CHECKPOINT_PATH = "/content/drive/MyDrive/models_T5L/checkpoint-19000"

# Check first that the path actually exists
if not os.path.exists(CHECKPOINT_PATH):
    print(f"❌ 오류: 경로를 찾을 수 없습니다 -> {CHECKPOINT_PATH}")
    print("구글 드라이브 마운트가 되어 있는지, 경로 오타가 없는지 확인해주세요.")
else:
    print(f"Loading tokenizer from {CHECKPOINT_PATH}...")

    try:
        # 2. Load the tokenizer
        tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH)

        # 3. Token to verify
        test_token = "[CLEAN]"

        # 4. Encode with automatic special tokens disabled.
        # add_special_tokens=False keeps end-of-sequence tokens such as </s>
        # from being appended, so the length check below reflects only the
        # test token itself.
        encoded = tokenizer.encode(test_token, add_special_tokens=False)

        print("\n" + "="*30)
        print(" [Tokenizer Check Result] ")
        print("="*30)
        print(f"Target Token : {test_token}")
        print(f"Encoded IDs  : {encoded}")
        print(f"Length       : {len(encoded)}")

        # 5. Verdict: exactly one ID means the marker is a single token;
        # anything else means it was split into several pieces.
        if len(encoded) == 1:
            print(f"✅ 성공! '{test_token}'이(가) ID [{encoded[0]}]인 하나의 토큰으로 완벽하게 인식됩니다.")
            print("   -> 학습된 토크나이저가 정상적으로 로드되었습니다.")
        else:
            print(f"❌ 실패. '{test_token}'이(가) {len(encoded)}개의 조각으로 쪼개졌습니다.")
            print("   -> 체크포인트 폴더에 tokenizer.json, special_tokens_map.json 등이 제대로 저장되지 않았거나")
            print("   -> 기본 t5-base 토크나이저가 로드되었을 가능성이 큽니다.")

    # NOTE(review): this catches every exception raised in the try block
    # (loading and encoding alike) and only prints it.
    except Exception as e:
        print(f"❌ 토크나이저 로드 중 에러 발생: {e}")

Loading tokenizer from /content/drive/MyDrive/models_T5L/checkpoint-19000...

 [Tokenizer Check Result] 
Target Token : [CLEAN]
Encoded IDs  : [32100]
Length       : 1
✅ 성공! '[CLEAN]'이(가) ID [32100]인 하나의 토큰으로 완벽하게 인식됩니다.
   -> 학습된 토크나이저가 정상적으로 로드되었습니다.


In [7]:
import json
import torch
import random
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

def inspect_random_samples(ckpt_path, input_path, ref_path, num_samples=5, seed=42, device="cuda"):
    """Print model predictions next to the references for a few random examples.

    Args:
        ckpt_path: Directory passed to ``from_pretrained`` for the tokenizer
            and the model.
        input_path: Preprocessed model-input file (.json or .jsonl), e.g.
            totto_dev_LATTICE.json. The ID is read from ``"example_id"`` or
            ``"id"``, the text from ``"input"`` or ``"source"``.
        ref_path: Original JSON Lines file used to look up references, e.g.
            totto_dev_data.jsonl. References are the ``"final_sentence"``
            values under ``"sentence_annotations"``.
        num_samples: Number of examples to draw. If the input file has no
            more than this many records, all of them are used.
        seed: Integer for a reproducible selection, or ``None`` for a
            selection seeded from system entropy on every call.
        device: Torch device string the model and inputs are moved to.

    Returns:
        None. Everything is printed.
    """
    # A private generator gives the same draws as random.seed(seed) followed
    # by random.sample(...), but without reseeding the process-wide RNG that
    # other cells may rely on.
    rng = random.Random(seed)

    print(f"Loading model from: {ckpt_path}")
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
    model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_path)
    model.to(device)
    model.eval()

    # 1. Load all preprocessed inputs
    print(f"Loading inputs from: {input_path}")
    inputs_data = []
    with open(input_path, "r", encoding="utf-8") as f:
        if str(input_path).endswith(".jsonl"):
            for line in f:
                if line.strip(): inputs_data.append(json.loads(line))
        else:
            inputs_data = json.load(f)

    # Draw num_samples records at random from the whole input set
    total_len = len(inputs_data)
    if total_len > num_samples:
        sampled_items = rng.sample(inputs_data, num_samples)
        print(f"Randomly selected {num_samples} samples from {total_len} total examples.")
    else:
        sampled_items = inputs_data
        print(f"Data size ({total_len}) is smaller than requested samples. Using all data.")

    # 2. Load the original data and index the references by ID
    print(f"Loading references from: {ref_path}")
    ref_map = {}
    with open(ref_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Skip blank lines, matching the input loader above.
                continue
            item = json.loads(line)
            ex_id = str(item.get("example_id") or item.get("id"))

            refs = []
            if "sentence_annotations" in item:
                refs = [ann["final_sentence"] for ann in item["sentence_annotations"]]

            if refs:
                ref_map[ex_id] = refs

    print(f"\n{'='*20} Generating Outputs (Random {num_samples}) {'='*20}\n")

    # 3. Match and generate (only for the sampled records)
    for item in sampled_items:
        # ID and input text as stored in the input file
        ex_id = str(item.get("example_id") or item.get("id"))
        input_text = item.get("input") or item.get("source") or ""

        # References from the original file (empty list if the ID is unknown)
        references = ref_map.get(ex_id, [])

        # Tokenize and generate
        model_inputs = tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).to(device)

        with torch.no_grad():
            outputs = model.generate(
                **model_inputs,
                max_length=128,
                num_beams=4,
                early_stopping=True
            )

        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Report (the prediction is wrapped in ANSI escape codes for blue text)
        print(f"[Sample ID: {ex_id}]")
        print(f"▶ Input:      {input_text[:120]} ... (생략)")
        print(f"▶ Prediction: \033[94m{prediction}\033[0m")
        print(f"▶ References: {references}")
        print("-" * 60)

# ==========================================
# Run configuration
# ==========================================
if __name__ == "__main__":
    CKPT_PATH = "/content/drive/MyDrive/models_T5L/checkpoint-19000"
    INPUT_DATA_PATH = "/content/drive/MyDrive/totto_dev_LATTICE.json"
    REF_DATA_PATH = "/content/drive/MyDrive/totto_dev_data.jsonl"
    # Use the GPU when one is available, otherwise fall back to the CPU.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # seed=None leaves the selection unseeded, so the samples generally differ
    # from run to run.
    # An integer such as seed=42 always yields the same random samples.
    inspect_random_samples(
        CKPT_PATH,
        INPUT_DATA_PATH,
        REF_DATA_PATH,
        num_samples=5,
        seed=None,   # <--- None here means a non-reproducible random selection
        device=DEVICE
    )

Loading model from: /content/drive/MyDrive/models_T5L/checkpoint-19000
Loading inputs from: /content/drive/MyDrive/totto_dev_LATTICE.json
Randomly selected 5 samples from 22293 total examples.
Loading references from: /content/drive/MyDrive/totto_dev_data.jsonl


[Sample ID: 4709928858037909194]
▶ Input:      [CLEAN] [PAGE] F♯ (musical note) [SEC] Designation by octave [CELL] F♯4 [TYPE] F [R_HEAD] None [C_HEAD] Scientific desig ... (생략)
▶ Prediction: [94mThe F4 has a frequency of 369.994 Hz.[0m
▶ References: ['The frequency of F♯₄ is 369.994 Hz.', 'The frequency of F♯₄ is 369.994 Hz.']
------------------------------------------------------------
[Sample ID: -530749260696504495]
▶ Input:      [CLEAN] [PAGE] North Chevy Chase, Maryland [SEC] Demographics [CELL] 2010 [TYPE] F [R_HEAD] None [C_HEAD] Historical pop ... (생략)
▶ Prediction: [94mThe population of North Chevy Chase was 519 at the 2010 census.[0m
▶ References: ['The population was 519 at the 2010 census in North Chevy Chase, 