<a href="https://colab.research.google.com/github/Cheeyoung-Yoon/upstage_test/blob/main/transfomer_model_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pickle as pickle
import os
import pandas as pd
import torch


class RE_Dataset(torch.utils.data.Dataset):
  """ Dataset 구성을 위한 class."""
  def __init__(self, pair_dataset, labels):
    self.pair_dataset = pair_dataset
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

def preprocessing_dataset(dataset):
  """ 처음 불러온 csv 파일을 원하는 형태의 DataFrame으로 변경 시켜줍니다."""
  subject_entity = []
  object_entity = []
  for i,j in zip(dataset['subject_entity'], dataset['object_entity']):
    i = i[1:-1].split(',')[0].split(':')[1]
    j = j[1:-1].split(',')[0].split(':')[1]

    subject_entity.append(i)
    object_entity.append(j)
  out_dataset = pd.DataFrame({'id':dataset['id'], 'sentence':dataset['sentence'],'subject_entity':subject_entity,'object_entity':object_entity,'label':dataset['label'],})
  return out_dataset

def load_data(dataset_dir):
  """ csv 파일을 경로에 맡게 불러 옵니다. """
  pd_dataset = pd.read_csv(dataset_dir)
  dataset = preprocessing_dataset(pd_dataset)

  return dataset


def tokenized_dataset(dataset, tokenizer, use_type_markers=True, use_unk=True, max_len=256):
    """
    dataset: pandas.DataFrame with columns:
      - sentence
      - subject_entity, object_entity  (dict-like str: {'word':..., 'type':...})
    """
    import ast

    def parse_ent(e):
        if isinstance(e, str):
            try:
                e = ast.literal_eval(e)
            except:
                return None, None
        if isinstance(e, dict):
            return e.get("word"), e.get("type")
        return None, None

    enc_inputs, enc_texts = [], []

    for s_ent, o_ent, sent in zip(dataset['subject_entity'], dataset['object_entity'], dataset['sentence']):
        s_word, s_type = parse_ent(s_ent)
        o_word, o_type = parse_ent(o_ent)

        # 단어가 누락된 경우 안전장치
        s_word = s_word if s_word else "<SUBJ>"
        o_word = o_word if o_word else "<OBJ>"

        if use_type_markers:
            if not s_type and use_unk: s_type = "UNK"
            if not o_type and use_unk: o_type = "UNK"

            if s_type and o_type:
                e_span = f"[E1-{s_type}]{s_word}[/E1] [E2-{o_type}]{o_word}[/E2]"
            else:
                # 타입을 전혀 모르면 타입 없는 일반 마커 사용
                e_span = f"[E1]{s_word}[/E1] [E2]{o_word}[/E2]"
        else:
            # 타입 마커 비활성화: 일반 마커만
            e_span = f"[E1]{s_word}[/E1] [E2]{o_word}[/E2]"

        enc_inputs.append(e_span)
        enc_texts.append(sent)

    # 필요 시 특수 토큰 등록 (한 번만 실행)
    # 타입 마커/일반 마커/종료 마커 + UNK
    special_tokens = {"additional_special_tokens": [
        "[E1]","[/E1]","[E2]","[/E2]",
        "[E1-PER]","[E2-PER]","[E1-ORG]","[E2-ORG]",
        "[E1-LOC]","[E2-LOC]","[E1-UNK]","[E2-UNK]"
    ]}
    num_added = tokenizer.add_special_tokens(special_tokens)
    # model.resize_token_embeddings(len(tokenizer))  # 모델 로드 후 1회 실행

    return tokenizer(
        enc_inputs,
        enc_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
        add_special_tokens=True,
    )



In [2]:
import os, pickle, numpy as np, pandas as pd, torch, sklearn
from dataclasses import dataclass
from typing import List, Optional, Dict, Any
from sklearn.metrics import accuracy_score
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    Trainer, TrainingArguments
)
from transformers import EarlyStoppingCallback

# ===== Metrics (model-agnostic, dynamic num_labels) =====
def micro_f1_wo_no_relation(preds, labels, label_list: List[str], no_rel: str = "no_relation"):
    no_rel_idx = label_list.index(no_rel)
    use_labels = list(range(len(label_list)))
    use_labels.remove(no_rel_idx)
    return sklearn.metrics.f1_score(labels, preds, average="micro", labels=use_labels) * 100.0

def auprc_all(probs, labels, num_labels: int):
    labels_oh = np.eye(num_labels)[labels]
    score = np.zeros((num_labels,), dtype=np.float32)
    for c in range(num_labels):
        targets_c = labels_oh[:, c]
        preds_c = probs[:, c]
        p, r, _ = sklearn.metrics.precision_recall_curve(targets_c, preds_c)
        score[c] = sklearn.metrics.auc(r, p)
    return float(np.mean(score) * 100.0)

def make_compute_metrics(label_list: List[str], no_rel: str = "no_relation"):
    num_labels = len(label_list)
    def _compute(eval_pred):
        logits = eval_pred.predictions
        probs  = logits if logits.ndim == 2 else logits[0]
        preds  = probs.argmax(-1)
        labels = eval_pred.label_ids
        return {
            "micro f1 score": micro_f1_wo_no_relation(preds, labels, label_list, no_rel),
            "auprc": auprc_all(probs, labels, num_labels),
            "accuracy": accuracy_score(labels, preds),
        }
    return _compute

# ===== Config =====
DEFAULT_LABEL_LIST = [
    'no_relation', 'org:top_members/employees', 'org:members', 'org:product', 'per:title',
    'org:alternate_names', 'per:employee_of', 'org:place_of_headquarters', 'per:product',
    'org:number_of_employees/members', 'per:children', 'per:place_of_residence',
    'per:alternate_names', 'per:other_family', 'per:colleagues', 'per:origin',
    'per:siblings', 'per:spouse', 'org:founded', 'org:political/religious_affiliation',
    'org:member_of', 'per:parents', 'org:dissolved', 'per:schools_attended',
    'per:date_of_death', 'per:date_of_birth', 'per:place_of_birth', 'per:place_of_death',
    'org:founded_by', 'per:religion'
]

@dataclass
class TrainConfig:
    model_name: str = "klue/bert-base"          # BERT / RoBERTa / ELECTRA 모두 OK
    output_dir: str = "./results"
    num_train_epochs: int = 10
    learning_rate: float = 5e-5
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16
    warmup_steps: int = 500
    weight_decay: float = 0.01
    logging_steps: int = 100
    save_steps: int = 500
    eval_steps: int = 500
    save_total_limit: int = 5
    load_best_model_at_end: bool = True
    seed: int = 42
    max_length: int = 256
    fp16: bool = False                         # True로 주면 A100/3090 등에서 mixed precision
    special_tokens: Optional[List[str]] = None # 예: ["[E1]","[/E1]","[E2]","[/E2]"]


# ===== Main train function =====
def train_re(
    train_csv: str,
    dev_csv: Optional[str] = None,
    label_list: List[str] = DEFAULT_LABEL_LIST,
    cfg: TrainConfig = TrainConfig(),
    label_map_path: str = 'dict_label_to_num.pkl',
    save_best_to: str = "./best_model",
):
    torch.manual_seed(cfg.seed)
    np.random.seed(cfg.seed)

    # ---- 1) Tokenizer / Model (순서 중요) ----
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)

    # [FIX] 마커 특수토큰을 먼저 추가
    added = 0
    if cfg.special_tokens:
        added = tokenizer.add_special_tokens({"additional_special_tokens": cfg.special_tokens})
        if added > 0:
            print(f"[info] added {added} special tokens")

    num_labels = len(label_list)
    model_config = AutoConfig.from_pretrained(cfg.model_name, num_labels=num_labels)
    model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name, config=model_config)

    # [FIX] 특수토큰 추가했으면 반드시 임베딩 리사이즈
    if added > 0:
        model.resize_token_embeddings(len(tokenizer))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # ---- 2) Load data & label mapping ----
    train_df = load_data(train_csv)
    label_map = {label: idx for idx, label in enumerate(label_list)}

    # [FIX] 라벨이 리스트 밖이면 KeyError 방지
    try:
        train_y = [label_map[v] for v in train_df['label'].values]
    except KeyError as e:
        missing = set(train_df['label'].unique()) - set(label_list)
        raise ValueError(f"Found labels not in label_list: {missing}") from e

    # ---- 3) Tokenize (최종 tokenizer로!) ----
    tokenized_train = tokenized_dataset(train_df, tokenizer)
    # [FIX] RoBERTa 호환: token_type_ids 제거(있으면)
    if isinstance(tokenized_train, dict):
        tokenized_train.pop("token_type_ids", None)

    tokenized_train.pop("token_type_ids", None)

    with torch.no_grad():
        emb = model.get_input_embeddings()
        vocab_size = emb.weight.size(0)
        max_id = int(tokenized_train["input_ids"].max().item())
        print(f"[check] vocab_size={vocab_size}, max_input_id={max_id}")

        if max_id >= vocab_size:
            # 디버그: 어떤 토큰들이 범위를 넘는지 확인
            ids = tokenized_train["input_ids"].view(-1)
            bad_ids = ids[ids >= vocab_size].unique().tolist()
            bad_toks = [tokenizer.convert_ids_to_tokens(int(i)) for i in bad_ids]
            print(f"[warn] out-of-vocab ids: {bad_ids}")
            print(f"[warn] out-of-vocab tokens: {bad_toks}")

            # 1) 가장 보수적인 즉시 복구: 임베딩을 입력의 최대 id+1 로 리사이즈
            new_size = max_id + 1
            print(f"[fix] resize embeddings to {new_size}")
            model.resize_token_embeddings(new_size)
            vocab_size = new_size  # 갱신
    RE_train = RE_Dataset(tokenized_train, train_y)

    if dev_csv is not None:
        dev_df = load_data(dev_csv)
        try:
            dev_y = [label_map[v] for v in dev_df['label'].values]
        except KeyError as e:
            missing = set(dev_df['label'].unique()) - set(label_list)
            raise ValueError(f"[dev] labels not in label_list: {missing}") from e

        tokenized_dev = tokenized_dataset(dev_df, tokenizer)
        tokenized_dev.pop("token_type_ids", None)
        # dev에서도 안전검사(선택)
        with torch.no_grad():
            max_id_dev = int(tokenized_dev["input_ids"].max().item())
            if max_id_dev >= vocab_size:
                raise RuntimeError(
                    f"[dev] Input id ({max_id_dev}) >= embedding size ({vocab_size}). "
                    f"Did tokenizer change after tokenizing?"
                )
        RE_dev = RE_Dataset(tokenized_dev, dev_y)
    else:
        RE_dev = None

    # ---- 4) TrainingArguments (HF 4.55 API: eval_strategy 사용) ----
    has_dev = RE_dev is not None
    evaluation_strategy = 'steps' if RE_dev is not None else 'no'

    training_args = TrainingArguments(
        output_dir=cfg.output_dir,
        save_total_limit=cfg.save_total_limit,
        save_steps=cfg.save_steps,
        num_train_epochs=cfg.num_train_epochs,
        learning_rate=cfg.learning_rate,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.per_device_eval_batch_size,
        warmup_steps=cfg.warmup_steps,
        weight_decay=cfg.weight_decay,
        logging_dir=os.path.join(cfg.output_dir, "logs"),
        logging_steps=cfg.logging_steps,
        eval_strategy=evaluation_strategy,            # ← 이름 주의
        eval_steps=cfg.eval_steps if has_dev else None,
        load_best_model_at_end=cfg.load_best_model_at_end if has_dev else False,
        metric_for_best_model="micro f1 score" if has_dev else None,  # ← EarlyStopping용
        greater_is_better=True,
        seed=cfg.seed,
        fp16=cfg.fp16,
        remove_unused_columns=False,
        dataloader_pin_memory=torch.cuda.is_available(),
        report_to="none",
    )

    # === 5) Trainer ===
    compute_metrics = make_compute_metrics(label_list, no_rel="no_relation") if has_dev else None

    callbacks = []
    if has_dev and training_args.metric_for_best_model and training_args.load_best_model_at_end:
        callbacks.append(EarlyStoppingCallback(
            early_stopping_patience=2,
            early_stopping_threshold=0.002,
        ))
    # dev 없으면 EarlyStopping 미사용

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=RE_train,
        eval_dataset=RE_dev,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        callbacks=callbacks,
    )

    # ---- 6) Train ----
    trainer.train()

    # ---- 7) Save best (or final) model ----
    os.makedirs(save_best_to, exist_ok=True)
    trainer.save_model(save_best_to)
    if trainer.tokenizer is not None:
        trainer.tokenizer.save_pretrained(save_best_to)

    print(f"Model saved to: {save_best_to}")
    return trainer



In [3]:
# grid_runner.py
import os, gc, traceback, json, time
import torch
import pandas as pd
from dataclasses import replace
from typing import List, Dict, Any, Optional

# 당신이 제공한 train_re, TrainConfig, DEFAULT_LABEL_LIST 를 import
# from train_module import train_re, TrainConfig, DEFAULT_LABEL_LIST

def run_grid(
    train_csv: str,
    dev_csv: Optional[str],
    models: List[str],
    lrs: List[float] = (5e-5, 3e-5, 2e-5),
    epochs: List[int] = (5, 10),
    train_bsz: List[int] = (16,),
    eval_bsz: List[int] = (16,),
    seed: int = 42,
    base_out: str = "./grid_runs",
    label_list: List[str] = None,
    use_fp16_if_cuda: bool = True,
    special_tokens: Optional[List[str]] = ["[E1]","[/E1]","[E2]","[/E2]",
                                           "[E1-PER]","[E2-PER]","[E1-ORG]","[E2-ORG]",
                                           "[E1-LOC]","[E2-LOC]","[E1-UNK]","[E2-UNK]"],
) -> pd.DataFrame:
    """
    여러 모델/하이퍼파라미터 조합을 순차적으로 학습하고 dev 성능을 수집하여 DataFrame으로 반환.
    실패한 러나는 error 컬럼에 스택트레이스를 남김.
    """
    if label_list is None:
        from __main__ import DEFAULT_LABEL_LIST as _DEF  # 노트북에서 직접 실행 대비
        label_list = _DEF

    os.makedirs(base_out, exist_ok=True)
    results: List[Dict[str, Any]] = []

    combo_idx = 0
    total = len(models) * len(lrs) * len(epochs) * len(train_bsz)
    print(f"[grid] total runs: {total}")

    for model_name in models:
        for lr in lrs:
            for ep in epochs:
                for bsz in train_bsz:
                    combo_idx += 1
                    run_name = f"{model_name.replace('/','_')}_lr{lr:g}_ep{ep}_bs{bsz}"
                    out_dir = os.path.join(base_out, run_name)
                    best_dir = os.path.join(out_dir, "best")

                    print(f"\n[grid {combo_idx}/{total}] {run_name}")

                    # CUDA/메모리 정리
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()
                    gc.collect()

                    cfg = TrainConfig(
                        model_name=model_name,
                        output_dir=out_dir,
                        num_train_epochs=ep,
                        learning_rate=lr,
                        per_device_train_batch_size=bsz,
                        per_device_eval_batch_size=eval_bsz[0],
                        warmup_steps=0,
                        weight_decay=0.01,
                        logging_steps=50,
                        save_steps=200,
                        eval_steps=200 if dev_csv else None,
                        save_total_limit=3,
                        load_best_model_at_end=bool(dev_csv),
                        seed=seed,
                        max_length=256,
                        fp16=torch.cuda.is_available() and use_fp16_if_cuda,
                        special_tokens=special_tokens,
                    )

                    row: Dict[str, Any] = {
                        "model": model_name, "lr": lr, "epochs": ep, "train_bsz": bsz,
                        "output_dir": out_dir, "best_dir": best_dir,
                        "micro_f1": None, "auprc": None, "accuracy": None,
                        "best_ckpt": None, "error": None, "seconds": None,
                    }

                    t0 = time.time()
                    try:
                        trainer = train_re(
                            train_csv=train_csv,
                            dev_csv=dev_csv,
                            label_list=label_list,
                            cfg=cfg,
                            save_best_to=best_dir,
                        )

                        # dev가 있으면 evaluate로 표준화된 측정
                        if dev_csv:
                            metrics = trainer.evaluate()
                            # 키가 "micro f1 score"로 들어오므로 공백 제거한 alias도 만들어 둠
                            row["micro_f1"] = metrics.get("micro f1 score")
                            row["auprc"]    = metrics.get("auprc")
                            row["accuracy"] = metrics.get("accuracy")
                        else:
                            # dev 없을 경우 마지막 train logs에서 꺼내거나 None
                            pass

                        # best checkpoint 경로
                        state = getattr(trainer, "state", None)
                        row["best_ckpt"] = getattr(state, "best_model_checkpoint", None)

                    except Exception as e:
                        row["error"] = f"{type(e).__name__}: {e}\n" + traceback.format_exc(limit=2)
                        print("[error]", row["error"])
                    finally:
                        row["seconds"] = round(time.time() - t0, 2)
                        results.append(row)

                        # GPU 메모리 정리
                        if torch.cuda.is_available():
                            torch.cuda.empty_cache()
                        gc.collect()

    df = pd.DataFrame(results).sort_values(
        by=["micro_f1", "auprc", "accuracy"], ascending=False, na_position="last"
    ).reset_index(drop=True)

    # CSV로도 저장
    csv_path = os.path.join(base_out, "grid_summary.csv")
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"\n[grid] summary saved: {csv_path}")
    return df




In [None]:
MODELS = [
    "klue/bert-base",
    "klue/roberta-base",
    "monologg/koelectra-base-v3-discriminator",
    "bert-base-multilingual-cased",
    "kykim/bert-kor-base",
    "monologg/koelectra-base-v3",
    "BM-K/KoSimCSE-roberta-multitask"
    # 필요시 추가
]

df = run_grid(
    train_csv="/content/drive/MyDrive/Colab Notebooks/upstage/dataset/train.csv",
    dev_csv=None,     # dev가 없으면 None
    models=MODELS,
    lrs=[5e-5, 3e-5, 2e-5],
    epochs=[5, 10],
    train_bsz=[16, 32],
    base_out="./grid_runs_re",
)
df.head()


[grid] total runs: 84

[grid 1/84] klue_bert-base_lr5e-05_ep5_bs16


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[info] added 12 special tokens


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at klue/bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


[check] vocab_size=32012, max_input_id=32011


  trainer = Trainer(


Step,Training Loss
50,2.5475
100,2.2474
150,2.0206
200,2.0133
250,1.7793
