<a href="https://colab.research.google.com/github/Cheeyoung-Yoon/upstage_test/blob/main/logic_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
from dataclasses import dataclass
from typing import List

DEFAULT_LABEL_LIST: List[str] = [
    'no_relation','org:top_members/employees','org:members','org:product','per:title',
    'org:alternate_names','per:employee_of','org:place_of_headquarters','per:product',
    'org:number_of_employees/members','per:children','per:place_of_residence',
    'per:alternate_names','per:other_family','per:colleagues','per:origin',
    'per:siblings','per:spouse','org:founded','org:political/religious_affiliation',
    'org:member_of','per:parents','org:dissolved','per:schools_attended',
    'per:date_of_death','per:date_of_birth','per:place_of_birth','per:place_of_death',
    'org:founded_by','per:religion'
]

@dataclass
class TrainConfig:
    model_name: str = "klue/roberta-base"
    output_dir: str = "./runs"
    num_train_epochs: int = 5
    learning_rate: float = 2e-5
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16
    warmup_ratio: float = 0.05
    weight_decay: float = 0.01
    logging_steps: int = 500
    save_steps: int = 500
    eval_steps: int = 500
    save_total_limit: int = 2
    load_best_model_at_end: bool = True
    seed: int = 42
    max_length: int = 256
    fp16: bool = True
    es_patience: int = 2

    # 표현/토큰
    inline_markers: bool = True
    marker_variant: str = "typed"   # ["typed","plain"]

    # 손실/정규화/스케줄러
    label_smoothing: float = 0.1
    lr_scheduler_type: str = "cosine"
    use_class_weight: bool = False
    use_cb_loss: bool = False
    use_focal: bool = False
    focal_gamma: float = 2.0
    rdrop_alpha: float = 0.0

    # 최적화
    use_llrd: bool = False
    llrd_decay: float = 0.95

    # 구조/트릭
    use_marker_head: bool = True
    use_erpe: bool = False
    erpe_dim: int = 32
    use_fgm: bool = False
    fgm_eps: float = 1e-3

    # 하드 네거티브
    use_hardneg: bool = False
    hardneg_tau: float = 0.55
    hardneg_boost: float = 2.0

In [29]:

# data_plus.py
import pandas as pd
import numpy as np
import torch
import re
import ast
from typing import Optional

class RE_Dataset(torch.utils.data.Dataset):
    """ tokenized dict + labels (+ optional weights) """
    def __init__(self, pair_dataset: dict, labels, weights: Optional[np.ndarray]=None):
        self.pair_dataset = pair_dataset
        self.labels = labels
        self.weights = weights if weights is not None else np.ones(len(labels), dtype=np.float32)

    def __getitem__(self, idx):
        item = {k: v[idx].clone().detach() for k, v in self.pair_dataset.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def preprocessing_dataset(df: pd.DataFrame) -> pd.DataFrame:
    def pick_word(s):
        try:
            d = ast.literal_eval(s) if isinstance(s, str) else s
            return d.get("word"), d.get("type")
        except Exception:
            return None, None
    subj_words, obj_words = [], []
    for s, o in zip(df["subject_entity"], df["object_entity"]):
        sw, _ = pick_word(s); ow, _ = pick_word(o)
        subj_words.append(sw or "<SUBJ>"); obj_words.append(ow or "<OBJ>")
    out = pd.DataFrame({
        "id": df["id"],
        "sentence": df["sentence"],
        "subject_entity": df["subject_entity"],
        "object_entity": df["object_entity"],
        "subj_word": subj_words, "obj_word": obj_words,
        "label": df["label"],
    })
    return out


def load_data(csv_path: str) -> pd.DataFrame:
    # Accept a single path or (path,) tuple/list and normalize to string
    if isinstance(csv_path, (list, tuple)):
        csv_path = csv_path[0]
    return preprocessing_dataset(pd.read_csv(csv_path))


def _inline_mark(sentence: str, s_word: str, o_word: str, s_type: Optional[str], o_type: Optional[str], use_type: bool, use_unk: bool):
    # 첫 등장만 치환 (단어 경계 고려)
    def repl_first(text, pat, repl):
        m = re.search(rf'(?<!\w){re.escape(pat)}(?!\w)', text)
        if not m: return text
        return text[:m.start()] + repl + text[m.start():m.end()].replace(pat,"") + text[m.end():]

    if use_type:
        s_type = s_type or ("UNK" if use_unk else None)
        o_type = o_type or ("UNK" if use_unk else None)
        if s_type and o_type:
            s_tag = f"[E1-{s_type}]{s_word}[/E1]"
            o_tag = f"[E2-{o_type}]{o_word}[/E2]"
        else:
            s_tag = f"[E1]{s_word}[/E1]"; o_tag = f"[E2]{o_word}[/E2]"
    else:
        s_tag = f"[E1]{s_word}[/E1]"; o_tag = f"[E2]{o_word}[/E2]"

    tmp = repl_first(sentence, s_word, s_tag)
    tmp = repl_first(tmp, o_word, o_tag)
    return tmp
def tokenized_dataset(df, tokenizer, *,
                      inline_markers=True, marker_variant="typed", use_unk=True, max_len=256, use_erpe=False):
    enc_inputs = []
    for _, r in df.iterrows():
        s = ast.literal_eval(r["subject_entity"]) if isinstance(r["subject_entity"], str) else r["subject_entity"]
        o = ast.literal_eval(r["object_entity"]) if isinstance(r["object_entity"], str) else r["object_entity"]

        if inline_markers:
            text = _inline_mark(
                r["sentence"],
                s.get("word") if s else r["subj_word"],
                o.get("word") if o else r["obj_word"],
                (s or {}).get("type"), (o or {}).get("type"),
                use_type=(marker_variant == "typed"), use_unk=use_unk
            )
            enc_inputs.append(text)
        else:
            # 두 개의 separate sequence로 구성
            if marker_variant == "typed":
                span = f"[E1-{(s or {}).get('type','UNK')}]{(s or {}).get('word','<SUBJ>')}[/E1] " \
                       f"[E2-{(o or {}).get('type','UNK')}]{(o or {}).get('word','<OBJ>')}[/E2]"
            else:
                span = f"[E1]{(s or {}).get('word','<SUBJ>')}[/E1] [E2]{(o or {}).get('word','<OBJ>')}[/E2]"
            enc_inputs.append((span, r["sentence"]))

    # 🔹 방어 코드: inline_markers 여부에 따라 tokenizer 입력 방식 결정
    if inline_markers:
        if enc_inputs and isinstance(enc_inputs[0], tuple):
            raise ValueError("[tokenized_dataset] inline_markers=True인데 tuple 형식이 감지됨.")
        enc = tokenizer(
            enc_inputs,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
            add_special_tokens=True
        )
    else:
        if not enc_inputs or not isinstance(enc_inputs[0], tuple):
            raise ValueError("[tokenized_dataset] inline_markers=False인데 tuple 형식이 아님.")
        a, b = zip(*enc_inputs)
        enc = tokenizer(
            list(a), list(b),
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_len,
            add_special_tokens=True
        )

    # ERPE 추가 처리
    if use_erpe:
        e1_id = tokenizer.convert_tokens_to_ids("[E1]")
        e2_id = tokenizer.convert_tokens_to_ids("[E2]")

        def relpos(ids, mark_id, clip=128):
            pos = (ids == mark_id).nonzero(as_tuple=True)[0]
            m = int(pos[0]) if len(pos) else 0
            ar = torch.arange(ids.size(0)) - m
            ar.clamp_(-clip, clip).add_(clip)
            return ar

        input_ids = enc["input_ids"]
        enc["e1_relpos"] = torch.stack([relpos(row, e1_id) for row in input_ids])
        enc["e2_relpos"] = torch.stack([relpos(row, e2_id) for row in input_ids])

    enc.pop("token_type_ids", None)
    return enc


In [30]:

# model_builders.py
import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig

class REMarkerHead(nn.Module):
    def __init__(self, base_model, hidden_size, num_labels, use_cls=False, dropout=0.1, e1_id=None, e2_id=None):
        super().__init__()
        self.backbone = base_model
        self.use_cls = use_cls
        self.e1_id = e1_id
        self.e2_id = e2_id
        in_dim = hidden_size * (3 if use_cls else 2)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(in_dim, num_labels)

    @staticmethod
    def _pick_first(mask, H):
        idx = mask.float().argmax(dim=1)
        return H[torch.arange(H.size(0), device=H.device), idx]

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, **kw):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        H = out.last_hidden_state
        e1_mask = (input_ids == self.e1_id); e2_mask = (input_ids == self.e2_id)
        h1 = self._pick_first(e1_mask, H); h2 = self._pick_first(e2_mask, H)
        feats = [h1, h2]
        if self.use_cls: feats.append(H[:,0,:])
        x = self.dropout(torch.cat(feats, dim=-1))
        logits = self.classifier(x)
        return {"logits": logits}

class REMarkerHeadERPE(REMarkerHead):
    def __init__(self, base_model, hidden_size, num_labels, erpe_dim=32, rel_vocab=257, **kw):
        super().__init__(base_model, hidden_size, num_labels, **kw)
        self.e1_pos_emb = nn.Embedding(rel_vocab, erpe_dim)
        self.e2_pos_emb = nn.Embedding(rel_vocab, erpe_dim)
        self.proj = nn.Linear(hidden_size + 2*erpe_dim, hidden_size)

    def forward(self, input_ids=None, attention_mask=None, e1_relpos=None, e2_relpos=None, **kw):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        H = out.last_hidden_state
        P = torch.cat([self.e1_pos_emb(e1_relpos), self.e2_pos_emb(e2_relpos)], dim=-1)
        H = self.proj(torch.cat([H, P], dim=-1))
        e1_mask = (input_ids == self.e1_id); e2_mask = (input_ids == self.e2_id)
        h1 = self._pick_first(e1_mask, H); h2 = self._pick_first(e2_mask, H)
        x = self.dropout(torch.cat([h1, h2], dim=-1))
        logits = self.classifier(x)
        return {"logits": logits}

def build_model(model_name: str, num_labels: int, tokenizer, *, use_marker_head=True, use_erpe=False, erpe_dim=32):
    cfg = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
    base = AutoModel.from_pretrained(model_name, config=cfg)
    # special tokens resize는 바깥에서 이미 처리했다고 가정

    if not use_marker_head:
        from transformers import AutoModelForSequenceClassification
        return AutoModelForSequenceClassification.from_pretrained(model_name, config=cfg)

    e1_id = tokenizer.convert_tokens_to_ids("[E1]")
    e2_id = tokenizer.convert_tokens_to_ids("[E2]")
    if use_erpe:
        return REMarkerHeadERPE(base_model=base, hidden_size=cfg.hidden_size, num_labels=num_labels,
                                erpe_dim=erpe_dim, e1_id=e1_id, e2_id=e2_id)
    else:
        return REMarkerHead(base_model=base, hidden_size=cfg.hidden_size, num_labels=num_labels,
                            use_cls=False, e1_id=e1_id, e2_id=e2_id)




In [31]:

# trainer_plus.py
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, WeightedRandomSampler
from transformers import Trainer

def build_llrd_param_groups(model, base_lr=2e-5, lr_decay=0.95, wd=0.01):
    groups = []

    def collect(names):
        return [p for n,p in model.named_parameters() if any(k in n for k in names) and p.requires_grad]

    # 🔧 임베딩을 word / position / layer-norm으로 분리
    emb_word = collect(["embeddings.word_embeddings"])
    emb_pos  = collect(["embeddings.position_embeddings"])
    emb_ln   = collect(["embeddings.LayerNorm"])

    if emb_word: groups.append({"params": emb_word, "lr": base_lr*(lr_decay**12), "weight_decay": wd})
    if emb_pos:  groups.append({"params": emb_pos,  "lr": base_lr*(lr_decay**12), "weight_decay": 0.0})  # 보통 decay=0
    if emb_ln:   groups.append({"params": emb_ln,   "lr": base_lr*(lr_decay**12), "weight_decay": 0.0})

    for i in range(12):
        groups.append({"params": collect([f"encoder.layer.{i}"]), "lr": base_lr*(lr_decay**(11-i)), "weight_decay": wd})

    # pooler / classifier
    groups.append({"params": collect(["pooler", "classifier"]), "lr": base_lr, "weight_decay": wd})
    return groups

class TrainerPlus(Trainer):
    def __init__(self, *args,
                 class_weights=None,
                 use_focal=False, focal_gamma=2.0,
                 rdrop_alpha=0.0,
                 use_llrd=False, llrd_decay=0.95,
                 optimizer_betas=(0.9, 0.999),
                 wd=0.01,
                 use_fgm=False, fgm_eps=1e-3,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.use_focal = use_focal
        self.focal_gamma = focal_gamma
        self.rdrop_alpha = rdrop_alpha
        self.use_llrd = use_llrd
        self.llrd_decay = llrd_decay
        self.optimizer_betas = optimizer_betas
        self._wd = wd
        self.use_fgm = use_fgm
        self.fgm_eps = fgm_eps
        self._fgm_backup = {}

    # ---- losses ----
    # Fix device handling in TrainerPlus
    def _ce(self, logits, labels):
        weights = self.class_weights
        if weights is not None:
            weights = weights.to(logits.device)  # Ensure same device
        return F.cross_entropy(logits, labels, weight=weights)

    def _focal(self, logits, labels):
        weights = self.class_weights
        if weights is not None:
            weights = weights.to(logits.device)  # Ensure same device
        ce = F.cross_entropy(logits, labels, reduction="none", weight=weights)
        p = logits.softmax(dim=-1)[torch.arange(len(labels), device=logits.device), labels]
        return ((1 - p) ** self.focal_gamma * ce).mean()
    def compute_loss(
        self,
        model,
        inputs,
        return_outputs: bool = False,
        num_items_in_batch=None,   # ✅ 최신 HF가 넘기는 인자 수용
        **kwargs,                  # ✅ 앞으로의 확장 대비
    ):
        labels = inputs["labels"]
        out1 = model(**inputs)
        logits1 = out1["logits"]

        base = self._focal(logits1, labels) if self.use_focal else self._ce(logits1, labels)

        if self.rdrop_alpha > 0 and model.training:
            out2 = model(**inputs)
            logits2 = out2["logits"]
            base2 = self._focal(logits2, labels) if self.use_focal else self._ce(logits2, labels)
            base = 0.5 * (base + base2)
            p1 = logits1.log_softmax(dim=-1); p2 = logits2.log_softmax(dim=-1)
            kl = F.kl_div(p1, p2.exp(), reduction="batchmean") + F.kl_div(p2, p1.exp(), reduction="batchmean")
            loss = base + 0.5 * self.rdrop_alpha * kl
            return (loss, out1) if return_outputs else loss

        return (base, out1) if return_outputs else base

    # ---- FGM by overriding training_step (proper sequence) ----
    def _fgm_attack(self, emb_name="embeddings.word_embeddings"):
        for n, p in self.model.named_parameters():
            if p.requires_grad and emb_name in n and p.grad is not None:
                self._fgm_backup[n] = p.data.clone()
                g = p.grad / (p.grad.norm() + 1e-12)
                p.data.add_(self.fgm_eps * g)

    def _fgm_restore(self):
        for n, p in self.model.named_parameters():
            if n in self._fgm_backup:
                p.data = self._fgm_backup[n]
        self._fgm_backup.clear()


    def training_step(self, model, inputs, num_items_in_batch=None):
        # 1) HF 기본 training_step 먼저 호출 → AMP/Scaler/accumulation 모두 안전 처리
        base_loss_detached = super().training_step(model, inputs, num_items_in_batch)

        # 2) FGM 있으면 두 번째 forward/backward (스케일러 규칙 그대로 맞춰줌)
        if self.use_fgm:
            self._fgm_attack()
            with self.autocast_smart_context_manager():
                adv_loss = self.compute_loss(model, self._prepare_inputs(inputs))
            if self.args.n_gpu > 1:
                adv_loss = adv_loss.mean()
            adv_loss = adv_loss / self.args.gradient_accumulation_steps

            # HF가 설정한 스케일러 플래그를 그대로 사용
            use_scaler = getattr(self, "do_grad_scaling", False) and getattr(self, "scaler", None) is not None
            if use_scaler:
                self.scaler.scale(adv_loss).backward()
            else:
                adv_loss.backward()
            self._fgm_restore()

        return base_loss_detached


    # ---- optimizer with LLRD ----
    def create_optimizer(self):
        if self.optimizer is not None:
            return self.optimizer
        lr = self.args.learning_rate; wd = self._wd; betas = self.optimizer_betas
        no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]
        if self.use_llrd:
            base_groups = build_llrd_param_groups(self.model, base_lr=lr, lr_decay=self.llrd_decay, wd=wd)
            groups = []
            for g in base_groups:
                dec, nde = [], []
                for n, p in self.model.named_parameters():
                    if p not in g["params"] or not p.requires_grad: continue
                    (nde if any(nd in n for nd in no_decay) else dec).append(p)
                if dec: groups.append({"params": dec, "lr": g["lr"], "weight_decay": wd})
                if nde: groups.append({"params": nde, "lr": g["lr"], "weight_decay": 0.0})
            param_groups = groups
        else:
            dec, nde = [], []
            for n,p in self.model.named_parameters():
                if not p.requires_grad: continue
                (nde if any(nd in n for nd in no_decay) else dec).append(p)
            param_groups = [
                {"params": dec, "weight_decay": wd, "lr": lr},
                {"params": nde, "weight_decay": 0.0, "lr": lr},
            ]
        self.optimizer = torch.optim.AdamW(param_groups, lr=lr, betas=betas)
        return self.optimizer

    # ---- weighted sampler (for hard-neg callback) ----
    def get_train_dataloader(self):
        if hasattr(self.train_dataset, "weights") and self.train_dataset.weights is not None:
            sampler = WeightedRandomSampler(self.train_dataset.weights, num_samples=len(self.train_dataset), replacement=True)
            return DataLoader(self.train_dataset, batch_size=self.args.train_batch_size,
                              sampler=sampler, collate_fn=self.data_collator)
        return super().get_train_dataloader()



In [32]:

# hardneg_callback.py
import torch, numpy as np
from torch.utils.data import DataLoader
from transformers import TrainerCallback

class HardNegSampler(TrainerCallback):
    def __init__(self, no_rel_id=0, tau=0.55, boost=2.0):
        self.no_rel_id = no_rel_id
        self.tau = tau
        self.boost = boost

    def on_epoch_end(self, args, state, control, **kw):
        tr = kw["trainer"]
        ds = tr.train_dataset
        if not hasattr(ds, "weights"):
            ds.weights = np.ones(len(ds), dtype=np.float32)

        dl = DataLoader(ds, batch_size=args.per_device_eval_batch_size)
        probs_all, labels_all = [], []
        tr.model.eval()
        with torch.no_grad():
            for batch in dl:
                batch = {k: v.to(tr.model.device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
                logits = tr.model(**batch)["logits"]
                probs_all.append(logits.softmax(-1).cpu())
                labels_all.append(batch["labels"].cpu())
        probs = torch.cat(probs_all).numpy()
        labels = torch.cat(labels_all).numpy()
        p_nr = probs[:, self.no_rel_id]
        hard = (labels == self.no_rel_id) & (p_nr < self.tau)

        w = ds.weights.astype(np.float32)
        w[hard] *= self.boost
        ds.weights = w
        tr.train_dataloader = None  # 재생성 트리거



In [33]:
# train_re.py
import os, numpy as np, torch
from typing import List, Optional
from dataclasses import replace
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, TrainingArguments, EarlyStoppingCallback
import sklearn

# from parts_config import TrainConfig, DEFAULT_LABEL_LIST
# from data_plus import RE_Dataset, load_data, tokenized_dataset
# from model_builders import build_model
# from trainer_plus import TrainerPlus

def train_re(
    train_csv: str,
    dev_csv: Optional[str],
    label_list: List[str] = DEFAULT_LABEL_LIST,
    cfg: TrainConfig = TrainConfig(),
    save_best_to: str = "./best_model",
    callbacks=None,
):
    torch.manual_seed(cfg.seed); np.random.seed(cfg.seed)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)
    special_tokens = ["[E1]","[/E1]","[E2]","[/E2]"]
    if cfg.marker_variant == "typed":
        special_tokens += ["[E1-PER]","[E2-PER]","[E1-ORG]","[E2-ORG]","[E1-LOC]","[E2-LOC]","[E1-UNK]","[E2-UNK]"]
    added = tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})

    model = build_model(
        cfg.model_name,
        num_labels=len(label_list),
        tokenizer=tokenizer,
        use_marker_head=cfg.use_marker_head,
        use_erpe=cfg.use_erpe,
        erpe_dim=cfg.erpe_dim
    )

    if added > 0:
        if hasattr(model, "resize_token_embeddings"):
            model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
        elif hasattr(model, "backbone") and hasattr(model.backbone, "resize_token_embeddings"):
            model.backbone.resize_token_embeddings(len(tokenizer), mean_resizing=False)

    full_df = load_data(train_csv)
    if dev_csv:
        train_df, dev_df = full_df, load_data(dev_csv)
    else:
        train_df, dev_df = train_test_split(full_df, test_size=0.1, random_state=cfg.seed, stratify=full_df["label"])

    label_map = {v: i for i, v in enumerate(label_list)}
    y_tr = [label_map[v] for v in train_df["label"].values]
    y_dv = [label_map[v] for v in dev_df["label"].values]

    tok_tr = tokenized_dataset(train_df, tokenizer,
                               inline_markers=cfg.inline_markers, marker_variant=cfg.marker_variant,
                               max_len=cfg.max_length, use_erpe=cfg.use_erpe)
    tok_dv = tokenized_dataset(dev_df, tokenizer,
                               inline_markers=cfg.inline_markers, marker_variant=cfg.marker_variant,
                               max_len=cfg.max_length, use_erpe=cfg.use_erpe)

    ds_tr = RE_Dataset(tok_tr, y_tr)
    ds_dv = RE_Dataset(tok_dv, y_dv)

    # class weights
    # class weights
    class_weights = None
    if cfg.use_cb_loss:
        beta = 0.999
        counts = np.bincount(y_tr, minlength=len(label_list))
        eff_num = 1.0 - np.power(beta, counts)
        cbw = (1.0 - beta) / np.clip(eff_num, 1e-6, None)
        cbw = cbw / cbw.mean()
        class_weights = torch.tensor(cbw, dtype=torch.float32)  # Remove device specification
    elif cfg.use_class_weight:
        counts = np.bincount(y_tr, minlength=len(label_list))
        inv = 1.0 / np.clip(counts, 1, None)
        w = inv / inv.mean()
        class_weights = torch.tensor(w, dtype=torch.float32)  # Remove device specification

    # Fix: Move this to the correct indentation level (same as the class_weights section above)

    args = TrainingArguments(
        output_dir=cfg.output_dir,
        save_total_limit=cfg.save_total_limit,

        eval_strategy="steps",   # ✅ 철자 정확
        save_strategy="steps",
        eval_steps=cfg.eval_steps,
        save_steps=cfg.eval_steps,

        load_best_model_at_end=True,
        metric_for_best_model="micro_f1",  # ✅ compute_metrics 리턴 키와 동일 (eval_ 붙이지 않음)
        greater_is_better=True,

        label_names=["labels"],

        num_train_epochs=cfg.num_train_epochs,
        learning_rate=cfg.learning_rate,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.per_device_eval_batch_size,
        weight_decay=cfg.weight_decay,

        logging_strategy="steps",
        logging_steps=cfg.logging_steps,

        fp16=cfg.fp16 and torch.cuda.is_available(),
        seed=cfg.seed,
        remove_unused_columns=False,
        dataloader_pin_memory=torch.cuda.is_available(),
        report_to="none",
        label_smoothing_factor=cfg.label_smoothing,
        lr_scheduler_type=cfg.lr_scheduler_type,
        warmup_ratio=cfg.warmup_ratio,
    )

    def micro_f1_wo_no_relation(preds, labels, label_list, no_rel="no_relation"):
        no_rel_idx = label_list.index(no_rel)
        use_labels = [i for i in range(len(label_list)) if i != no_rel_idx]
        return sklearn.metrics.f1_score(labels, preds, average="micro", labels=use_labels) * 100.0

    def auprc_all(probs, labels, num_labels):
        labels_oh = np.eye(num_labels)[labels]
        score = []
        for c in range(num_labels):
            t = labels_oh[:, c]; p = probs[:, c]
            prec, rec, _ = sklearn.metrics.precision_recall_curve(t, p)
            score.append(sklearn.metrics.auc(rec, prec))
        return float(np.mean(score) * 100.0)


    def compute_metrics(eval_pred):
        try:
            # HF가 EvalPrediction을 주는 버전/케이스도 있고 tuple도 있어서 모두 대응
            if hasattr(eval_pred, "predictions"):
                logits, labels = eval_pred.predictions, eval_pred.label_ids
            else:
                logits, labels = eval_pred

            import numpy as np
            if isinstance(logits, tuple): logits = logits[0]
            logits = np.asarray(logits)
            labels = np.asarray(labels)

            preds = logits.argmax(-1)

            # 확률 계산: torch 없이 numpy로 안전하게
            # (오버플로우 방지용 안정화 softmax)
            logits_max = logits.max(axis=1, keepdims=True)
            exps = np.exp(logits - logits_max)
            probs = exps / exps.sum(axis=1, keepdims=True)

            from sklearn.metrics import f1_score, accuracy_score, precision_recall_curve, auc
            no_rel_idx = DEFAULT_LABEL_LIST.index("no_relation")
            use_labels = [i for i in range(len(DEFAULT_LABEL_LIST)) if i != no_rel_idx]

            micro_f1 = f1_score(labels, preds, average="micro", labels=use_labels) * 100.0

            # AUPRC 계산 중 클래스가 비어 있을 수 있으니 try로 보호
            try:
                labels_oh = np.eye(logits.shape[1], dtype=int)[labels]
                auprs = []
                for c in range(logits.shape[1]):
                    t = labels_oh[:, c]; p = probs[:, c]
                    # precision_recall_curve는 반환 (prec, rec, thr)
                    prec, rec, _ = precision_recall_curve(t, p)
                    # auc(x, y)에서 x=rec, y=prec
                    auprs.append(auc(rec, prec))
                auprc = float(np.mean(auprs) * 100.0)
            except Exception:
                auprc = float("nan")

            return {
                "micro_f1": float(micro_f1),
                "auprc": float(auprc),
                "accuracy": float(accuracy_score(labels, preds) * 100.0),
            }
        except Exception as e:
            # 어떤 경우에도 빈 dict가 넘어가면 안 되므로 최소 메트릭 반환
            return {
                "micro_f1": 0.0,
                "auprc": 0.0,
                "accuracy": 0.0,
            }


    # default_cbs = [EarlyStoppingCallback(early_stopping_patience=5)]
    # use_callbacks = default_cbs + (callbacks or [])

    trainer = TrainerPlus(
        model=model, args=args,
        train_dataset=ds_tr, eval_dataset=ds_dv,
        compute_metrics=compute_metrics, processing_class=tokenizer,
        class_weights=class_weights,
        use_focal=cfg.use_focal, focal_gamma=cfg.focal_gamma,
        rdrop_alpha=cfg.rdrop_alpha,
        use_llrd=cfg.use_llrd, llrd_decay=cfg.llrd_decay,
        wd=cfg.weight_decay,
        use_fgm=cfg.use_fgm, fgm_eps=cfg.fgm_eps,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=cfg.es_patience)]
    )

    trainer.train()
    os.makedirs(save_best_to, exist_ok=True)
    trainer.save_model(save_best_to)
    if trainer.tokenizer: trainer.tokenizer.save_pretrained(save_best_to)
    return trainer


In [34]:
# grid_plus.py
import os, time, gc, shutil, inspect
from typing import Optional, List, Dict, Any
from itertools import product
import numpy as np
import pandas as pd
import torch
from transformers import TrainerCallback, EarlyStoppingCallback

# from parts_config import TrainConfig, DEFAULT_LABEL_LIST
# from train_re import train_re
# from hardneg_callback import HardNegSampler

class ConsoleLogger(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print(f"[train] start → out={args.output_dir} | lr={args.learning_rate} | bsz={args.per_device_train_batch_size} | epochs={args.num_train_epochs}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if not state.is_world_process_zero or not logs: return
        keys = ["loss","learning_rate","epoch"]
        msg = " | ".join([f"{k}={logs[k]:.5f}" for k in keys if k in logs])
        for k in ["micro f1 score","auprc","accuracy"]:
            if k in logs: msg += f" | {k}={logs[k]:.3f}"
        print(f"[step {state.global_step}] {msg}")

    def on_train_end(self, args, state, control, **kwargs):
        print(f"[train] end. best={state.best_model_checkpoint}")

def run_grid_plus(
    train_csv: str,
    dev_csv: Optional[str],
    models: List[str],
    hp_space: Dict[str, list],
    base_out: str = "./grid_runs_plus",
    seed_list: List[int] = (42,),
    label_list: List[str] = None,
    extra_callbacks: Optional[List[TrainerCallback]] = None,
) -> pd.DataFrame:
    if label_list is None:
        label_list = DEFAULT_LABEL_LIST
    os.makedirs(base_out, exist_ok=True)

    keys, values = zip(*hp_space.items())
    combos = list(product(*values))

    results = []
    total = len(models) * len(seed_list) * len(combos)
    idx = 0

    for model_name in models:
        for seed in seed_list:
            for vals in combos:
                idx += 1
                opt = dict(zip(keys, vals))

                run_name = (
                    f"{model_name.replace('/','_')}"
                    f"_lr{opt['lr']}_ep{opt['epochs']}_bs{opt['train_bsz']}"
                    f"_ml{opt['max_len']}_mk{opt['marker_variant']}"
                    f"_inline{int(opt['inline_markers'])}"
                    f"_mh{int(opt['use_marker_head'])}_erpe{int(opt['use_erpe'])}_ed{opt['erpe_dim']}"
                    f"_fgm{int(opt['use_fgm'])}"
                    f"_cb{int(opt['use_cb_loss'])}_cw{int(opt['use_class_weight'])}"
                    f"_focal{int(opt['use_focal'])}_rd{opt['rdrop_alpha']}_llrd{int(opt['use_llrd'])}"
                    f"_hn{int(opt['use_hardneg'])}_tau{opt['hardneg_tau']}_boost{opt['hardneg_boost']}"
                    f"_seed{seed}"
                )
                out_dir = os.path.join(base_out, run_name)
                best_dir = os.path.join(out_dir, "best")

                if torch.cuda.is_available(): torch.cuda.empty_cache()
                gc.collect()

                t0 = time.time()
                row = {"run": run_name, "model": model_name, "seed": seed, **opt,
                       "output_dir": out_dir, "best_dir": best_dir,
                       "micro_f1": None, "auprc": None, "accuracy": None,
                       "micro_f1_tta": None, "best_ckpt": None, "seconds": None, "error": None}

                try:
                    cfg = TrainConfig(
                        model_name=model_name,
                        output_dir=out_dir,
                        num_train_epochs=opt["epochs"],
                        learning_rate=opt["lr"],
                        per_device_train_batch_size=opt["train_bsz"],
                        per_device_eval_batch_size=opt["train_bsz"],
                        warmup_ratio=opt["warmup_ratio"],
                        weight_decay=0.01,
                        logging_steps=2000, save_steps=2000, eval_steps=2000,
                        save_total_limit=2, load_best_model_at_end=True,
                        seed=seed, max_length=opt["max_len"], fp16=torch.cuda.is_available(),
                        inline_markers=opt["inline_markers"],
                        marker_variant=opt["marker_variant"],
                        label_smoothing=opt["label_smoothing"],
                        lr_scheduler_type=opt["scheduler"],
                        use_class_weight=opt["use_class_weight"],
                        use_cb_loss=opt["use_cb_loss"],
                        use_focal=opt["use_focal"], focal_gamma=opt["focal_gamma"],
                        rdrop_alpha=opt["rdrop_alpha"],
                        use_llrd=opt["use_llrd"], llrd_decay=opt["llrd_decay"],
                        use_marker_head=opt["use_marker_head"],
                        use_erpe=opt["use_erpe"], erpe_dim=opt["erpe_dim"],
                        use_fgm=opt["use_fgm"], fgm_eps=opt["fgm_eps"],
                        use_hardneg=opt["use_hardneg"],
                        hardneg_tau=opt["hardneg_tau"], hardneg_boost=opt["hardneg_boost"],
                    )

                    cbs: List[TrainerCallback] = [ConsoleLogger(), EarlyStoppingCallback(early_stopping_patience=5)]
                    if cfg.use_hardneg:
                        cbs.append(HardNegSampler(no_rel_id=0, tau=cfg.hardneg_tau, boost=cfg.hardneg_boost))
                    if extra_callbacks: cbs.extend(extra_callbacks)

                    trainer = train_re(
                        train_csv=train_csv, dev_csv=dev_csv,
                        label_list=label_list, cfg=cfg, save_best_to=best_dir,
                        callbacks=cbs
                    )

                    metrics = trainer.evaluate()
                    row["micro_f1"] = metrics.get("micro f1 score")
                    row["auprc"] = metrics.get("auprc")
                    row["accuracy"] = metrics.get("accuracy")
                    state = getattr(trainer, "state", None)
                    row["best_ckpt"] = getattr(state, "best_model_checkpoint", None)

                    # Optional: TTA (MC Dropout)
                    if opt.get("use_tta", False):
                        trainer.model.train()
                        preds = []
                        with torch.no_grad():
                            for _ in range(int(opt.get("tta_n", 4))):
                                out = trainer.predict(trainer.eval_dataset)
                                preds.append(out.predictions)
                        tta_logits = np.mean(preds, axis=0)
                        labels = trainer.predict(trainer.eval_dataset).label_ids
                        from sklearn.metrics import f1_score
                        no_rel = label_list.index("no_relation")
                        y_hat = tta_logits.argmax(-1)
                        row["micro_f1_tta"] = f1_score(labels, y_hat, average="micro", labels=[i for i in range(len(label_list)) if i!=no_rel]) * 100.0

                except Exception as e:
                    row["error"] = f"{type(e).__name__}: {e}"
                finally:
                    row["seconds"] = round(time.time() - t0, 2)
                    results.append(row)
                    # 체크포인트는 남기고, 러닝 로그 폴더만 정리하고 싶으면 아래 줄 주석 처리
                    shutil.rmtree(out_dir, ignore_errors=True)
                    if torch.cuda.is_available(): torch.cuda.empty_cache()
                    gc.collect()

                print(f"[{idx}/{total}] done: {row['run']} | microF1={row['micro_f1']} | err={row['error']}")

                df = pd.DataFrame(results)
                print(df)
                # .sort_values(by=["micro_f1","auprc","accuracy"], ascending=False, na_position="last")
                df.to_csv(os.path.join(base_out, f"{run_name}_grid_summary.csv"), index=False, encoding="utf-8-sig", method = 'a')
    return df


In [None]:

# run.py
# from grid_plus.py import run_grid_plus
# from parts_config import DEFAULT_LABEL_LIST

TRAIN_CSV = "/content/drive/MyDrive/Colab Notebooks/upstage/dataset/train.csv"
DEV_CSV   = None  # 없으면 None

MODELS = [
    "klue/roberta-base",
    # "microsoft/deberta-v3-base",
    # "klue/roberta-large",
]

HP_SPACE = {
    # 기본 HP
    "lr":              [2e-5],
    "epochs":          [10],
    "train_bsz":       [32],
    "max_len":         [256],
    "scheduler":       ["cosine"],
    "warmup_ratio":    [0.05],
    "label_smoothing": [0.1],

    # 표현
    "marker_variant":  ["typed"],
    "inline_markers":  [True],

    # 손실/정규화
    "use_class_weight":[False],
    "use_cb_loss":     [True],
    "use_focal":       [False],
    "focal_gamma":     [2.0],
    "rdrop_alpha":     [0.0, 2.0],

    # 구조/트릭
    "use_marker_head": [True, False],
    "use_erpe":        [False, True],
    "erpe_dim":        [32],
    "use_fgm":         [False, True],
    "fgm_eps":         [1e-3],

    # 최적화
    "use_llrd":        [False, True],
    "llrd_decay":      [0.95],

    # 하드 네거티브
    "use_hardneg":     [False, True],
    "hardneg_tau":     [0.55],
    "hardneg_boost":   [2.0],

    # 평가 옵션
    "use_tta":         [False],   # 필요 시 True 추가
    "tta_n":           [4],
}
# HP_SPACE = {
#     "lr": [2e-5],
#     "epochs": [1],  # Very small for testing
#     "train_bsz": [16],
#     "max_len": [256],
#     "scheduler": ["cosine"],
#     "warmup_ratio": [0.05],
#     "label_smoothing": [0.0],
#     "marker_variant": ["typed"],
#     "inline_markers": [True],
#     "use_class_weight": [False],
#     "use_cb_loss": [False],
#     "use_focal": [False],
#     "focal_gamma": [2.0],
#     "rdrop_alpha": [0.0],
#     "use_marker_head": [True],
#     "use_erpe": [False],
#     "erpe_dim": [32],
#     "use_fgm": [False],
#     "fgm_eps": [1e-3],
#     "use_llrd": [False],
#     "llrd_decay": [0.95],
#     "use_hardneg": [False],
#     "hardneg_tau": [0.55],
#     "hardneg_boost": [2.0],
#     "use_tta": [False],
#     "tta_n": [4],
# }

df = run_grid_plus(
      train_csv=TRAIN_CSV,
      dev_csv=DEV_CSV,
      models=MODELS,
      hp_space=HP_SPACE,
      base_out="/content/drive/MyDrive/Colab Notebooks/upstage/grid_runs_plus",
      seed_list=[42],
      label_list=DEFAULT_LABEL_LIST,
)

df.to_csv("/content/drive/MyDrive/Colab Notebooks/upstage/dataset/result.csv")
  # 상위 10개만 출력
with pd.option_context('display.max_columns', None):
    print(df.head(10))

df['error'].iloc[0]


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Micro F1,Auprc,Accuracy
2000,1.3188,1.03677,73.521187,63.172521,71.789344
4000,0.511,1.091653,75.660932,67.653377,73.544811
6000,0.272,1.290736,76.170392,68.476539,74.376347
8000,0.161,1.453853,76.194558,67.405558,74.591931


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


[1/64] done: klue_roberta-base_lr2e-05_ep10_bs32_ml256_mktyped_inline1_mh1_erpe0_ed32_fgm0_cb1_cw0_focal0_rd0.0_llrd0_hn0_tau0.55_boost2.0_seed42 | microF1=None | err=None


Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Micro F1,Auprc,Accuracy
2000,1.3188,1.036881,73.478167,63.159498,71.727749
