<a href="https://colab.research.google.com/github/Cheeyoung-Yoon/4jua/blob/main/upstage_hyperparam_tune.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
import pickle as pickle
import os
import pandas as pd
import numpy as np
import torch


class RE_Dataset(torch.utils.data.Dataset):
  """ Dataset 구성을 위한 class."""
  def __init__(self, pair_dataset, labels):
    self.pair_dataset = pair_dataset
    self.labels = labels

  def __getitem__(self, idx):
    item = {key: val[idx].clone().detach() for key, val in self.pair_dataset.items()}
    item['labels'] = torch.tensor(self.labels[idx])
    return item

  def __len__(self):
    return len(self.labels)

def preprocessing_dataset(dataset):
  """ 처음 불러온 csv 파일을 원하는 형태의 DataFrame으로 변경 시켜줍니다."""
  subject_entity = []
  object_entity = []
  for i,j in zip(dataset['subject_entity'], dataset['object_entity']):
    i = i[1:-1].split(',')[0].split(':')[1]
    j = j[1:-1].split(',')[0].split(':')[1]

    subject_entity.append(i)
    object_entity.append(j)
  out_dataset = pd.DataFrame({'id':dataset['id'], 'sentence':dataset['sentence'],'subject_entity':subject_entity,'object_entity':object_entity,'label':dataset['label'],})
  return out_dataset

def load_data(dataset_dir):
  """ csv 파일을 경로에 맡게 불러 옵니다. """
  pd_dataset = pd.read_csv(dataset_dir)
  dataset = preprocessing_dataset(pd_dataset)

  return dataset


def tokenized_dataset(dataset, tokenizer, use_type_markers=True, use_unk=True, max_len=256):
    """
    dataset: pandas.DataFrame with columns:
      - sentence
      - subject_entity, object_entity  (dict-like str: {'word':..., 'type':...})
    """
    import ast

    def parse_ent(e):
        if isinstance(e, str):
            try:
                e = ast.literal_eval(e)
            except:
                return None, None
        if isinstance(e, dict):
            return e.get("word"), e.get("type")
        return None, None

    enc_inputs, enc_texts = [], []

    for s_ent, o_ent, sent in zip(dataset['subject_entity'], dataset['object_entity'], dataset['sentence']):
        s_word, s_type = parse_ent(s_ent)
        o_word, o_type = parse_ent(o_ent)

        # 단어가 누락된 경우 안전장치
        s_word = s_word if s_word else "<SUBJ>"
        o_word = o_word if o_word else "<OBJ>"

        if use_type_markers:
            if not s_type and use_unk: s_type = "UNK"
            if not o_type and use_unk: o_type = "UNK"

            if s_type and o_type:
                e_span = f"[E1-{s_type}]{s_word}[/E1] [E2-{o_type}]{o_word}[/E2]"
            else:
                # 타입을 전혀 모르면 타입 없는 일반 마커 사용
                e_span = f"[E1]{s_word}[/E1] [E2]{o_word}[/E2]"
        else:
            # 타입 마커 비활성화: 일반 마커만
            e_span = f"[E1]{s_word}[/E1] [E2]{o_word}[/E2]"

        enc_inputs.append(e_span)
        enc_texts.append(sent)

    # 필요 시 특수 토큰 등록 (한 번만 실행)
    # 타입 마커/일반 마커/종료 마커 + UNK
    special_tokens = {"additional_special_tokens": [
        "[E1]","[/E1]","[E2]","[/E2]",
        "[E1-PER]","[E2-PER]","[E1-ORG]","[E2-ORG]",
        "[E1-LOC]","[E2-LOC]","[E1-UNK]","[E2-UNK]"
    ]}
    num_added = tokenizer.add_special_tokens(special_tokens)
    # model.resize_token_embeddings(len(tokenizer))  # 모델 로드 후 1회 실행

    return tokenizer(
        enc_inputs,
        enc_texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_len,
        add_special_tokens=True,
    )



In [70]:
# trainer_plus.py
import torch
import torch.nn.functional as F
from transformers import Trainer

def build_llrd_param_groups(model, base_lr=2e-5, lr_decay=0.95, wd=0.01):
    """
    RoBERTa 계열 기준: embeddings -> encoder.layer.0..11 -> pooler, classifier
    """
    groups = []

    def add(names, lr, wd):
        params = []
        for n, p in model.named_parameters():
            if any(k in n for k in names) and p.requires_grad:
                params.append(p)
        if params:
            groups.append({"params": params, "lr": lr, "weight_decay": wd})

    # embeddings (가장 낮은 lr)
    add(["embeddings"], base_lr * (lr_decay ** 12), wd)
    # 12개 레이어
    for i in range(12):
        add([f"encoder.layer.{i}"], base_lr * (lr_decay ** (11 - i)), wd)
    # pooler + classifier (가장 높은 lr)
    add(["pooler", "classifier"], base_lr, wd)
    return groups

class TrainerPlus(Trainer):
    """
    - class_weights: Tensor | None
    - use_focal: bool
    - focal_gamma: float
    - rdrop_alpha: float (0이면 off)
    - use_llrd: bool
    - llrd_decay: float
    - optimizer_betas: tuple
    - wd: float
    """
    def __init__(self, *args,
                 class_weights=None,
                 use_focal=False, focal_gamma=2.0,
                 rdrop_alpha=0.0,
                 use_llrd=False, llrd_decay=0.95,
                 optimizer_betas=(0.9, 0.999),
                 wd=0.01,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights
        self.use_focal = use_focal
        self.focal_gamma = focal_gamma
        self.rdrop_alpha = rdrop_alpha
        self.use_llrd = use_llrd
        self.llrd_decay = llrd_decay
        self.optimizer_betas = optimizer_betas
        self._wd = wd

    # --- 손실 ---
    def _ce(self, logits, labels):
        return F.cross_entropy(logits, labels, weight=self.class_weights)

    def _focal(self, logits, labels):
        ce = F.cross_entropy(logits, labels, reduction="none", weight=self.class_weights)
        p = logits.softmax(dim=-1)[torch.arange(len(labels), device=logits.device), labels]
        return ((1 - p) ** self.focal_gamma * ce).mean()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        labels = inputs["labels"]
        outputs1 = model(**inputs)
        logits1 = outputs1.logits
        base = self._focal(logits1, labels) if self.use_focal else self._ce(logits1, labels)

        if self.rdrop_alpha > 0.0 and self.model.training:
            outputs2 = model(**inputs)
            logits2 = outputs2.logits
            base2 = self._focal(logits2, labels) if self.use_focal else self._ce(logits2, labels)
            base = 0.5 * (base + base2)
            p1 = logits1.log_softmax(dim=-1)
            p2 = logits2.log_softmax(dim=-1)
            kl = F.kl_div(p1, p2.exp(), reduction="batchmean") + F.kl_div(p2, p1.exp(), reduction="batchmean")
            loss = base + 0.5 * self.rdrop_alpha * kl
            return (loss, outputs1) if return_outputs else loss

        return (base, outputs1) if return_outputs else base

    # --- 옵티마이저 (LLRD/weight-decay 분리) ---
    def create_optimizer(self):
        if self.optimizer is not None:
            return self.optimizer

        lr = self.args.learning_rate
        wd = self._wd
        betas = self.optimizer_betas

        no_decay = ["bias", "LayerNorm.weight", "LayerNorm.bias"]

        if self.use_llrd:
            # layer-wise lr decay 그룹 + no-decay 분리
            base_groups = build_llrd_param_groups(self.model, base_lr=lr, lr_decay=self.llrd_decay, wd=wd)
            groups = []
            for g in base_groups:
                params_decay = []
                params_nodecay = []
                for p in g["params"]:
                    name = None
                    # param 이름 찾기
                    for n, pp in self.model.named_parameters():
                        if pp is p:
                            name = n; break
                    if name and any(nd in name for nd in no_decay):
                        params_nodecay.append(p)
                    else:
                        params_decay.append(p)
                if params_decay:
                    groups.append({"params": params_decay, "lr": g["lr"], "weight_decay": wd})
                if params_nodecay:
                    groups.append({"params": params_nodecay, "lr": g["lr"], "weight_decay": 0.0})
            param_groups = groups
        else:
            # 일반 no-decay 분리
            decay = []
            nodecay = []
            for n, p in self.model.named_parameters():
                if not p.requires_grad:
                    continue
                (nodecay if any(nd in n for nd in no_decay) else decay).append(p)
            param_groups = [
                {"params": decay, "weight_decay": wd, "lr": lr},
                {"params": nodecay, "weight_decay": 0.0, "lr": lr},
            ]

        self.optimizer = torch.optim.AdamW(param_groups, lr=lr, betas=betas)
        return self.optimizer


In [71]:
# train_re.py
import os
from dataclasses import dataclass
from typing import List, Optional
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, TrainingArguments
from sklearn.model_selection import train_test_split

# from trainer_plus import TrainerPlus
# 아래 4개는 당신이 "그대로 유지"한 모듈에서 import 하세요.
# from your_module import RE_Dataset, load_data, tokenized_dataset, DEFAULT_LABEL_LIST

DEFAULT_LABEL_LIST = [
    'no_relation','org:top_members/employees','org:members','org:product','per:title',
    'org:alternate_names','per:employee_of','org:place_of_headquarters','per:product',
    'org:number_of_employees/members','per:children','per:place_of_residence',
    'per:alternate_names','per:other_family','per:colleagues','per:origin',
    'per:siblings','per:spouse','org:founded','org:political/religious_affiliation',
    'org:member_of','per:parents','org:dissolved','per:schools_attended',
    'per:date_of_death','per:date_of_birth','per:place_of_birth','per:place_of_death',
    'org:founded_by','per:religion'
]

@dataclass
class TrainConfig:
    model_name: str = "klue/roberta-base"
    output_dir: str = "./runs"
    num_train_epochs: int = 5
    learning_rate: float = 2e-5
    per_device_train_batch_size: int = 16
    per_device_eval_batch_size: int = 16
    warmup_ratio: float = 0.05
    weight_decay: float = 0.01
    logging_steps: int = 500
    save_steps: int = 500
    eval_steps: Optional[int] = 500
    save_total_limit: int = 3
    load_best_model_at_end: bool = False
    seed: int = 42
    max_length: int = 256
    fp16: bool = True
    # 입력 마커
    marker_variant: str = "typed"  # "typed" or "plain"
    # 규제/스케줄러
    label_smoothing: float = 0.1
    lr_scheduler_type: str = "cosine"  # "linear" | "cosine"
    # 손실/가중치/rdrop
    use_class_weight: bool = False
    use_focal: bool = False
    focal_gamma: float = 2.0
    rdrop_alpha: float = 0.0
    # LLRD
    use_llrd: bool = False
    llrd_decay: float = 0.95

def train_re(
    train_csv: str,
    dev_csv: Optional[str],
    label_list: List[str] = DEFAULT_LABEL_LIST,
    cfg: TrainConfig = TrainConfig(),
    save_best_to: str = "./best_model",
    RE_Dataset=None,
    load_data=None,
    tokenized_dataset=None,
):
    torch.manual_seed(cfg.seed); np.random.seed(cfg.seed)

    # --- Tokenizer/Model ---
    tokenizer = AutoTokenizer.from_pretrained(cfg.model_name, use_fast=True)
    special_tokens = ["[E1]","[/E1]","[E2]","[/E2]"]
    if cfg.marker_variant == "typed":
        special_tokens += ["[E1-PER]","[E2-PER]","[E1-ORG]","[E2-ORG]","[E1-LOC]","[E2-LOC]","[E1-UNK]","[E2-UNK]"]
    added = tokenizer.add_special_tokens({"additional_special_tokens": special_tokens})
    config = AutoConfig.from_pretrained(cfg.model_name, num_labels=len(label_list))
    model = AutoModelForSequenceClassification.from_pretrained(cfg.model_name, config=config)
    if added > 0:
        model.resize_token_embeddings(len(tokenizer))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # --- Data ---
    full_df = load_data(train_csv)
    if dev_csv:
        train_df = full_df
        dev_df = load_data(dev_csv)
    else:
        train_df, dev_df = train_test_split(
            full_df, test_size=0.1, random_state=cfg.seed, stratify=full_df['label']
        )

    label_map = {v: i for i, v in enumerate(label_list)}
    train_y = [label_map[v] for v in train_df['label'].values]
    dev_y   = [label_map[v] for v in dev_df['label'].values]

    tok_train = tokenized_dataset(train_df, tokenizer, max_len=cfg.max_length)
    tok_train.pop("token_type_ids", None)
    tok_dev   = tokenized_dataset(dev_df, tokenizer, max_len=cfg.max_length)
    tok_dev.pop("token_type_ids", None)

    RE_train = RE_Dataset(tok_train, train_y)
    RE_dev   = RE_Dataset(tok_dev, dev_y)

    # --- Class weights (선택) ---
    class_weights = None
    if cfg.use_class_weight:
        counts = np.bincount(train_y, minlength=len(label_list))
        inv = 1.0 / np.clip(counts, 1, None)
        weights = inv / inv.mean()
        class_weights = torch.tensor(weights, dtype=torch.float32, device=device)

    # --- Args ---
    evaluation_strategy = "steps"
    args = TrainingArguments(
        output_dir=cfg.output_dir,
        save_total_limit=cfg.save_total_limit,
        save_steps=cfg.save_steps,
        num_train_epochs=cfg.num_train_epochs,
        learning_rate=cfg.learning_rate,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        per_device_eval_batch_size=cfg.per_device_eval_batch_size,
        weight_decay=cfg.weight_decay,
        logging_steps=cfg.logging_steps,
        logging_strategy="steps",
        eval_strategy=evaluation_strategy,
        eval_steps=cfg.eval_steps,
        load_best_model_at_end=True,
        metric_for_best_model="micro f1 score",
        greater_is_better=True,
        seed=cfg.seed,
        fp16=cfg.fp16 and torch.cuda.is_available(),
        remove_unused_columns=False,
        dataloader_pin_memory=torch.cuda.is_available(),
        report_to="none",
        label_smoothing_factor=cfg.label_smoothing,
        lr_scheduler_type=cfg.lr_scheduler_type,
        warmup_ratio=cfg.warmup_ratio,
    )

    # --- Metrics ---
    import sklearn
    from sklearn.metrics import accuracy_score
    def micro_f1_wo_no_relation(preds, labels, label_list, no_rel="no_relation"):
        no_rel_idx = label_list.index(no_rel)
        use_labels = [i for i in range(len(label_list)) if i != no_rel_idx]
        return sklearn.metrics.f1_score(labels, preds, average="micro", labels=use_labels) * 100.0
    def auprc_all(probs, labels, num_labels):
        labels_oh = np.eye(num_labels)[labels]
        score = []
        for c in range(num_labels):
            t = labels_oh[:, c]; p = probs[:, c]
            prec, rec, _ = sklearn.metrics.precision_recall_curve(t, p)
            score.append(sklearn.metrics.auc(rec, prec))
        return float(np.mean(score) * 100.0)
    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        if isinstance(logits, tuple):  # HF 버전 호환
            logits = logits[0]
        preds = logits.argmax(-1)

        # 확률(Softmax) → AUPRC 계산용
        probs = np.asarray(torch.softmax(torch.tensor(logits), dim=-1))

        micro_f1 = micro_f1_wo_no_relation(preds, labels, label_list)
        auprc    = auprc_all(probs, labels, num_labels=logits.shape[1])
        acc      = accuracy_score(labels, preds) * 100.0

        return {
            "micro f1 score": micro_f1,
            "auprc": auprc,
            "accuracy": acc,
        }

    # --- Trainer ---
    trainer = TrainerPlus(
        model=model,
        args=args,
        train_dataset=RE_train,
        eval_dataset=RE_dev,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        # plus options
        class_weights=class_weights,
        use_focal=cfg.use_focal,
        focal_gamma=cfg.focal_gamma,
        rdrop_alpha=cfg.rdrop_alpha,
        use_llrd=cfg.use_llrd,
        llrd_decay=cfg.llrd_decay,
        wd=cfg.weight_decay,
    )

    trainer.train()
    os.makedirs(save_best_to, exist_ok=True)
    trainer.save_model(save_best_to)
    if trainer.tokenizer: trainer.tokenizer.save_pretrained(save_best_to)
    return trainer


In [72]:
# grid_plus.py
import os, time, gc, shutil, pandas as pd, torch
from itertools import product
from dataclasses import replace
from typing import Optional, List, Dict, Any
import inspect
import numpy as np

# from train_re import TrainConfig, train_re, DEFAULT_LABEL_LIST
# 아래 4개는 당신이 유지한 구현을 인자로 넘겨서 사용
# from your_module import RE_Dataset, load_data, tokenized_dataset
from transformers import TrainerCallback
from transformers import EarlyStoppingCallback


class ConsoleLogger(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        print(f"[train] start → output_dir={args.output_dir}, "
              f"lr={args.learning_rate}, bsz={args.per_device_train_batch_size}, "
              f"epochs={args.num_train_epochs}, max_len={getattr(args, 'max_length', 'N/A')}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        # HF가 주기적으로 호출하는 로깅 지점. loss/learning_rate/epoch/step 등이 들어옴
        if not state.is_world_process_zero or not logs:
            return
        keys = ["loss","learning_rate","epoch"]
        msg = " | ".join([f"{k}={logs[k]:.5f}" for k in keys if k in logs])
        if "eval_micro f1 score" in logs:
            msg += (f" | microF1={logs['eval_micro f1 score']:.3f}"
                    f" | AUPRC={logs.get('eval_auprc', logs.get('auprc', float('nan'))):.3f}"
                    f" | acc={logs.get('eval_accuracy', logs.get('accuracy', float('nan'))):.3f}")
        print(f"[step {state.global_step}] {msg}")

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if state.is_world_process_zero and metrics:
            print("[eval] "
                  f"microF1={metrics.get('micro f1 score'):.3f} | "
                  f"AUPRC={metrics.get('auprc'):.3f} | "
                  f"acc={metrics.get('accuracy'):.3f}")

    def on_train_end(self, args, state, control, **kwargs):
        print(f"[train] end. best_ckpt={state.best_model_checkpoint}")


def run_grid_plus(
    train_csv: str,
    dev_csv: Optional[str],
    models: List[str],
    hp_space: Dict[str, list],
    base_out: str = "./grid_runs_plus",
    seed_list: List[int] = (42,),
    label_list: List[str] = None,
    RE_Dataset=None, load_data=None, tokenized_dataset=None,
    callbacks: Optional[List[TrainerCallback]] = None,   # ← 추가
) -> pd.DataFrame:
    if label_list is None:
        label_list = DEFAULT_LABEL_LIST
    os.makedirs(base_out, exist_ok=True)

    # 기본 콜백 구성
    default_callbacks: List[TrainerCallback] = [
        ConsoleLogger(),
        EarlyStoppingCallback(early_stopping_patience=5)
    ]
    use_callbacks = default_callbacks if callbacks is None else (default_callbacks + callbacks)

    keys, values = zip(*hp_space.items())
    combos = list(product(*values))
    total = len(models) * len(seed_list) * len(combos)
    results = []
    idx = 0

    for model_name in models:
        for seed in seed_list:
            for vals in combos:
                idx += 1
                opt = dict(zip(keys, vals))
                run_name = (
                    f"{model_name.replace('/','_')}"
                    f"_lr{opt['lr']}_ep{opt['epochs']}_bs{opt['train_bsz']}"
                    f"_ml{opt['max_len']}_sch{opt['scheduler']}"
                    f"_ls{opt['label_smoothing']}_mk{opt['marker_variant']}"
                    f"_cw{int(opt['use_class_weight'])}_fc{int(opt['use_focal'])}"
                    f"_rd{opt['rdrop_alpha']}_llrd{int(opt['use_llrd'])}_seed{seed}"
                )
                out_dir = os.path.join(base_out, run_name)
                best_dir = os.path.join(out_dir, "best")

                if torch.cuda.is_available(): torch.cuda.empty_cache()
                gc.collect()

                t0 = time.time()
                row = {
                    "model": model_name, "seed": seed, **opt,
                    "output_dir": out_dir, "best_dir": best_dir,
                    "micro_f1": None, "auprc": None, "accuracy": None,
                    "best_ckpt": None, "seconds": None, "error": None,
                }
                try:
                    cfg = TrainConfig(
                        model_name=model_name,
                        output_dir=out_dir,
                        num_train_epochs=opt["epochs"],
                        learning_rate=opt["lr"],
                        per_device_train_batch_size=opt["train_bsz"],
                        per_device_eval_batch_size=opt["train_bsz"],
                        warmup_ratio=opt["warmup_ratio"],
                        weight_decay=0.01,
                        logging_steps=500,
                        save_steps=500,
                        eval_steps=500,
                        save_total_limit=3,
                        load_best_model_at_end=True,
                        seed=seed,
                        max_length=opt["max_len"],
                        fp16=torch.cuda.is_available(),
                        marker_variant=opt["marker_variant"],
                        label_smoothing=opt["label_smoothing"],
                        lr_scheduler_type=opt["scheduler"],
                        use_class_weight=opt["use_class_weight"],
                        use_focal=opt["use_focal"],
                        focal_gamma=opt["focal_gamma"],
                        rdrop_alpha=opt["rdrop_alpha"],
                        use_llrd=opt["use_llrd"],
                        llrd_decay=opt["llrd_decay"],
                    )

                    # 1) train_re가 callbacks를 받으면 그대로 전달
                    accepts_callbacks = "callbacks" in inspect.signature(train_re).parameters
                    if accepts_callbacks:
                        trainer = train_re(
                            train_csv=train_csv,
                            dev_csv=dev_csv,
                            label_list=label_list,
                            cfg=cfg,
                            save_best_to=best_dir,
                            RE_Dataset=RE_Dataset,
                            load_data=load_data,
                            tokenized_dataset=tokenized_dataset,
                            callbacks=use_callbacks,   # ← 전달
                        )
                    else:
                        trainer = train_re(
                            train_csv=train_csv,
                            dev_csv=dev_csv,
                            label_list=label_list,
                            cfg=cfg,
                            save_best_to=best_dir,
                            RE_Dataset=RE_Dataset,
                            load_data=load_data,
                            tokenized_dataset=tokenized_dataset
                        )
                        # 2) 반환된 Trainer에 add_callback으로 주입
                        for cb in use_callbacks:
                            trainer.add_callback(cb)

                    metrics = trainer.evaluate()
                    row["micro_f1"] = metrics.get("micro f1 score")
                    row["auprc"]    = metrics.get("auprc")
                    row["accuracy"] = metrics.get("accuracy")
                    state = getattr(trainer, "state", None)
                    row["best_ckpt"] = getattr(state, "best_model_checkpoint", None)

                except Exception as e:
                    row["error"] = f"{type(e).__name__}: {e}"
                finally:
                    row["seconds"] = round(time.time() - t0, 2)
                    results.append(row)
                    if torch.cuda.is_available(): torch.cuda.empty_cache()
                    gc.collect()
                    shutil.rmtree(out_dir, ignore_errors=True)

    df = pd.DataFrame(results).sort_values(
        by=["micro_f1","auprc","accuracy"], ascending=False, na_position="last"
    ).reset_index(drop=True)
    df.to_csv(os.path.join(base_out, "param_grid_summary.csv"), index=False, encoding="utf-8-sig")
    return df

In [73]:
# run.py (노트북/스크립트에서)
# from grid_plus import run_grid_plus
# from train_re import DEFAULT_LABEL_LIST
# # 당신이 유지한 구현
# from your_module import RE_Dataset, load_data, tokenized_dataset

HP_SPACE = {
  "lr":              [1.5e-5, 2e-5, 2.5e-5],
  "epochs":          [10],          # 10에서 과적합 체크
  "train_bsz":       [16, 32],             # 32는 일단 제외
  "max_len":         [192, 256],
  "scheduler":       ["cosine"],
  "warmup_ratio":    [0.05, 0.1],      # 워밍업 살짝 확대 실험
  "label_smoothing": [0.0, 0.1],       # 간단 규제
  "marker_variant":  ["typed"],        # 현재 성능 좋은 쪽 고정
  "use_class_weight":[False, True],    # 불균형 완화 체크
  "use_focal":       [False],          # focal은 v3로 미룸
  "focal_gamma":     [2.0],
  "rdrop_alpha":     [0.0, 1.0],       # 가벼운 R-Drop
  "use_llrd":        [False, True],    # LLRD 효과 확인
  "llrd_decay":      [0.95],
}

MODELS = ["klue/roberta-base"]

df = run_grid_plus(
  train_csv="/content/drive/MyDrive/Colab Notebooks/upstage/dataset/train.csv",
  dev_csv=None,  # dev 없으면 내부에서 0.1 stratify split
  models=MODELS,
  hp_space=HP_SPACE,
  base_out="./grid_runs_plus",
  seed_list=[42],
  label_list=DEFAULT_LABEL_LIST,
  RE_Dataset=RE_Dataset,
  load_data=load_data,
  tokenized_dataset=tokenized_dataset,
)
df.head()


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Step,Training Loss,Validation Loss,Micro f1 score,Auprc,Accuracy
500,2.2482,1.778522,43.689788,24.49309,48.906683
1000,1.6528,1.562364,51.931602,30.952617,53.403141
1500,1.5018,1.441856,54.840925,32.977617,57.068063


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.


[step 1827] epoch=1.00000 | microF1=54.841 | AUPRC=32.978 | acc=57.068


Unnamed: 0,model,seed,lr,epochs,train_bsz,max_len,scheduler,warmup_ratio,label_smoothing,marker_variant,...,use_llrd,llrd_decay,output_dir,best_dir,micro_f1,auprc,accuracy,best_ckpt,seconds,error
0,klue/roberta-base,42,1.5e-05,1,16,192,cosine,0.05,0.01,typed,...,False,0.95,./grid_runs_plus/klue_roberta-base_lr1.5e-05_e...,./grid_runs_plus/klue_roberta-base_lr1.5e-05_e...,,,,,410.73,TypeError: unsupported format string passed to...


In [68]:
df['error'].iloc[0]

"TypeError: TrainerPlus.compute_loss() got an unexpected keyword argument 'num_items_in_batch'"

In [69]:
run_grid_plus(
  train_csv="/content/drive/MyDrive/Colab Notebooks/upstage/dataset/train.csv",
  dev_csv=None,  # dev 없으면 내부에서 0.1 stratify split
  models=MODELS,
  hp_space=HP_SPACE,
  base_out="./grid_runs_plus",
  seed_list=[42],
  label_list=DEFAULT_LABEL_LIST,
  RE_Dataset=RE_Dataset,
  load_data=load_data,
  tokenized_dataset=tokenized_dataset,
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Unnamed: 0,model,seed,lr,epochs,train_bsz,max_len,scheduler,warmup_ratio,label_smoothing,marker_variant,...,use_llrd,llrd_decay,output_dir,best_dir,micro_f1,auprc,accuracy,best_ckpt,seconds,error
0,klue/roberta-base,42,1.5e-05,1,16,192,cosine,0.05,0.01,typed,...,False,0.95,./grid_runs_plus/klue_roberta-base_lr1.5e-05_e...,./grid_runs_plus/klue_roberta-base_lr1.5e-05_e...,,,,,11.97,TypeError: TrainerPlus.compute_loss() got an u...
