<a href="https://colab.research.google.com/github/Best-Island-LCH/AI_NLP_Team08/blob/main/softlabel_kjm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# =========================================
# 0) Install & Imports
# =========================================
!pip install -q torch transformers datasets pandas scikit-learn tqdm

from google.colab import drive
drive.mount('/content/drive/')

import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    set_seed,
)
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Mounted at /content/drive/
Using device: cuda


In [55]:
# =========================================
# 1) Config
# =========================================
MODEL_NAME = "klue/roberta-base"   # 더 올리고 싶으면 "klue/roberta-large"도 가능(Colab GPU 메모리 체크)
MAX_LENGTH = 256                  # 512는 느리고 과적합/노이즈도 늘 수 있음 (필요 시 384/512로)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_EPOCHS = 5
SEED = 42

CRITERIA = [
    "linguistic_acceptability",
    "consistency",
    "interestingness",
    "unbias",
    "harmlessness",
    "no_hallucination",
    "understandability",
    "sensibleness",
    "specificity",
]
NUM_LABELS = len(CRITERIA)

set_seed(SEED)


In [3]:
# =========================================
# 2) Load CSV
# =========================================
train_path = "/content/drive/MyDrive/nlp_ai/data/train/training_all_aggregated.csv"
val_path   = "/content/drive/MyDrive/nlp_ai/data/val/validation_all_aggregated.csv"

train_df = pd.read_csv(train_path, encoding="utf-8-sig")
val_df   = pd.read_csv(val_path, encoding="utf-8-sig")

print(f"Training 데이터: {len(train_df):,}개")
print(f"Validation 데이터: {len(val_df):,}개")
print(train_df.columns.tolist())


Training 데이터: 400,572개
Validation 데이터: 50,047개
['source_file', 'conversation_id', 'topic', 'num_evaluators', 'exchange_id', 'utterance_id', 'utterance_index', 'num_evaluations', 'human_question', 'bot_response', 'bot_response_length', 'linguistic_acceptability_yes_count', 'linguistic_acceptability_no_count', 'linguistic_acceptability_majority', 'linguistic_acceptability_unanimous', 'consistency_yes_count', 'consistency_no_count', 'consistency_majority', 'consistency_unanimous', 'interestingness_yes_count', 'interestingness_no_count', 'interestingness_majority', 'interestingness_unanimous', 'unbias_yes_count', 'unbias_no_count', 'unbias_majority', 'unbias_unanimous', 'harmlessness_yes_count', 'harmlessness_no_count', 'harmlessness_majority', 'harmlessness_unanimous', 'no_hallucination_yes_count', 'no_hallucination_no_count', 'no_hallucination_majority', 'no_hallucination_unanimous', 'understandability_yes_count', 'understandability_no_count', 'understandability_majority', 'understandabi

In [4]:
# =========================================
# 3) Preprocess
#    - train: soft label (yes_count / num_evaluators)
#    - val: hard label (majority)
# =========================================
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df["human_question"] = df["human_question"].fillna("")
    df["bot_response"]   = df["bot_response"].fillna("")

    # 필수 컬럼 체크
    required = ["num_evaluators", "human_question", "bot_response"]
    for c in required:
        if c not in df.columns:
            raise ValueError(f"Missing column: {c}")

    # 타깃 관련 필수 컬럼 체크
    for crit in CRITERIA:
        if f"{crit}_yes_count" not in df.columns:
            raise ValueError(f"Missing column: {crit}_yes_count")
        if f"{crit}_majority" not in df.columns:
            raise ValueError(f"Missing column: {crit}_majority")

    # 결측 제거 (질문/응답은 빈 문자열로 대체했으니 타깃만 체크)
    target_cols = []
    for crit in CRITERIA:
        target_cols += [f"{crit}_yes_count", f"{crit}_majority"]
    target_cols += ["num_evaluators"]
    df = df.dropna(subset=target_cols)

    # num_evaluators 안전 처리
    df["num_evaluators"] = df["num_evaluators"].astype(float)
    df = df[df["num_evaluators"] > 0]

    # soft label 생성
    for crit in CRITERIA:
        yes = df[f"{crit}_yes_count"].astype(float)
        n   = df["num_evaluators"]
        df[f"{crit}_soft"] = (yes / n).clip(0.0, 1.0)

    # hard label 정수화
    for crit in CRITERIA:
        df[f"{crit}_majority"] = df[f"{crit}_majority"].astype(int)

    return df.reset_index(drop=True)

train_df = preprocess(train_df)
val_df = preprocess(val_df)

print(f"전처리 후 Training: {len(train_df):,}")
print(f"전처리 후 Validation: {len(val_df):,}")
train_df.head(2)


전처리 후 Training: 400,572
전처리 후 Validation: 50,047


Unnamed: 0,source_file,conversation_id,topic,num_evaluators,exchange_id,utterance_id,utterance_index,num_evaluations,human_question,bot_response,...,specificity_unanimous,linguistic_acceptability_soft,consistency_soft,interestingness_soft,unbias_soft,harmlessness_soft,no_hallucination_soft,understandability_soft,sensibleness_soft,specificity_soft
0,경제활동_상품상거래_209.json,864,"경제활동, 상품/상거래",5.0,c864.e1,c864.u2,1,3,"저번 주 포스코홀딩스의 실적이 나왔는데, 어떤 내용이었어?",포스코홀딩스의 3분기 실적은 철강시황 악화와 타이푼 힌남노로 인한 침수 피해로 인해...,...,0,0.0,0.6,0.0,0.6,0.6,0.6,0.6,0.6,0.2
1,경제활동_상품상거래_209.json,864,"경제활동, 상품/상거래",5.0,c864.e2,c864.u4,3,3,영업이익도 많이 감소했으려나?,영업이익은 71% 감소하여 9천억원이었습ㄴㅣ다.,...,1,0.0,0.6,0.0,0.6,0.6,0.6,0.6,0.6,0.6


In [5]:
# =========================================
# 4) Tokenizer (Cross-Encoder: text_pair로 넣기)
# =========================================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
print("Tokenizer loaded:", MODEL_NAME)
print("sep_token:", tokenizer.sep_token)


tokenizer_config.json:   0%|          | 0.00/375 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Tokenizer loaded: klue/roberta-base
sep_token: [SEP]


In [6]:
# =========================================
# 5) Dataset
#    - train_dataset: soft labels (float)
#    - val_dataset:   hard labels (0/1)로 평가 지표 유지
# =========================================
class QualityEvalDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer, max_length: int, label_mode: str = "soft"):
        """
        label_mode:
          - "soft": {crit}_soft 사용 (float in [0,1])
          - "hard": {crit}_majority 사용 (int 0/1)
        """
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        assert label_mode in ["soft", "hard"]
        self.label_mode = label_mode

        if label_mode == "soft":
            self.target_cols = [f"{c}_soft" for c in CRITERIA]
        else:
            self.target_cols = [f"{c}_majority" for c in CRITERIA]

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        q = row["human_question"]
        a = row["bot_response"]

        # Cross-Encoder 입력: tokenizer(question, response)
        enc = self.tokenizer(
            q,
            a,
            truncation=True,
            max_length=self.max_length,
            # padding은 collator가 dynamic으로 해줄 거라 여기선 안 함
        )

        labels = row[self.target_cols].values.astype(np.float32)

        return {
            "input_ids": enc["input_ids"],
            "attention_mask": enc["attention_mask"],
            "labels": labels,
        }

train_dataset = QualityEvalDataset(train_df, tokenizer, MAX_LENGTH, label_mode="soft")
val_dataset   = QualityEvalDataset(val_df, tokenizer, MAX_LENGTH, label_mode="hard")

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest")

print("Train size:", len(train_dataset))
print("Val size:", len(val_dataset))


Train size: 400572
Val size: 50047


In [7]:
# =========================================
# 6) Model
#    multi-label → BCEWithLogitsLoss가 내부에서 사용됨 (labels float도 OK)
# =========================================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    problem_type="multi_label_classification",
)
model.to(device)

print("Model loaded:", MODEL_NAME)
print("Params:", sum(p.numel() for p in model.parameters()))


config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: klue/roberta-base
Params: 110625033


In [8]:
# =========================================
# 7) Metrics (hard 기준으로 기존과 동일하게 측정)
# =========================================
def compute_metrics(eval_pred):
    logits, labels = eval_pred  # labels는 val_dataset에서 hard(0/1)

    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > 0.5).astype(int)
    labels = labels.astype(int)

    exact_match = np.all(preds == labels, axis=1).mean()
    micro_f1 = f1_score(labels, preds, average="micro", zero_division=0)
    macro_f1 = f1_score(labels, preds, average="macro", zero_division=0)

    per_label_acc = (preds == labels).mean(axis=0)

    metrics = {
        "exact_match": float(exact_match),
        "micro_f1": float(micro_f1),
        "macro_f1": float(macro_f1),
    }
    for i, c in enumerate(CRITERIA):
        metrics[f"{c}_acc"] = float(per_label_acc[i])
    return metrics


In [52]:
import torch.nn.functional as F
from transformers import Trainer

# 라벨 인덱스 맵
IDX = {c: i for i, c in enumerate(CRITERIA)}

class WeightedBCEWithConstraintsTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels").to(model.device)  # (bs, 9)
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.sigmoid(logits)

        # ----- weighted BCE -----
        y = labels
        loss_pos = -w_pos.to(model.device) * y * F.logsigmoid(logits)
        loss_neg = -w_neg.to(model.device) * (1 - y) * F.logsigmoid(-logits)
        bce = (loss_pos + loss_neg).mean()

        # ----- constraint penalty -----
        p_cons = probs[:, IDX["consistency"]]
        p_ling = probs[:, IDX["linguistic_acceptability"]]
        p_und  = probs[:, IDX["understandability"]]
        p_sens = probs[:, IDX["sensibleness"]]

        penalty = (
            F.relu(p_cons - p_ling) +
            F.relu(p_cons - p_und) +
            F.relu(p_cons - p_sens)
        ).mean()

        LAMBDA = 0.5
        loss = bce + LAMBDA * penalty

        return (loss, outputs) if return_outputs else loss


In [56]:
training_args = TrainingArguments(
    output_dir="./outputs/soft-cross-encoder",
    num_train_epochs=NUM_EPOCHS,

    per_device_train_batch_size=64,   # = BATCH_SIZE*4 (BATCH_SIZE=32일 때)
    per_device_eval_batch_size=128,    # = BATCH_SIZE*8

    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_ratio=0.1,

    logging_dir="./logs",
    logging_steps=100,

    eval_strategy="epoch",        # ✅ 여기!
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,

    report_to="none",
    seed=SEED,

    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
)


In [57]:
# =========================================
# 9) Trainer
# =========================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,   # soft
    eval_dataset=val_dataset,       # hard
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("학습 시작...")
trainer.train()
print("학습 완료!")


학습 시작...


Epoch,Training Loss,Validation Loss,Exact Match,Micro F1,Macro F1,Linguistic Acceptability Acc,Consistency Acc,Interestingness Acc,Unbias Acc,Harmlessness Acc,No Hallucination Acc,Understandability Acc,Sensibleness Acc,Specificity Acc
1,0.5538,0.330486,0.696885,0.969979,0.969661,0.96833,0.952145,0.929266,0.974864,0.986972,0.892101,0.927188,0.943553,0.950846
2,0.5526,0.346997,0.692809,0.968349,0.968142,0.96813,0.948109,0.915719,0.974244,0.987292,0.897277,0.924251,0.94723,0.939377
3,0.5501,0.346624,0.693388,0.967787,0.967542,0.969309,0.946111,0.915899,0.974804,0.986972,0.89316,0.922853,0.947529,0.936799
4,0.5439,0.344905,0.693049,0.967899,0.967746,0.966671,0.948928,0.915899,0.974184,0.987012,0.898356,0.921933,0.94645,0.934981
5,0.5416,0.348491,0.692169,0.967868,0.967668,0.96807,0.947649,0.917398,0.974024,0.986433,0.894799,0.922553,0.94683,0.9363


학습 완료!


In [58]:
# =========================================
# 10) Evaluate
# =========================================
eval_results = trainer.evaluate()

print("=" * 60)
print("평가 결과")
print("=" * 60)
for k, v in eval_results.items():
    if any(x in k for x in ["loss", "f1", "match"]):
        print(f"{k}: {v:.4f}")

print("\n" + "=" * 60)
print("기준별 정확도")
print("=" * 60)
for c in CRITERIA:
    key = f"eval_{c}_acc"
    if key in eval_results:
        print(f"{c}: {eval_results[key]:.4f}")


평가 결과
eval_loss: 0.3305
eval_exact_match: 0.6969
eval_micro_f1: 0.9700
eval_macro_f1: 0.9697

기준별 정확도
linguistic_acceptability: 0.9683
consistency: 0.9521
interestingness: 0.9293
unbias: 0.9749
harmlessness: 0.9870
no_hallucination: 0.8921
understandability: 0.9272
sensibleness: 0.9436
specificity: 0.9508


In [59]:
# =========================================
# 11) Save
# =========================================
save_path = "./outputs/soft-cross-encoder-final3"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print("Saved to:", save_path)


Saved to: ./outputs/soft-cross-encoder-final3


In [79]:
# =========================================
# 12) Inference helper (cross-encoder)
# =========================================
@torch.no_grad()
def predict(question: str, response: str, model, tokenizer, device):
    model.eval()
    enc = tokenizer(
        question,
        response,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )
    enc = {k: v.to(device) for k, v in enc.items()}
    logits = model(**enc).logits
    probs = torch.sigmoid(logits).cpu().numpy()[0]
    preds = (probs > 0.6).astype(int)

    out = {}
    for i, c in enumerate(CRITERIA):
        out[c] = {"prediction": int(preds[i]), "probability": float(probs[i])}
    return out

sample_q = "물은 몇도에서 끓어?"
sample_a = "물은 13도에서 끓어"
res = predict(sample_q, sample_a, model, tokenizer, device)
for c, v in res.items():
    print(("✓" if v["prediction"]==1 else "✗"), c, f"{v['probability']:.2%}")


✓ linguistic_acceptability 69.86%
✓ consistency 75.24%
✓ interestingness 72.81%
✓ unbias 75.07%
✓ harmlessness 75.60%
✓ no_hallucination 70.31%
✓ understandability 74.48%
✓ sensibleness 69.60%
✓ specificity 72.95%


In [42]:
train_df["consistency_majority"].mean()
val_df["consistency_majority"].mean()


np.float64(0.8724199252702459)