In [None]:
# ===============================
# KR-BERT Multi-label Training & Evaluation
# ===============================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer  # 변경: 표준 transformers 사용
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    hamming_loss, classification_report
)
from tqdm import tqdm
from google.colab import drive

# -------------------------------
# 1. Load data
# -------------------------------

drive_path = '/content/drive'
drive.mount(drive_path)

TRAIN_PATH = '/content/drive/MyDrive/quality_ai_data/train/merged_with_domain_vote_train.csv'
TEST_PATH  = '/content/drive/MyDrive/quality_ai_data/train/merged_with_domain_vote_test.csv'

df_train = pd.read_csv(TRAIN_PATH)
df_test = pd.read_csv(TEST_PATH)

label_cols = [
    "linguistic_acceptability",
    "consistency",
    "interestingness",
    "unbias",
    "harmlessness",
    "no_hallucination",
    "understandability",
    "sensibleness",
    "specificity",
]

# Encode the "yes" / "no" annotations as 1 / 0. Series.map turns any value
# missing from the mapping into NaN, which the dropna() below then removes.
for frame in (df_train, df_test):
    for col in label_cols:
        frame[col] = frame[col].map({"yes": 1, "no": 0})

# Keep only the text and label columns, discarding rows with any missing value.
selected_cols = ["text", *label_cols]
df_train = df_train[selected_cols].dropna()
df_test = df_test[selected_cols].dropna()

# Split the training data 80/20 into train / validation parts.
df_train_split, df_val_split = train_test_split(
    df_train, test_size=0.2, random_state=42, shuffle=True
)

print(f"Train samples: {len(df_train_split)}")
print(f"Validation samples: {len(df_val_split)}")
print(f"Test samples: {len(df_test)} (최종 평가용)")

# -------------------------------
# 2. Dataset 정의
# -------------------------------
class KRBERTDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its labels.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result is indexable by "input_ids" and "attention_mask".
        max_len: Value passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized text and float label tensor for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        # Drop the leading dimension of the "pt" tensors so each item is a
        # single sequence rather than a batch of one.
        item = {
            key: encoded[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# -------------------------------
# 3. Tokenizer & DataLoader (changed)
# -------------------------------
# Load the KR-BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("snunlp/KR-BERT-char16424")

# Build one dataset per split from its text column and its label matrix.
train_dataset, val_dataset, test_dataset = (
    KRBERTDataset(
        split_df["text"].tolist(),
        split_df[label_cols].values.tolist(),
        tokenizer,
    )
    for split_df in (df_train_split, df_val_split, df_test)
)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# -------------------------------
# 4. 모델 정의 (변경됨)
# -------------------------------
class KRBERTMultiLabel(nn.Module):
    """BERT encoder followed by dropout and a linear layer, one logit per label.

    Args:
        num_labels: Output size of the linear classification layer.
        dropout: Dropout probability applied to the encoder's pooled output.
        pretrained_name: Identifier passed to ``BertModel.from_pretrained``.
            Defaults to the KR-BERT checkpoint that used to be hard-coded, so
            existing callers are unaffected.
    """

    def __init__(self, num_labels, dropout=0.1,
                 pretrained_name="snunlp/KR-BERT-char16424"):
        super().__init__()
        # Load the pretrained encoder weights
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the classifier's raw output (no sigmoid is applied here)."""
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = self.dropout(outputs.pooler_output)
        return self.classifier(pooled_output)

# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

model = KRBERTMultiLabel(num_labels=len(label_cols))
model = model.to(device)

# -------------------------------
# 5. Optimizer & Loss
# -------------------------------
# The loss takes raw logits; the model's forward applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# -------------------------------
# 6. 평가 함수
# -------------------------------
def evaluate_model(model, dataloader, threshold=0.5):
    """Evaluate a multi-label classifier on every batch of ``dataloader``.

    Uses the module-level ``device``, ``criterion`` and ``label_cols``.

    Args:
        model: Called as ``model(input_ids, attention_mask)`` and expected to
            return logits; it is switched to eval mode and left in that mode.
        dataloader: Iterable of batches indexable by "input_ids",
            "attention_mask" and "labels".
        threshold: A label is predicted positive when its sigmoid probability
            is strictly greater than this value.

    Returns:
        A ``(results, all_probs)`` tuple. ``results`` is a dict with the keys
        "loss", "hamming_loss", "subset_accuracy", "sample_accuracy", "macro",
        "micro" (each with "precision" / "recall" / "f1") and "per_label"
        (per label name: "precision", "recall", "f1", "support"). Metric
        values are plain Python floats. ``all_probs`` is the row-stacked
        array of sigmoid probabilities for all evaluated samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    loss_sum = 0.0
    n_samples = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)

            # The module-level criterion (BCEWithLogitsLoss with its default
            # reduction) returns a per-batch mean. Weight it by the batch size
            # so that a shorter final batch does not skew the average.
            batch_size = labels.size(0)
            loss_sum += criterion(logits, labels).item() * batch_size
            n_samples += batch_size

            probs = torch.sigmoid(logits)
            preds = (probs > threshold).float()

            all_probs.append(probs.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    if n_samples == 0:
        raise ValueError("evaluate_model received an empty dataloader")

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    all_probs = np.vstack(all_probs)

    # Overall metrics
    avg_loss = loss_sum / n_samples
    hamming = hamming_loss(all_labels, all_preds)

    # Subset accuracy: a sample counts only if every label matches.
    subset_acc = accuracy_score(all_labels, all_preds)

    # Per-sample accuracy: the fraction of labels predicted correctly for each
    # sample, averaged over samples. Computed in one vectorized pass instead
    # of one sklearn call per sample.
    sample_acc = (all_labels == all_preds).mean(axis=1).mean()

    # Macro / micro averaged metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='macro', zero_division=0
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='micro', zero_division=0
    )

    # Per-label metrics, treating each label column as a binary problem.
    per_label_metrics = {}
    for idx, label_name in enumerate(label_cols):
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels[:, idx], all_preds[:, idx],
            average='binary', zero_division=0
        )
        per_label_metrics[label_name] = {
            'precision': float(precision),
            'recall': float(recall),
            'f1': float(f1),
            # Number of positive ground-truth entries for this label
            'support': int(all_labels[:, idx].sum())
        }

    # Cast to built-in floats so the results dict holds no NumPy scalar
    # objects when it is later serialized.
    results = {
        'loss': float(avg_loss),
        'hamming_loss': float(hamming),
        'subset_accuracy': float(subset_acc),
        'sample_accuracy': float(sample_acc),
        'macro': {
            'precision': float(precision_macro),
            'recall': float(recall_macro),
            'f1': float(f1_macro)
        },
        'micro': {
            'precision': float(precision_micro),
            'recall': float(recall_micro),
            'f1': float(f1_micro)
        },
        'per_label': per_label_metrics
    }

    return results, all_probs

def print_evaluation_results(results, phase="Test"):
    """Print an evaluation report to stdout.

    Args:
        results: Dict with "loss", "hamming_loss", "subset_accuracy",
            "sample_accuracy", "macro" and "micro" (each holding
            "precision" / "recall" / "f1") and "per_label" (label name ->
            dict with "precision", "recall", "f1", "support").
        phase: Prefix shown in the report title.
    """
    banner = "=" * 60

    print(f"\n{banner}")
    print(f"{phase} Evaluation Results")
    print(banner)
    print(f"Loss: {results['loss']:.4f}")
    print(f"Hamming Loss: {results['hamming_loss']:.4f}")
    print(f"Subset Accuracy (Exact Match): {results['subset_accuracy']:.4f}")
    print(f"Sample Accuracy (Average): {results['sample_accuracy']:.4f}")

    # The macro and micro sections share one layout.
    for title, key in (("Macro", "macro"), ("Micro", "micro")):
        averaged = results[key]
        print(f"\n--- {title} Metrics ---")
        print(f"Precision: {averaged['precision']:.4f}")
        print(f"Recall: {averaged['recall']:.4f}")
        print(f"F1-Score: {averaged['f1']:.4f}")

    print("\n--- Per-Label Metrics ---")
    header_cells = (
        "Label".ljust(30),
        "Precision".ljust(12),
        "Recall".ljust(12),
        "F1".ljust(12),
        "Support".ljust(10),
    )
    print(" ".join(header_cells))
    print("-" * 76)
    for label_name, scores in results['per_label'].items():
        row = (
            f"{label_name:<30} {scores['precision']:<12.4f} "
            f"{scores['recall']:<12.4f} {scores['f1']:<12.4f} "
            f"{scores['support']:<10}"
        )
        print(row)
    print(f"{banner}\n")

# -------------------------------
# 7. Training loop
# -------------------------------
EPOCHS = 5
best_f1 = 0
best_model_state = None

for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    avg_train_loss = total_loss / len(train_loader)
    print(f"\n[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f}")

    # Validation (evaluate_model puts the model in eval mode; model.train()
    # at the top of the next epoch switches it back)
    val_results, _ = evaluate_model(model, val_loader)
    print_evaluation_results(val_results, phase=f"Epoch {epoch+1} Validation")

    # Keep the weights of the epoch with the best validation macro-F1.
    # The first epoch is always recorded so that best_model_state is never
    # left as None, even if the validation F1 stays at 0.
    current_f1 = val_results['macro']['f1']
    if best_model_state is None or current_f1 > best_f1:
        best_f1 = current_f1
        # state_dict() hands back references to the live parameter tensors,
        # and dict.copy() is shallow, so later optimizer steps would silently
        # overwrite the "best" weights. Take an independent copy of every
        # tensor instead (kept on the CPU).
        best_model_state = {
            name: tensor.detach().to("cpu", copy=True)
            for name, tensor in model.state_dict().items()
        }
        print(f"✓ New best model saved! (Validation F1: {best_f1:.4f})")

# -------------------------------
# 8. Final evaluation (test data)
# -------------------------------
banner = "=" * 60
print("\n" + banner)
print("Loading best model for FINAL evaluation on TEST set...")
print(banner)

# Restore the weights recorded during training, then score the test set.
model.load_state_dict(best_model_state)
final_results, final_probs = evaluate_model(model, test_loader)
print_evaluation_results(final_results, phase="Final Test")

# -------------------------------
# 9. Save model
# -------------------------------
SAVE_PATH = "/content/drive/MyDrive/krbert_multilabel_best.pt"

# Bundle the weights with the label order and the recorded metrics.
checkpoint = {
    'model_state_dict': best_model_state,
    'label_cols': label_cols,
    'best_val_f1': best_f1,
    'test_results': final_results,
}
torch.save(checkpoint, SAVE_PATH)

print(f"\n✓ Best model saved to: {SAVE_PATH}")
print(f"  Best Validation F1-Score: {best_f1:.4f}")
print(f"  Final Test F1-Score: {final_results['macro']['f1']:.4f}")



Mounted at /content/drive
Train samples: 320457
Validation samples: 80115
Test samples: 50047 (최종 평가용)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Using device: cpu


pytorch_model.bin:   0%|          | 0.00/397M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/397M [00:00<?, ?B/s]


Epoch 1/5 [Train]:   0%|          | 0/20029 [00:00<?, ?it/s][A