In [None]:
# ===============================
# KoBERT Multi-label Training & Evaluation
# ===============================
import copy

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from kobert_transformers import get_tokenizer
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    hamming_loss, classification_report
)
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertModel

# -------------------------------
# 1. 데이터 로드
# -------------------------------
# Load the pre-split train / validation / test CSV files.
df_train_split = pd.read_csv("/content/drive/MyDrive/data/AI응답 결과에 대한 품질 평가 데이터/train_df.csv")
df_val_split = pd.read_csv("/content/drive/MyDrive/data/AI응답 결과에 대한 품질 평가 데이터/val_df.csv")
df_test = pd.read_csv("/content/drive/MyDrive/data/AI응답 결과에 대한 품질 평가 데이터/test_df.csv")

# One binary target per quality criterion; the column order here fixes the
# order of the label vector used everywhere else in the script.
label_cols = [
    "linguistic_acceptability", "consistency", "interestingness",
    "unbias", "harmlessness", "no_hallucination",
    "understandability", "sensibleness", "specificity"
]

# Convert "yes" / "no" to 1 / 0. Series.map turns every value that is not a
# key of the mapping into NaN, so unexpected annotations become missing here.
for col in label_cols:
    df_train_split[col] = df_train_split[col].map({"yes": 1, "no": 0})
    df_val_split[col] = df_val_split[col].map({"yes": 1, "no": 0})
    df_test[col] = df_test[col].map({"yes": 1, "no": 0})

# Keep only the text and label columns, then drop every row that has a
# missing value in any of them (including labels mapped to NaN above).
df_train_split = df_train_split[["text"] + label_cols].dropna()
df_val_split = df_val_split[["text"] + label_cols].dropna()
df_test = df_test[["text"] + label_cols].dropna()

# The original referenced the undefined name `df_train_split_split` here,
# which raised NameError before training could start.
print(f"Train samples: {len(df_train_split)}")
print(f"Validation samples: {len(df_val_split)}")
print(f"Test samples: {len(df_test)} (최종 평가용)")

# -------------------------------
# 2. Dataset 정의
# -------------------------------
class KoBERTDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of per-sample label vectors, aligned
            with ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` whose
            result exposes "input_ids" and "attention_mask" tensors.
        max_len: Length every sequence is padded / truncated to.
    """

    # Fields of the tokenizer output that are forwarded to the model.
    _ENCODED_FIELDS = ("input_ids", "attention_mask")

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        tokenizer_options = dict(
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoded = self.tokenizer(self.texts[idx], **tokenizer_options)

        # squeeze(0) removes the leading batch axis of size 1 that
        # return_tensors="pt" produces for a single text.
        sample = {
            field: encoded[field].squeeze(0) for field in self._ENCODED_FIELDS
        }
        # Float targets, as required by the BCE-with-logits loss.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

# -------------------------------
# 3. Tokenizer & DataLoader
# -------------------------------
# NOTE(review): the tokenizer comes from kobert_transformers while the encoder
# weights are loaded from "skt/kobert-base-v1" — confirm both share one vocabulary.
tokenizer = get_tokenizer()


def _frame_to_dataset(frame):
    """Wrap one split's text column and label columns in a KoBERTDataset."""
    texts = frame["text"].tolist()
    targets = frame[label_cols].values.tolist()
    return KoBERTDataset(texts, targets, tokenizer)


train_dataset = _frame_to_dataset(df_train_split)
val_dataset = _frame_to_dataset(df_val_split)
test_dataset = _frame_to_dataset(df_test)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)

# -------------------------------
# 4. 모델 정의
# -------------------------------
class KoBERTMultiLabel(nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        num_labels: Number of logits the classification head emits.
        dropout: Dropout probability applied to the pooled encoder output.
        pretrained_model_name: Checkpoint identifier handed to
            ``BertModel.from_pretrained``. Defaults to the checkpoint this
            script was written for, so existing callers are unaffected.
    """

    def __init__(self, num_labels, dropout=0.1,
                 pretrained_model_name="skt/kobert-base-v1"):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout)
        # Head width follows the encoder config instead of a hard-coded size.
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return one raw logit per label for each input sequence.

        No sigmoid is applied here; callers convert logits to probabilities
        themselves (the loss in this file consumes logits directly).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = self.dropout(outputs.pooler_output)
        return self.classifier(pooled_output)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# One output logit per entry of label_cols.
model = KoBERTMultiLabel(num_labels=len(label_cols))
model = model.to(device)

# -------------------------------
# 5. Optimizer & Loss
# -------------------------------
# BCEWithLogitsLoss takes raw logits, matching the model's un-activated output.
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# -------------------------------
# 6. 평가 함수
# -------------------------------
def evaluate_model(model, dataloader, threshold=0.5):
    """Evaluate a multi-label classifier over every batch of ``dataloader``.

    Reads the module-level ``device``, ``criterion`` and ``label_cols``, and
    leaves ``model`` in eval mode.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one logit per label.
        dataloader: Sized iterable of batches, each a mapping with
            "input_ids", "attention_mask" and "labels" tensors.
        threshold: A label is predicted positive when its sigmoid
            probability is strictly greater than this value.

    Returns:
        A ``(results, all_probs)`` tuple. ``results`` holds "loss",
        "hamming_loss", "subset_accuracy", "sample_accuracy", the "macro" and
        "micro" precision / recall / F1 dicts, and "per_label" metrics keyed
        by label name. ``all_probs`` is the stacked array of sigmoid
        probabilities, one row per sample.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0.0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            total_loss += criterion(logits, labels).item()

            probs = torch.sigmoid(logits)
            preds = (probs > threshold).float()

            all_probs.append(probs.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    if not all_labels:
        # np.vstack([]) would raise an opaque ValueError; fail with a message
        # that names the actual problem instead.
        raise ValueError("evaluate_model received a dataloader with no batches")

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    all_probs = np.vstack(all_probs)

    # Mean of the per-batch mean losses.
    avg_loss = total_loss / len(dataloader)
    hamming = hamming_loss(all_labels, all_preds)

    # Subset accuracy: a sample counts only if every label matches.
    subset_acc = accuracy_score(all_labels, all_preds)

    # Per-sample accuracy: fraction of labels predicted correctly for each
    # sample, averaged over samples. Computed with one vectorized comparison
    # rather than one accuracy_score call per row.
    sample_acc = (all_labels == all_preds).mean(axis=1).mean()

    # Macro / micro averaged metrics over all labels.
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='macro', zero_division=0
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='micro', zero_division=0
    )

    # Per-label metrics, treating each label column as a binary problem.
    per_label_metrics = {}
    for idx, label_name in enumerate(label_cols):
        precision, recall, f1, _ = precision_recall_fscore_support(
            all_labels[:, idx], all_preds[:, idx],
            average='binary', zero_division=0
        )
        per_label_metrics[label_name] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            # Support is the number of positive ground-truth samples.
            'support': int(all_labels[:, idx].sum())
        }

    results = {
        'loss': avg_loss,
        'hamming_loss': hamming,
        'subset_accuracy': subset_acc,
        'sample_accuracy': sample_acc,
        'macro': {
            'precision': precision_macro,
            'recall': recall_macro,
            'f1': f1_macro
        },
        'micro': {
            'precision': precision_micro,
            'recall': recall_micro,
            'f1': f1_micro
        },
        'per_label': per_label_metrics
    }

    return results, all_probs

def print_evaluation_results(results, phase="Test"):
    """Print a formatted report of an evaluation ``results`` dictionary.

    Args:
        results: Mapping with "loss", "hamming_loss", "subset_accuracy",
            "sample_accuracy", "macro", "micro" and "per_label" entries.
        phase: Text placed in front of "Evaluation Results" in the title.
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"{phase} Evaluation Results")
    print(rule)

    # Scalar headline metrics, printed in a fixed order.
    headline = (
        ("Loss", "loss"),
        ("Hamming Loss", "hamming_loss"),
        ("Subset Accuracy (Exact Match)", "subset_accuracy"),
        ("Sample Accuracy (Average)", "sample_accuracy"),
    )
    for caption, key in headline:
        print(f"{caption}: {results[key]:.4f}")

    # The macro and micro sections share one layout.
    for title, key in (("Macro", "macro"), ("Micro", "micro")):
        averaged = results[key]
        print(f"\n--- {title} Metrics ---")
        print(f"Precision: {averaged['precision']:.4f}")
        print(f"Recall: {averaged['recall']:.4f}")
        print(f"F1-Score: {averaged['f1']:.4f}")

    print(f"\n--- Per-Label Metrics ---")
    print(f"{'Label':<30} {'Precision':<12} {'Recall':<12} {'F1':<12} {'Support':<10}")
    print("-" * 76)
    for label_name, metrics in results['per_label'].items():
        # Left-aligned, fixed-width cells separated by a single space.
        cells = [
            f"{label_name:<30}",
            f"{metrics['precision']:<12.4f}",
            f"{metrics['recall']:<12.4f}",
            f"{metrics['f1']:<12.4f}",
            f"{metrics['support']:<10}",
        ]
        print(" ".join(cells))
    print(f"{rule}\n")

# -------------------------------
# 7. 학습 루프
# -------------------------------
EPOCHS = 5
# F1 is never negative, so starting at -inf guarantees that the first epoch
# records a checkpoint even when its validation F1 is exactly 0.
best_f1 = float("-inf")
best_model_state = None

for epoch in range(EPOCHS):
    # ---- Training pass ----
    model.train()
    total_loss = 0.0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        # Read the scalar once and reuse it for the total and the progress bar.
        batch_loss = loss.item()
        total_loss += batch_loss
        progress_bar.set_postfix({'loss': batch_loss})

    avg_train_loss = total_loss / len(train_loader)
    print(f"\n[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f}")

    # ---- Validation pass (evaluate_model switches the model to eval mode) ----
    val_results, _ = evaluate_model(model, val_loader)
    print_evaluation_results(val_results, phase=f"Epoch {epoch+1} Validation")

    # Keep the weights of the epoch with the highest validation macro F1.
    current_f1 = val_results['macro']['f1']
    if current_f1 > best_f1:
        best_f1 = current_f1
        # state_dict() hands back references to the live parameter tensors and
        # dict.copy() is shallow, so a plain .copy() would keep changing as
        # training continues. deepcopy freezes an independent snapshot.
        best_model_state = copy.deepcopy(model.state_dict())
        print(f"✓ New best model saved! (Validation F1: {best_f1:.4f})")

# -------------------------------
# 8. 최종 평가 (Test 데이터)
# -------------------------------
print("\n" + "="*60)
print("Loading best model for FINAL evaluation on TEST set...")
print("="*60)

if best_model_state is None:
    # No epoch improved on the initial best_f1, so nothing was captured.
    # load_state_dict(None) would raise; evaluate the current weights and
    # record them so the later save step still has a state dict to write.
    print("No best checkpoint was recorded; evaluating the current model weights.")
    best_model_state = model.state_dict()
else:
    model.load_state_dict(best_model_state)

# The test split is used only here, after model selection on validation data.
final_results, final_probs = evaluate_model(model, test_loader)
print_evaluation_results(final_results, phase="Final Test")

# -------------------------------
# 9. 모델 저장
# -------------------------------
# Absolute path, matching the "/content/drive/..." location the CSVs are read
# from. The original omitted the leading "/", which made the path relative to
# the current working directory.
SAVE_PATH = "/content/drive/MyDrive/data/AI응답 결과에 대한 품질 평가 데이터/kobert_multilabel_best.pt"

# Bundle the weights with the label order and the metrics that describe them.
torch.save({
    'model_state_dict': best_model_state,
    'label_cols': label_cols,
    'best_val_f1': best_f1,
    'test_results': final_results
}, SAVE_PATH)
print(f"\n✓ Best model saved to: {SAVE_PATH}")
print(f"  Best Validation F1-Score: {best_f1:.4f}")
print(f"  Final Test F1-Score: {final_results['macro']['f1']:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


Train samples: 320457
Validation samples: 80115
Test samples: 50047 (최종 평가용)
Using device: cuda


Epoch 1/5 [Train]: 100%|██████████| 20029/20029 [23:37<00:00, 14.13it/s, loss=0.234] 



[Epoch 1] Train Loss: 0.1675


                                                               


Epoch 1 Validation Evaluation Results
Loss: 0.1530
Hamming Loss: 0.0521
Subset Accuracy (Exact Match): 0.6983
Sample Accuracy (Average): 0.9479

--- Macro Metrics ---
Precision: 0.9535
Recall: 0.9887
F1-Score: 0.9706

--- Micro Metrics ---
Precision: 0.9532
Recall: 0.9889
F1-Score: 0.9707

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9681       0.9946       0.9812       67772     
consistency                    0.9532       0.9908       0.9717       70048     
interestingness                0.9408       0.9991       0.9691       72682     
unbias                         0.9911       0.9794       0.9852       71084     
harmlessness                   0.9923       0.9922       0.9923       69380     
no_hallucination               0.9113       0.9695       0.9395       63862     
understandability              0.9373 

Epoch 2/5 [Train]: 100%|██████████| 20029/20029 [23:12<00:00, 14.39it/s, loss=0.177] 



[Epoch 2] Train Loss: 0.1444


                                                               


Epoch 2 Validation Evaluation Results
Loss: 0.1499
Hamming Loss: 0.0515
Subset Accuracy (Exact Match): 0.7008
Sample Accuracy (Average): 0.9485

--- Macro Metrics ---
Precision: 0.9585
Recall: 0.9835
F1-Score: 0.9707

--- Micro Metrics ---
Precision: 0.9583
Recall: 0.9839
F1-Score: 0.9709

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9712       0.9946       0.9827       67772     
consistency                    0.9638       0.9761       0.9699       70048     
interestingness                0.9416       0.9986       0.9693       72682     
unbias                         0.9917       0.9786       0.9851       71084     
harmlessness                   0.9936       0.9908       0.9922       69380     
no_hallucination               0.9253       0.9518       0.9383       63862     
understandability              0.9479 

Epoch 3/5 [Train]: 100%|██████████| 20029/20029 [23:07<00:00, 14.43it/s, loss=0.183] 



[Epoch 3] Train Loss: 0.1331


                                                               


Epoch 3 Validation Evaluation Results
Loss: 0.1483
Hamming Loss: 0.0506
Subset Accuracy (Exact Match): 0.7049
Sample Accuracy (Average): 0.9494

--- Macro Metrics ---
Precision: 0.9574
Recall: 0.9860
F1-Score: 0.9713

--- Micro Metrics ---
Precision: 0.9571
Recall: 0.9863
F1-Score: 0.9715

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9733       0.9936       0.9834       67772     
consistency                    0.9616       0.9820       0.9717       70048     
interestingness                0.9414       0.9986       0.9692       72682     
unbias                         0.9927       0.9783       0.9855       71084     
harmlessness                   0.9942       0.9904       0.9923       69380     
no_hallucination               0.9186       0.9635       0.9405       63862     
understandability              0.9458 

Epoch 4/5 [Train]: 100%|██████████| 20029/20029 [23:11<00:00, 14.39it/s, loss=0.146] 



[Epoch 4] Train Loss: 0.1221


                                                               


Epoch 4 Validation Evaluation Results
Loss: 0.1526
Hamming Loss: 0.0511
Subset Accuracy (Exact Match): 0.7037
Sample Accuracy (Average): 0.9489

--- Macro Metrics ---
Precision: 0.9554
Recall: 0.9875
F1-Score: 0.9711

--- Micro Metrics ---
Precision: 0.9553
Recall: 0.9877
F1-Score: 0.9712

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9730       0.9945       0.9837       67772     
consistency                    0.9603       0.9841       0.9720       70048     
interestingness                0.9425       0.9967       0.9689       72682     
unbias                         0.9789       0.9896       0.9842       71084     
harmlessness                   0.9934       0.9910       0.9922       69380     
no_hallucination               0.9125       0.9688       0.9398       63862     
understandability              0.9483 

Epoch 5/5 [Train]: 100%|██████████| 20029/20029 [22:51<00:00, 14.61it/s, loss=0.0818]



[Epoch 5] Train Loss: 0.1111


                                                               


Epoch 5 Validation Evaluation Results
Loss: 0.1608
Hamming Loss: 0.0532
Subset Accuracy (Exact Match): 0.6902
Sample Accuracy (Average): 0.9468

--- Macro Metrics ---
Precision: 0.9605
Recall: 0.9789
F1-Score: 0.9695

--- Micro Metrics ---
Precision: 0.9604
Recall: 0.9794
F1-Score: 0.9698

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9732       0.9941       0.9836       67772     
consistency                    0.9662       0.9703       0.9683       70048     
interestingness                0.9439       0.9928       0.9677       72682     
unbias                         0.9891       0.9830       0.9861       71084     
harmlessness                   0.9939       0.9900       0.9919       69380     
no_hallucination               0.9310       0.9371       0.9341       63862     
understandability              0.9506 

                                                               


Final Test Evaluation Results
Loss: 0.1685
Hamming Loss: 0.0546
Subset Accuracy (Exact Match): 0.6849
Sample Accuracy (Average): 0.9454

--- Macro Metrics ---
Precision: 0.9592
Recall: 0.9786
F1-Score: 0.9687

--- Micro Metrics ---
Precision: 0.9591
Recall: 0.9791
F1-Score: 0.9690

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9711       0.9943       0.9825       42440     
consistency                    0.9660       0.9699       0.9679       43662     
interestingness                0.9430       0.9929       0.9673       45411     
unbias                         0.9881       0.9827       0.9854       44457     
harmlessness                   0.9936       0.9903       0.9919       43331     
no_hallucination               0.9291       0.9377       0.9334       39753     
understandability              0.9465       0.