In [2]:
# ===============================
# KoBERT Multi-label Training & Evaluation (with Context)
# ===============================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel
from kobert_transformers import get_tokenizer
from torch.optim import AdamW
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    hamming_loss, classification_report
)
from tqdm import tqdm

# -------------------------------
# 1. 데이터 로드
# -------------------------------
df_train_split = pd.read_csv("C:/Users/joseo/OneDrive/바탕 화면/train_df.csv")
df_val_split = pd.read_csv("C:/Users/joseo/OneDrive/바탕 화면/val_df.csv")
df_test = pd.read_csv("C:/Users/joseo/OneDrive/바탕 화면/test_df.csv")

label_cols = [
    "linguistic_acceptability", "consistency", "interestingness",
    "unbias", "harmlessness", "no_hallucination",
    "understandability", "sensibleness", "specificity"
]

# yes / no → 1 / 0 변환
for col in label_cols:
    df_train_split[col] = df_train_split[col].map({"yes": 1, "no": 0})
    df_val_split[col]  = df_val_split[col].map({"yes": 1, "no": 0})
    df_test[col]  = df_test[col].map({"yes": 1, "no": 0})

# context와 text 컬럼 모두 포함
df_train_split = df_train_split[["context", "text"] + label_cols].dropna()
df_val_split  = df_val_split[["context", "text"] + label_cols].dropna()
df_test  = df_test[["context", "text"] + label_cols].dropna()

print(f"Train samples: {len(df_train_split)}")
print(f"Validation samples: {len(df_val_split)}")
print(f"Test samples: {len(df_test)} (최종 평가용)")

# -------------------------------
# 2. Dataset 정의 (Context + Text)
# -------------------------------
class KoBERTDataset(Dataset):
    def __init__(self, contexts, texts, labels, tokenizer, max_len=512):
        """
        contexts: 질문/맥락
        texts: AI 응답
        labels: 9개 품질 라벨
        """
        self.contexts = contexts
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # BERT 입력 형식: [CLS] context [SEP] text [SEP]
        encoding = self.tokenizer(
            self.contexts[idx],
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.float)
        }

# -------------------------------
# 3. Tokenizer & DataLoader
# -------------------------------
tokenizer = get_tokenizer()

train_dataset = KoBERTDataset(
    df_train_split["context"].tolist(),
    df_train_split["text"].tolist(),
    df_train_split[label_cols].values.tolist(),
    tokenizer,
    max_len=512  # BERT 최대 길이로 증가
)
val_dataset = KoBERTDataset(
    df_val_split["context"].tolist(),
    df_val_split["text"].tolist(),
    df_val_split[label_cols].values.tolist(),
    tokenizer,
    max_len=512
)
test_dataset = KoBERTDataset(
    df_test["context"].tolist(),
    df_test["text"].tolist(),
    df_test[label_cols].values.tolist(),
    tokenizer,
    max_len=512
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)  # batch size 감소
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# -------------------------------
# 4. 모델 정의
# -------------------------------
class KoBERTMultiLabel(nn.Module):
    def __init__(self, num_labels, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained("skt/kobert-base-v1")
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = self.dropout(outputs.pooler_output)
        return self.classifier(pooled_output)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

model = KoBERTMultiLabel(num_labels=len(label_cols)).to(device)

# -------------------------------
# 5. Optimizer & Loss
# -------------------------------
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
criterion = nn.BCEWithLogitsLoss()

# -------------------------------
# 6. 평가 함수
# -------------------------------
def evaluate_model(model, dataloader, threshold=0.5):
    """
    Multi-label classification 평가
    Returns: dict with overall and per-label metrics
    """
    model.eval()
    all_preds = []
    all_labels = []
    all_probs = []
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            probs = torch.sigmoid(logits)
            preds = (probs > threshold).float()

            all_probs.append(probs.cpu().numpy())
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())

    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)
    all_probs = np.vstack(all_probs)

    # Overall metrics
    avg_loss = total_loss / len(dataloader)
    hamming = hamming_loss(all_labels, all_preds)

    # Subset accuracy (exact match)
    subset_acc = accuracy_score(all_labels, all_preds)

    # Per-sample accuracy (at least one correct)
    sample_acc = np.mean([
        accuracy_score(all_labels[i], all_preds[i])
        for i in range(len(all_labels))
    ])

    # Macro/Micro metrics
    precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='macro', zero_division=0
    )
    precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='micro', zero_division=0
    )

    # Per-label metrics
    per_label_metrics = {}
    for idx, label_name in enumerate(label_cols):
        precision, recall, f1, support = precision_recall_fscore_support(
            all_labels[:, idx], all_preds[:, idx],
            average='binary', zero_division=0
        )
        per_label_metrics[label_name] = {
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'support': int(all_labels[:, idx].sum())
        }

    results = {
        'loss': avg_loss,
        'hamming_loss': hamming,
        'subset_accuracy': subset_acc,
        'sample_accuracy': sample_acc,
        'macro': {
            'precision': precision_macro,
            'recall': recall_macro,
            'f1': f1_macro
        },
        'micro': {
            'precision': precision_micro,
            'recall': recall_micro,
            'f1': f1_micro
        },
        'per_label': per_label_metrics
    }

    return results, all_probs

def print_evaluation_results(results, phase="Test"):
    """평가 결과 출력"""
    print(f"\n{'='*60}")
    print(f"{phase} Evaluation Results")
    print(f"{'='*60}")
    print(f"Loss: {results['loss']:.4f}")
    print(f"Hamming Loss: {results['hamming_loss']:.4f}")
    print(f"Subset Accuracy (Exact Match): {results['subset_accuracy']:.4f}")
    print(f"Sample Accuracy (Average): {results['sample_accuracy']:.4f}")

    print(f"\n--- Macro Metrics ---")
    print(f"Precision: {results['macro']['precision']:.4f}")
    print(f"Recall: {results['macro']['recall']:.4f}")
    print(f"F1-Score: {results['macro']['f1']:.4f}")

    print(f"\n--- Micro Metrics ---")
    print(f"Precision: {results['micro']['precision']:.4f}")
    print(f"Recall: {results['micro']['recall']:.4f}")
    print(f"F1-Score: {results['micro']['f1']:.4f}")

    print(f"\n--- Per-Label Metrics ---")
    print(f"{'Label':<30} {'Precision':<12} {'Recall':<12} {'F1':<12} {'Support':<10}")
    print("-" * 76)
    for label_name, metrics in results['per_label'].items():
        print(f"{label_name:<30} "
              f"{metrics['precision']:<12.4f} "
              f"{metrics['recall']:<12.4f} "
              f"{metrics['f1']:<12.4f} "
              f"{metrics['support']:<10}")
    print(f"{'='*60}\n")

# -------------------------------
# 7. 학습 루프
# -------------------------------
EPOCHS = 5
best_f1 = 0
best_model_state = None

for epoch in range(EPOCHS):
    # Training
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    avg_train_loss = total_loss / len(train_loader)
    print(f"\n[Epoch {epoch+1}] Train Loss: {avg_train_loss:.4f}")

    # Validation
    val_results, _ = evaluate_model(model, val_loader)
    print_evaluation_results(val_results, phase=f"Epoch {epoch+1} Validation")

    # Save best model based on validation F1
    current_f1 = val_results['macro']['f1']
    if current_f1 > best_f1:
        best_f1 = current_f1
        best_model_state = model.state_dict().copy()
        print(f"✓ New best model saved! (Validation F1: {best_f1:.4f})")

# -------------------------------
# 8. 최종 평가 (Test 데이터)
# -------------------------------
print("\n" + "="*60)
print("Loading best model for FINAL evaluation on TEST set...")
print("="*60)

model.load_state_dict(best_model_state)
final_results, final_probs = evaluate_model(model, test_loader)
print_evaluation_results(final_results, phase="Final Test")

# -------------------------------
# 9. 모델 저장
# -------------------------------
SAVE_PATH = "C:/Users/joseo/OneDrive/바탕 화면/kobert_multilabel_context_best.pt"
torch.save({
    'model_state_dict': best_model_state,
    'label_cols': label_cols,
    'best_val_f1': best_f1,
    'test_results': final_results
}, SAVE_PATH)
print(f"\n✓ Best model saved to: {SAVE_PATH}")
print(f"  Best Validation F1-Score: {best_f1:.4f}")
print(f"  Final Test F1-Score: {final_results['macro']['f1']:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


Train samples: 319621
Validation samples: 44365
Test samples: 35496 (최종 평가용)
Using device: cuda


Epoch 1/5 [Train]:   0%|          | 9/39953 [00:01<1:36:09,  6.92it/s, loss=0.553]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1/5 [Train]:   0%|          | 36/39953 [00:05<1:32:21,  7.20it/s, loss=0.407]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1/5 [Train]:   0%|          | 57/39953 [00:08<1:32:51,  7.16it/s, loss=0.368]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 1/5 [Train]:   1%|          | 214/39953 [00:29<1:31:53,  7.21it/s, l


[Epoch 1] Train Loss: 0.1995


Evaluating:   9%|▉         | 493/5546 [00:24<04:09, 20.28it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:  13%|█▎        | 740/5546 [00:36<03:55, 20.41it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if som


Epoch 1 Validation Evaluation Results
Loss: 0.1705
Hamming Loss: 0.0614
Subset Accuracy (Exact Match): 0.6579
Sample Accuracy (Average): 0.9386

--- Macro Metrics ---
Precision: 0.9513
Recall: 0.9786
F1-Score: 0.9646

--- Micro Metrics ---
Precision: 0.9512
Recall: 0.9792
F1-Score: 0.9650

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9559       0.9855       0.9705       37125     
consistency                    0.9593       0.9660       0.9626       38176     
interestingness                0.9366       0.9965       0.9656       40317     
unbias                         0.9813       0.9828       0.9820       38785     
harmlessness                   0.9881       0.9907       0.9894       37666     
no_hallucination               0.9172       0.9377       0.9273       34436     
understandability              0.9440 

Epoch 2/5 [Train]:   0%|          | 42/39953 [00:06<1:32:00,  7.23it/s, loss=0.218] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2/5 [Train]:   0%|          | 47/39953 [00:06<1:31:35,  7.26it/s, loss=0.166] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2/5 [Train]:   0%|          | 75/39953 [00:10<1:31:49,  7.24it/s, loss=0.274] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 2/5 [Train]:   0%|          | 154/39953 [00:21<1:32:17,  7.19it/


[Epoch 2] Train Loss: 0.1601


Evaluating:   9%|▉         | 492/5546 [00:23<04:07, 20.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:  13%|█▎        | 738/5546 [00:35<03:51, 20.77it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if som


Epoch 2 Validation Evaluation Results
Loss: 0.1603
Hamming Loss: 0.0574
Subset Accuracy (Exact Match): 0.6769
Sample Accuracy (Average): 0.9426

--- Macro Metrics ---
Precision: 0.9506
Recall: 0.9845
F1-Score: 0.9671

--- Micro Metrics ---
Precision: 0.9505
Recall: 0.9849
F1-Score: 0.9674

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9593       0.9924       0.9755       37125     
consistency                    0.9533       0.9791       0.9660       38176     
interestingness                0.9361       0.9989       0.9665       40317     
unbias                         0.9871       0.9802       0.9836       38785     
harmlessness                   0.9884       0.9913       0.9898       37666     
no_hallucination               0.9056       0.9597       0.9319       34436     
understandability              0.9428 

Epoch 3/5 [Train]:   0%|          | 13/39953 [00:01<1:32:39,  7.18it/s, loss=0.286] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3/5 [Train]:   0%|          | 44/39953 [00:06<1:31:39,  7.26it/s, loss=0.18]  Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3/5 [Train]:   0%|          | 71/39953 [00:09<1:31:50,  7.24it/s, loss=0.0661]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 3/5 [Train]:   0%|          | 128/39953 [00:17<1:31:53,  7.22it/


[Epoch 3] Train Loss: 0.1467


Evaluating:   9%|▉         | 491/5546 [00:23<04:06, 20.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:  13%|█▎        | 740/5546 [00:35<03:53, 20.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if som


Epoch 3 Validation Evaluation Results
Loss: 0.1592
Hamming Loss: 0.0570
Subset Accuracy (Exact Match): 0.6761
Sample Accuracy (Average): 0.9430

--- Macro Metrics ---
Precision: 0.9527
Recall: 0.9826
F1-Score: 0.9673

--- Micro Metrics ---
Precision: 0.9525
Recall: 0.9830
F1-Score: 0.9675

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9667       0.9840       0.9753       37125     
consistency                    0.9509       0.9866       0.9684       38176     
interestingness                0.9432       0.9855       0.9639       40317     
unbias                         0.9847       0.9853       0.9850       38785     
harmlessness                   0.9892       0.9922       0.9907       37666     
no_hallucination               0.9165       0.9538       0.9348       34436     
understandability              0.9314 

Epoch 4/5 [Train]:   0%|          | 13/39953 [00:01<1:32:36,  7.19it/s, loss=0.0644]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4/5 [Train]:   0%|          | 74/39953 [00:10<1:30:36,  7.33it/s, loss=0.061] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4/5 [Train]:   0%|          | 133/39953 [00:18<1:32:15,  7.19it/s, loss=0.0399]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 4/5 [Train]:   0%|          | 136/39953 [00:18<1:32:40,  7.16it


[Epoch 4] Train Loss: 0.1354


Evaluating:   9%|▉         | 492/5546 [00:23<04:05, 20.60it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:  13%|█▎        | 738/5546 [00:35<03:52, 20.65it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if som


Epoch 4 Validation Evaluation Results
Loss: 0.1628
Hamming Loss: 0.0557
Subset Accuracy (Exact Match): 0.6852
Sample Accuracy (Average): 0.9443

--- Macro Metrics ---
Precision: 0.9537
Recall: 0.9831
F1-Score: 0.9680

--- Micro Metrics ---
Precision: 0.9535
Recall: 0.9834
F1-Score: 0.9682

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9670       0.9887       0.9777       37125     
consistency                    0.9593       0.9761       0.9676       38176     
interestingness                0.9409       0.9912       0.9654       40317     
unbias                         0.9834       0.9866       0.9850       38785     
harmlessness                   0.9914       0.9886       0.9900       37666     
no_hallucination               0.9127       0.9592       0.9353       34436     
understandability              0.9443 

Epoch 5/5 [Train]:   0%|          | 12/39953 [00:01<1:32:56,  7.16it/s, loss=0.174]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5/5 [Train]:   0%|          | 117/39953 [00:16<1:30:09,  7.36it/s, loss=0.236] Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5/5 [Train]:   0%|          | 120/39953 [00:16<1:28:55,  7.47it/s, loss=0.0871]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Epoch 5/5 [Train]:   0%|          | 160/39953 [00:22<1:31:41,  7.23it


[Epoch 5] Train Loss: 0.1243


Evaluating:   9%|▉         | 492/5546 [00:23<04:05, 20.57it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:  13%|█▎        | 738/5546 [00:35<03:53, 20.56it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if som


Epoch 5 Validation Evaluation Results
Loss: 0.1597
Hamming Loss: 0.0559
Subset Accuracy (Exact Match): 0.6805
Sample Accuracy (Average): 0.9441

--- Macro Metrics ---
Precision: 0.9563
Recall: 0.9799
F1-Score: 0.9679

--- Micro Metrics ---
Precision: 0.9562
Recall: 0.9802
F1-Score: 0.9681

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9668       0.9908       0.9787       37125     
consistency                    0.9593       0.9782       0.9687       38176     
interestingness                0.9434       0.9845       0.9635       40317     
unbias                         0.9864       0.9830       0.9847       38785     
harmlessness                   0.9901       0.9924       0.9913       37666     
no_hallucination               0.9161       0.9564       0.9358       34436     
understandability              0.9508 

Evaluating:   2%|▏         | 73/4437 [00:03<03:27, 21.05it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   9%|▊         | 388/4437 [00:18<03:13, 20.94it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Evaluating:   9%|▉         | 397/4437 [00:19<03:14, 20.81it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation st


Final Test Evaluation Results
Loss: 0.1553
Hamming Loss: 0.0551
Subset Accuracy (Exact Match): 0.6843
Sample Accuracy (Average): 0.9449

--- Macro Metrics ---
Precision: 0.9575
Recall: 0.9796
F1-Score: 0.9684

--- Micro Metrics ---
Precision: 0.9575
Recall: 0.9798
F1-Score: 0.9685

--- Per-Label Metrics ---
Label                          Precision    Recall       F1           Support   
----------------------------------------------------------------------------
linguistic_acceptability       0.9701       0.9913       0.9806       29699     
consistency                    0.9608       0.9751       0.9679       30596     
interestingness                0.9439       0.9827       0.9629       32227     
unbias                         0.9882       0.9841       0.9861       31074     
harmlessness                   0.9907       0.9922       0.9914       30166     
no_hallucination               0.9186       0.9564       0.9372       27545     
understandability              0.9503       0.

In [1]:
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.9.1+cu130
CUDA available: True
CUDA version: 13.0
GPU count: 1
GPU name: NVIDIA GeForce RTX 5070 Ti
