# Task

In [3]:
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
data_path1 = "merged_with_domain_vote_test.csv"
df1 = pd.read_csv(data_path1)

In [None]:
data_path = "merged_with_domain_vote_train.csv"
df = pd.read_csv(data_path)
print(df.head())
print(df.columns)

  utterance_id linguistic_acceptability consistency interestingness unbias  \
0       c1.u10                       no         yes             yes    yes   
1       c1.u12                       no         yes             yes    yes   
2       c1.u14                       no         yes             yes    yes   
3       c1.u16                       no         yes             yes    yes   
4        c1.u2                       no         yes             yes    yes   

  harmlessness no_hallucination understandability sensibleness specificity  \
0          yes              yes               yes          yes         yes   
1          yes              yes               yes          yes         yes   
2          yes              yes               yes          yes         yes   
3          yes              yes               yes          yes         yes   
4          yes              yes                no          yes         yes   

                                                text  
0  네, 최

In [6]:
def create_conversation_history(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # conversation_id / utterance_num 분리
    df["conversation_id"] = df["utterance_id"].apply(
        lambda x: x.split(".")[0]
    )
    df["utterance_num"] = df["utterance_id"].apply(
        lambda x: int(x.split(".u")[1])
    )

    # 대화별 + 순서별 정렬
    df = (
        df.sort_values(by=["conversation_id", "utterance_num"])
          .reset_index(drop=True)
    )

    # context 컬럼
    df["context"] = ""

    for conv_id, group in df.groupby("conversation_id"):
        history = []
        for idx in group.index:
            df.loc[idx, "context"] = " ".join(history)
            history.append(df.loc[idx, "text"])

    return df.drop(columns=["utterance_num"])


In [10]:
df = create_conversation_history(df)

In [11]:
df["input_text"] = df["context"] + " " + df["text"]
TEXT_COL = "input_text"

df = df.dropna(subset=[TEXT_COL])
df = df[df[TEXT_COL].str.strip() != ""]
df = df.reset_index(drop=True)

In [12]:
LABEL_COLS = [
    "linguistic_acceptability",
    "consistency",
    "interestingness",
    "unbias",
    "harmlessness",
    "no_hallucination",
    "understandability",
    "sensibleness",
    "specificity"
]

label_map = {"yes": 1, "no": 0}

for col in LABEL_COLS:
    df[col] = df[col].map(label_map)

df[LABEL_COLS] = df[LABEL_COLS].apply(pd.to_numeric)

In [15]:
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

encodings = tokenizer(
    df[TEXT_COL].tolist(),
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

labels = torch.tensor(
    df[LABEL_COLS].values,
    dtype=torch.float
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/61.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [16]:
class QualityDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item

In [17]:
dataset = QualityDataset(encodings, labels)

In [18]:
train_idx, valid_idx = train_test_split(
    range(len(dataset)),
    test_size=0.2,
    random_state=42,
    shuffle=True
)

train_dataset = Subset(dataset, train_idx)
valid_dataset = Subset(dataset, valid_idx)

In [19]:
train_loader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True
)

valid_loader = DataLoader(
    valid_dataset,
    batch_size=16,
    shuffle=False
)

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=9,
    problem_type="multi_label_classification"
)

model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

pytorch_model.bin:   0%|          | 0.00/452M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    step = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            )

            loss = F.binary_cross_entropy_with_logits(
                outputs.logits,
                batch["labels"]
            )

            total_loss += loss.item()
            step += 1

    model.train()
    return total_loss / step

In [16]:
EPOCHS = 3

model.train()

for epoch in range(EPOCHS):
    total_loss = 0
    step = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()

        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

        loss = F.binary_cross_entropy_with_logits(
            outputs.logits,
            batch["labels"]
        )

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        step += 1

    train_loss = total_loss / step
    valid_loss = evaluate(model, valid_loader, device)

    print(
        f"Epoch {epoch+1}/{EPOCHS} | "
        f"Train Loss: {train_loss:.4f} | "
        f"Valid Loss: {valid_loss:.4f}"
    )

Epoch 1:   0%|          | 0/20029 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/452M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 20029/20029 [1:54:12<00:00,  2.92it/s]


Epoch 1/3 | Train Loss: 0.1598 | Valid Loss: 0.1472


Epoch 2: 100%|██████████| 20029/20029 [1:54:14<00:00,  2.92it/s]


Epoch 2/3 | Train Loss: 0.1329 | Valid Loss: 0.1375


Epoch 3: 100%|██████████| 20029/20029 [1:54:15<00:00,  2.92it/s]


Epoch 3/3 | Train Loss: 0.1179 | Valid Loss: 0.1387


In [None]:
## 모델 저장
SAVE_PATH = "koelectra1_with_context_256"

model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

In [26]:
## 모델 저장
SAVE_PATH = "/content/drive/MyDrive/멋쟁이 사자처럼 AI NLP /실전 프로젝트1/koelectra1_with_context_256"

In [13]:
df1_processed = df1.copy()

# Apply the conversation history creation to df1_processed
df1_processed = create_conversation_history(df1_processed)

# Create the input_text column for df1_processed
df1_processed["input_text"] = df1_processed["context"] + " " + df1_processed["text"]

df1_processed = df1_processed.dropna(subset=[TEXT_COL])
df1_processed = df1_processed[df1_processed[TEXT_COL].str.strip() != ""]
df1_processed = df1_processed.reset_index(drop=True)

for col in LABEL_COLS:
    df1_processed[col] = df1_processed[col].map(label_map)

df1_processed[LABEL_COLS] = df1_processed[LABEL_COLS].apply(pd.to_numeric)

print(df1_processed.head())
print(df1_processed.shape)

  utterance_id  linguistic_acceptability  consistency  interestingness  \
0    c10028.u2                         1            1                0   
1    c10028.u4                         0            1                0   
2    c10028.u6                         1            1                1   
3    c10028.u8                         1            1                1   
4   c10028.u10                         1            1                1   

   unbias  harmlessness  no_hallucination  understandability  sensibleness  \
0       1             1                 1                  1             1   
1       1             1                 1                  1             1   
2       1             1                 1                  1             1   
3       1             1                 1                  1             1   
4       1             1                 1                  1             1   

   specificity                                               text  \
0            0   

In [None]:
eval_texts = df1_processed[TEXT_COL].tolist()

eval_encodings = tokenizer(
    eval_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

eval_labels = torch.tensor(
    df1_processed[LABEL_COLS].values,
    dtype=torch.float
)

eval_dataset = QualityDataset(eval_encodings, eval_labels)

eval_loader = DataLoader(
    eval_dataset,
    batch_size=16,
    shuffle=False # No need to shuffle for evaluation
)

print(f"Evaluation dataset size: {len(eval_dataset)}")

Evaluation dataset size: 50047


In [28]:
from sklearn.metrics import f1_score, precision_score, recall_score, hamming_loss
import numpy as np
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm.notebook import tqdm # Use tqdm.notebook for Colab compatibility

# Define device to ensure it's available in this scope
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Reload the saved model and tokenizer to ensure they are available
loaded_tokenizer = AutoTokenizer.from_pretrained(SAVE_PATH)
loaded_model = AutoModelForSequenceClassification.from_pretrained(SAVE_PATH)
loaded_model.to(device)

loaded_model.eval() # Ensure model is in evaluation mode

all_labels = []
all_preds = []

with torch.no_grad():
    for batch in tqdm(eval_loader, desc="Calculating Metrics"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = loaded_model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"]
        )

        logits = outputs.logits
        probabilities = torch.sigmoid(logits) # Convert logits to probabilities
        predictions = (probabilities > 0.5).int() # Apply a threshold to get binary predictions

        all_labels.append(batch["labels"].cpu().numpy())
        all_preds.append(predictions.cpu().numpy())

# Concatenate all labels and predictions
all_labels = np.concatenate(all_labels, axis=0)
all_preds = np.concatenate(all_preds, axis=0)

# Calculate metrics per label
precision_per_label = precision_score(all_labels, all_preds, average=None, zero_division=0)
recall_per_label = recall_score(all_labels, all_preds, average=None, zero_division=0)
f1_per_label = f1_score(all_labels, all_preds, average=None, zero_division=0)

# Calculate overall micro-averaged metrics
overall_precision = precision_score(all_labels, all_preds, average='micro', zero_division=0)
overall_recall = recall_score(all_labels, all_preds, average='micro', zero_division=0)
overall_f1 = f1_score(all_labels, all_preds, average='micro', zero_division=0)

# Calculate overall macro-averaged metrics
macro_precision = precision_score(all_labels, all_preds, average='macro', zero_division=0)
macro_recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
macro_f1 = f1_score(all_labels, all_preds, average='macro', zero_division=0)

print("\n--- Metrics Per Label ---")
for i, label_name in enumerate(LABEL_COLS):
    print(f"{label_name}:\n  Precision: {precision_per_label[i]:.4f}\n  Recall:    {recall_per_label[i]:.4f}\n  F1-score:  {f1_per_label[i]:.4f}")

print("\n--- Overall Micro-averaged Metrics ---")
print(f"Overall Precision: {overall_precision:.4f}")
print(f"Overall Recall:    {overall_recall:.4f}")
print(f"Overall F1-score:  {overall_f1:.4f}")

print("\n--- Overall Macro-averaged Metrics ---")
print(f"Macro Precision: {macro_precision:.4f}")
print(f"Macro Recall:    {macro_recall:.4f}")
print(f"Macro F1-score:  {macro_f1:.4f}")

Calculating Metrics:   0%|          | 0/3128 [00:00<?, ?it/s]


--- Metrics Per Label ---
linguistic_acceptability:
  Precision: 0.9251
  Recall:    0.9856
  F1-score:  0.9544
consistency:
  Precision: 0.9591
  Recall:    0.9623
  F1-score:  0.9607
interestingness:
  Precision: 0.9408
  Recall:    0.9952
  F1-score:  0.9672
unbias:
  Precision: 0.9324
  Recall:    0.9892
  F1-score:  0.9600
harmlessness:
  Precision: 0.9223
  Recall:    0.9872
  F1-score:  0.9536
no_hallucination:
  Precision: 0.9028
  Recall:    0.9372
  F1-score:  0.9196
understandability:
  Precision: 0.9349
  Recall:    0.9691
  F1-score:  0.9517
sensibleness:
  Precision: 0.9400
  Recall:    0.9659
  F1-score:  0.9528
specificity:
  Precision: 0.9528
  Recall:    0.9983
  F1-score:  0.9750

--- Overall Micro-averaged Metrics ---
Overall Precision: 0.9348
Overall Recall:    0.9772
Overall F1-score:  0.9555

--- Overall Macro-averaged Metrics ---
Macro Precision: 0.9345
Macro Recall:    0.9767
Macro F1-score:  0.9550
