In [1]:
!pip install scikit-learn bitsandbytes peft accelerate transformers

Collecting bitsandbytes
  Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.49.1-py3-none-manylinux_2_24_x86_64.whl (59.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.49.1


In [2]:
import glob
import numpy as np
import pandas as pd
import os
import json
import zipfile

from datasets import Dataset
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import (
    DebertaV2ForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    BertModel,
    BertTokenizer
)
from sklearn.model_selection import GroupShuffleSplit

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, AutoModel
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

from IPython.display import display


In [3]:
import os
from google.colab import userdata

# Hugging Face
os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")
os.environ["HUGGINGFACE_HUB_TOKEN"] = os.environ["HF_TOKEN"]  # 안전빵

# Weights & Biases
# os.environ["WANDB_API_KEY"] = userdata.get("WANDB_API_KEY")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# 데이터 호출하기
from datasets import load_from_disk
encoded_test = load_from_disk("/content/drive/MyDrive/quality_ai_data/encoded_test")
encoded_val = load_from_disk("/content/drive/MyDrive/quality_ai_data/encoded_val")
encoded_train=load_from_disk("/content/drive/MyDrive/quality_ai_data/encoded_train")

In [6]:
labels = ['linguistic_acceptability', 'consistency', 'interestingness', 'unbias',
          'harmlessness', 'no_hallucination', 'understandability', 'sensibleness', 'specificity']

id2label = {idx: label for idx, label in enumerate(labels)}
label2id = {label: idx for idx, label in enumerate(labels)}

In [7]:
def load_csv_data(directory):
    all_files_in_dir = glob.glob(os.path.join(directory, "*.csv"))
    df_list = []
    print(f"Loading CSV files from: {directory}")

    if not all_files_in_dir:
        print(f"No CSV files found in {directory}. Returning empty DataFrame.")
        return pd.DataFrame()

    for file_path in all_files_in_dir:
        filename = os.path.basename(file_path)
        print(f"Attempting to load: {file_path}")
        current_df = None
        try:
            current_df = pd.read_csv(file_path, encoding="utf-8")
            print(f"Successfully loaded {filename} with utf-8 encoding.")
        except UnicodeDecodeError:
            try:
                current_df = pd.read_csv(file_path, encoding="cp949")
                print(f"Successfully loaded {filename} with cp949 encoding.")
            except UnicodeDecodeError:
                try:
                    current_df = pd.read_csv(file_path, encoding="euc-kr")
                    print(f"Successfully loaded {filename} with euc-kr encoding.")
                except pd.errors.EmptyDataError:
                    print(f"Skipping empty CSV file: {filename}")
                except FileNotFoundError:
                    print(f"File not found: {filename}")
                except Exception as e:
                    print(f"Error loading {filename} with euc-kr encoding: {e}")
            except pd.errors.EmptyDataError:
                print(f"Skipping empty CSV file: {filename}")
            except FileNotFoundError:
                print(f"File not found: {filename}")
            except Exception as e:
                print(f"An unexpected error occurred while loading {filename} with cp949 encoding: {e}")
        except pd.errors.EmptyDataError:
            print(f"Skipping empty CSV file: {filename}")
        except FileNotFoundError:
            print(f"File not found: {filename}")
        except Exception as e:
            print(f"An unexpected error occurred while loading {filename} with utf-8 encoding: {e}")

        if current_df is not None:
            df_list.append(current_df)

    if df_list:
        return pd.concat(df_list, ignore_index=True)
    else:
        print(f"No dataframes were successfully loaded from {directory}. Returning empty DataFrame.")
        return pd.DataFrame()

In [8]:
def create_conversation_history(df: pd.DataFrame, keep_last_n=5, sep_token=" [SEP] ") -> pd.DataFrame:
    df_with_history = df.copy()

    # utterance_id에서 숫자 추출 (예: "u3" -> 3)
    df_with_history["utterance_num"] = df_with_history["utterance_id"].apply(lambda x: int(str(x).split("u")[1]))
    df_with_history = df_with_history.sort_values(by=["conversation_id", "utterance_num"]).reset_index(drop=True)

    df_with_history["context"] = ""

    for conv_id, group in df_with_history.groupby("conversation_id"):
        history_list = []
        for idx in group.index:
            recent_history = history_list[-keep_last_n:]
            df_with_history.loc[idx, "context"] = sep_token.join(recent_history)
            history_list.append(df_with_history.loc[idx, "text"])

    df_with_history = df_with_history.drop(columns=["utterance_num"])
    return df_with_history

def consolidate_labels_by_voting(df: pd.DataFrame) -> pd.DataFrame:
    label_to_numeric = {"yes": 1, "no": 0}

    df_numeric = df.copy()
    for label_col in labels:
        df_numeric[label_col] = df_numeric[label_col].map(label_to_numeric)

    aggregation_dict = {"text": "first"}
    for label_col in labels:
        aggregation_dict[label_col] = "sum"

    grouped_df = (
        df_numeric
        .groupby(["conversation_id", "utterance_id"])
        .agg(aggregation_dict)
        .reset_index()
    )

    # voting rule: 2명 이상 yes면 yes (예: 3명 중 2명 이상)
    for label_col in labels:
        grouped_df[label_col] = grouped_df[label_col].apply(lambda x: "yes" if x >= 2 else "no")

    return grouped_df

train_df = load_csv_data("/content/drive/MyDrive/quality_ai_data/csv_train_data")
val_df = load_csv_data("/content/drive/MyDrive/quality_ai_data/csv_val_data")
consolidated_train_df = consolidate_labels_by_voting(train_df)
consolidated_val_df = consolidate_labels_by_voting(val_df)

train_df_with_history = create_conversation_history(consolidated_train_df, keep_last_n=5)
val_df_with_history = create_conversation_history(consolidated_val_df, keep_last_n=5)

Loading CSV files from: /content/drive/MyDrive/quality_ai_data/csv_train_data
Attempting to load: /content/drive/MyDrive/quality_ai_data/csv_train_data/TL_TL_발화·대화단위 평가 데이터_TL_기술_과학.csv
Successfully loaded TL_TL_발화·대화단위 평가 데이터_TL_기술_과학.csv with utf-8 encoding.
Attempting to load: /content/drive/MyDrive/quality_ai_data/csv_train_data/TL_TL_발화·대화단위 평가 데이터_TL_미용_건강_식음료.csv
Successfully loaded TL_TL_발화·대화단위 평가 데이터_TL_미용_건강_식음료.csv with utf-8 encoding.
Attempting to load: /content/drive/MyDrive/quality_ai_data/csv_train_data/TL_TL_발화·대화단위 평가 데이터_TL_경제활동_상품상거래.csv
Successfully loaded TL_TL_발화·대화단위 평가 데이터_TL_경제활동_상품상거래.csv with utf-8 encoding.
Attempting to load: /content/drive/MyDrive/quality_ai_data/csv_train_data/TL_TL_발화·대화단위 평가 데이터_TL_엔터테인먼트_오락_예술.csv
Successfully loaded TL_TL_발화·대화단위 평가 데이터_TL_엔

In [9]:
tokenizer = AutoTokenizer.from_pretrained("snunlp/KR-BERT-char16424")

def preprocess_function(examples):
    # Segment A: text (평가 대상 발화)
    # Segment B: context (그전 대화 맥락)
    # truncation="only_second"는 전체 길이가 길어질 때
    # 평가 대상인 Segment A는 보존하고 맥락인 Segment B의 앞쪽을 자르라는 의미입니다.

    # 'text'나 'context'가 None(NaN)인 경우를 대비해 빈 문자열 처리
    texts = [str(t) if t is not None else "" for t in examples["text"]]
    contexts = [str(c) if c is not None else "" for c in examples["context"]]

    # 두 문장을 결합하여 토큰화 (truncation_side='left'에 의해 앞의 context부터 잘림)
    encoding = tokenizer(
        contexts, # 첫 번째 문장 (맥락)
        texts,    # 두 번째 문장 (현재 발화)
        padding="max_length",
        truncation=True, # 전체 순서에서 왼쪽부터 자름
        max_length=512
    )
    # 레이블 처리 (기존과 동일)
    labels_matrix = []
    for i in range(len(examples["text"])):
        label_vector = [1.0 if examples[label][i] == 'yes' else 0.0 for label in labels]
        labels_matrix.append(label_vector)
    encoding["labels"] = labels_matrix
    return encoding

def split_train_test_by_conversation(df: pd.DataFrame, test_size=0.1, seed=42):
    gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    groups = df["conversation_id"].values

    train_idx, test_idx = next(gss.split(df, groups=groups))
    train_split = df.iloc[train_idx].reset_index(drop=True)
    test_split = df.iloc[test_idx].reset_index(drop=True)
    return train_split, test_split


# Convert to HuggingFace Dataset, using the new DataFrames with history
train_split_df, test_split_df = split_train_test_by_conversation(
    train_df_with_history,
    test_size=0.1,   # 원하는 test 비율로 변경 가능
    seed=42
)

train_dataset = Dataset.from_pandas(train_df_with_history)
val_dataset = Dataset.from_pandas(val_df_with_history)
test_dataset  = Dataset.from_pandas(test_split_df)

print("Tokenizing Datasets with conversation history...")
encoded_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
encoded_val = val_dataset.map(preprocess_function, batched=True, remove_columns=val_dataset.column_names)
encoded_test = test_dataset.map(preprocess_function, batched=True, remove_columns=test_dataset.column_names)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Tokenizing Datasets with conversation history...


Map:   0%|          | 0/400572 [00:00<?, ? examples/s]

Map:   0%|          | 0/50047 [00:00<?, ? examples/s]

Map:   0%|          | 0/40042 [00:00<?, ? examples/s]

In [10]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# 모델 호출하기
# tokenizer = BertTokenizer.from_pretrained("snunlp/KR-BERT-char16424")

# Fix: Use AutoModelForSequenceClassification to correctly load the BERT-based model
model = AutoModelForSequenceClassification.from_pretrained(
    "snunlp/KR-BERT-char16424",
    problem_type="multi_label_classification",
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id)

model.resize_token_embeddings(len(tokenizer))

training_args = TrainingArguments(
    output_dir="your-model",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    push_to_hub=True,
    hub_model_id="snunlp/KR-BERT-char16424",
)

def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average='micro')
    # accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
                'roc_auc': roc_auc}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

trainer = Trainer(
    model,
    training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()

pytorch_model.bin:   0%|          | 0.00/397M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-BERT-char16424 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/397M [00:00<?, ?B/s]

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss,F1,Roc Auc
1,0.1389,0.139325,0.972532,0.843251
2,0.1256,0.135961,0.97312,0.850452


TrainOutput(global_step=12518, training_loss=0.14088660386218527, metrics={'train_runtime': 16295.9652, 'train_samples_per_second': 49.162, 'train_steps_per_second': 0.768, 'total_flos': 2.108030915139748e+17, 'train_loss': 0.14088660386218527, 'epoch': 2.0})