In [5]:
!pip install transformers datasets torch
!pip install transformers accelerate

from datasets import load_dataset
from transformers import ElectraForTokenClassification, ElectraTokenizerFast, AdamW, get_scheduler
import torch
from torch.utils.data import DataLoader, Dataset
from torch.nn import CrossEntropyLoss




In [6]:
# 데이터셋 로드
ds = load_dataset("humane-lab/K-HATERS")

# 레이블 매핑
label_mapping = {
    "normal": 0,
    "offensive": 1,
    "L1_hate": 2,
    "L2_hate": 3
}

# 토크나이저 및 라벨 생성
tokenizer = ElectraTokenizerFast.from_pretrained("beomi/KcELECTRA-base-v2022")

def create_token_labels(texts, rationales, tokenizer):
    tokenized_texts = tokenizer(texts, truncation=True, padding=True, return_offsets_mapping=True)
    labels = []

    for i, (text, rationale_spans) in enumerate(zip(texts, rationales)):
        token_labels = [0] * len(tokenized_texts['input_ids'][i])
        for span in rationale_spans:
            start, end = span
            for idx, (offset_start, offset_end) in enumerate(tokenized_texts['offset_mapping'][i]):
                if offset_start >= start and offset_end <= end:
                    token_labels[idx] = 1
        labels.append(token_labels)

    tokenized_texts.pop('offset_mapping')
    tokenized_texts['labels'] = labels
    return tokenized_texts

# 데이터 준비
train_data = create_token_labels(ds['train']['text'], ds['train']['offensiveness_rationale'], tokenizer)
validation_data = create_token_labels(ds['validation']['text'], ds['validation']['offensiveness_rationale'], tokenizer)
test_data = create_token_labels(ds['test']['text'], ds['test']['offensiveness_rationale'], tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/4.93k [00:00<?, ?B/s]

train.jsonl:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

val.jsonl:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

test.jsonl:   0%|          | 0.00/2.77M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/172158 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/450k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/504 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'ElectraTokenizerFast'.


In [7]:
class CustomDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.encodings['labels'][idx], dtype=torch.long)
        }

# 데이터셋 생성
train_dataset = CustomDataset(train_data)
validation_dataset = CustomDataset(validation_data)
test_dataset = CustomDataset(test_data)

# 데이터 로더 생성
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=16)
test_loader = DataLoader(test_dataset, batch_size=16)


In [10]:
# 라벨 검증
for i in range(3):  # 처음 3개 샘플 확인
    print("Text:", ds['train']['text'][i])
    print("Labels:", train_data['labels'][i])


Text: 하나도 모르는 얼라 쉭 끼가 설치는꼬라서니가 무릇 텅빈 백 정한테 칼자루 쥐여준 형국 민 좃 당 애 세들 속은 닐리리 맘보 통탄스럽구나 나라의 흥망성쇠 갈림길에 저런 등 신 들이 관여 한다는것이
Labels: [0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [11]:
for i in range(len(train_data['input_ids'])):
    if len(train_data['input_ids'][i]) != len(train_data['labels'][i]):
        print(f"Mismatch in sample {i}: Input ID length = {len(train_data['input_ids'][i])}, Label length = {len(train_data['labels'][i])}")


In [12]:
#레이셔널 확인
print("Offensiveness rationale:", ds['train']['offensiveness_rationale'][0])


Offensiveness rationale: [[8, 10], [11, 14], [50, 51], [54, 57], [93, 96]]


In [13]:
from collections import Counter

label_distribution = Counter([label for labels in train_data['labels'] for label in labels])
print("Label distribution:", label_distribution)


Label distribution: Counter({0: 45875373, 1: 5255553})


In [14]:
def extract_rationale_from_offsets(text, rationale_offsets, tokenizer):
    """
    오프셋 매핑을 통해 레이셔널(유해) 포지션에 해당하는 원문 텍스트를 추출합니다.

    Args:
        text (str): 원문 텍스트.
        rationale_offsets (list): 유해 포지션의 리스트 (예: [[start1, end1], [start2, end2]]).
        tokenizer: Hugging Face tokenizer 객체.

    Returns:
        harmful_texts (list): 유해 텍스트 조각의 리스트.
    """
    # 토큰화와 오프셋 매핑 생성
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        padding=True
    )

    offsets = tokenized['offset_mapping']  # 오프셋 매핑

    # 유해 포지션에 해당하는 텍스트 추출
    harmful_texts = []
    for start, end in rationale_offsets:
        harmful_texts.append(text[start:end])  # 시작과 끝 위치로 원문에서 텍스트 추출

    return harmful_texts

# 예제 텍스트 및 라벨
sample_text = ds['train']['text'][0]
sample_rationale = ds['train']['offensiveness_rationale'][0]  # 예: [[8, 10], [11, 14], ...]

# 유해 텍스트 추출
harmful_texts = extract_rationale_from_offsets(sample_text, sample_rationale, tokenizer)

print("Original Text:", sample_text)
print("Offensiveness Rationale:", sample_rationale)
print("Harmful Texts:", harmful_texts)


Original Text: 하나도 모르는 얼라 쉭 끼가 설치는꼬라서니가 무릇 텅빈 백 정한테 칼자루 쥐여준 형국 민 좃 당 애 세들 속은 닐리리 맘보 통탄스럽구나 나라의 흥망성쇠 갈림길에 저런 등 신 들이 관여 한다는것이
Offensiveness Rationale: [[8, 10], [11, 14], [50, 51], [54, 57], [93, 96]]
Harmful Texts: ['얼라', '쉭 끼', '좃', '애 세', '등 신']


In [31]:
# 모델 초기화
model = ElectraForTokenClassification.from_pretrained(
    "beomi/KcELECTRA-base-v2022",
    num_labels=2  # 유해 여부: [0, 1]
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 손실 함수 및 Optimizer 설정
class_weights = torch.tensor([1.0, 8.7]).to(device) # 데이터 비율이 9:1이라 가중치
loss_fn = CrossEntropyLoss(weight=class_weights)

optimizer = AdamW(model.parameters(), lr=3e-5)
num_training_steps = len(train_loader) * 3
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=500, num_training_steps=num_training_steps
)

# 학습 함수
def train_one_epoch(model, data_loader, optimizer, lr_scheduler, device, loss_fn):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        total_loss += loss.item()
    return total_loss

# 학습 실행
for epoch in range(3):
    epoch_loss = train_one_epoch(model, train_loader, optimizer, lr_scheduler, device, loss_fn)
    print(f"Epoch {epoch + 1}, Loss: {epoch_loss:.4f}")


Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base-v2022 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


KeyboardInterrupt: 

In [19]:
from google.colab import drive
drive.mount('/content/drive')

save_path = "/content/drive/MyDrive/yaife/detector3/model_checkpoint"


model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print("Model saved.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved.


In [32]:
# 평가 함수 정의
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

def evaluate_model(model, data_loader, tokenizer, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=-1).view(-1).cpu().numpy()
            labels = labels.view(-1).cpu().numpy()

            active_preds = preds[labels != -100]
            active_labels = labels[labels != -100]

            all_preds.extend(active_preds)
            all_labels.extend(active_labels)

    # 메트릭 계산
    precision = precision_score(all_labels, all_preds, average="binary", zero_division=1)
    recall = recall_score(all_labels, all_preds, average="binary", zero_division=1)
    f1 = f1_score(all_labels, all_preds, average="binary", zero_division=1)
    print(classification_report(all_labels, all_preds))
    return precision, recall, f1

# 검증 데이터 평가
precision, recall, f1 = evaluate_model(model, validation_loader, tokenizer, device)
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")


              precision    recall  f1-score   support

           0       0.97      0.97      0.97   1429282
           1       0.75      0.73      0.74    160718

    accuracy                           0.95   1590000
   macro avg       0.86      0.85      0.86   1590000
weighted avg       0.95      0.95      0.95   1590000

Precision: 0.7487, Recall: 0.7296, F1-Score: 0.7391


In [33]:
def extract_negative_spans_from_text(text, model, tokenizer, device):
    """
    입력 텍스트에서 유해 span을 감지 및 추출하는 함수.

    Args:
    - text (str): 입력 텍스트
    - model: 학습된 Token Classification 모델
    - tokenizer: Tokenizer
    - device: 'cpu' 또는 'cuda'

    Returns:
    - harmful_spans (list): 유해한 span과 위치 정보 [(span_text, (start_idx, end_idx))]
    """
    # 1. 텍스트 전처리 및 토크나이즈
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        return_offsets_mapping=True
    ).to(device)
    offset_mapping = inputs.pop("offset_mapping")[0]  # Offset Mapping 저장

    # 2. 모델 예측
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].squeeze().cpu().numpy())

    # 3. 후처리: 유해 span 추출
    harmful_spans = []
    current_span = ""
    current_offsets = None

    for token, pred, (start, end) in zip(tokens, predictions, offset_mapping.tolist()):
        if pred == 1 and token not in tokenizer.all_special_tokens:  # 유해 토큰 감지
            if token.startswith("##"):  # 서브워드 토큰
                current_span += token[2:]
                current_offsets = (current_offsets[0], end)
            else:
                if current_span:  # 이전 스팬 저장
                    harmful_spans.append((current_span, current_offsets))
                current_span = token
                current_offsets = (start, end)
        else:
            if current_span:  # 최종 스팬 저장
                harmful_spans.append((current_span, current_offsets))
                current_span = ""
                current_offsets = None

    if current_span:  # 남은 스팬 저장
        harmful_spans.append((current_span, current_offsets))

    return harmful_spans


In [34]:
input_text = "너는 정말 최악이야. 무식한 행동 좀 하지마."

harmful_spans = extract_negative_spans_from_text(input_text, model, tokenizer, device)

print("Input Text:", input_text)
print("Harmful Spans:", harmful_spans)


Input Text: 너는 정말 최악이야. 무식한 행동 좀 하지마.
Harmful Spans: [('최악', (6, 8)), ('무식한', (12, 15))]
