<a href="https://colab.research.google.com/github/CAU-CAPSTONE-2-Friendly-Keyboard/Models/blob/main/HateSpeech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 설치

In [1]:
!pip install transformers
!pip install datasets==1.17.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Using cached transformers-4.27.3-py3-none-any.whl (6.8 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.11.0
  Using cached huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets==1.17.0
  Downloading datasets-1.17.0-py3-none-any.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.3/306.3 KB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

# Import

In [2]:
import math

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tqdm
from datasets import load_dataset
from sklearn.metrics import label_ranking_average_precision_score, classification_report
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BertForSequenceClassification,
    DataCollatorWithPadding,
    TextClassificationPipeline,
    Trainer,
    TrainingArguments,
)
from transformers.pipelines.base import KeyDataset

# Data Load

In [3]:
# Load the kor_unsmile dataset; the result is a DatasetDict with `train` and `valid` splits.
dataset = load_dataset('smilegate-ai/kor_unsmile')

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]



Downloading and preparing dataset None/None (download: 1.39 MiB, generated: 4.93 MiB, post-processed: Unknown size, total: 6.32 MiB) to /root/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/290k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/parquet/smilegate-ai--kor_unsmile-e0f75c6e3be1af78/0.0.0/1638526fd0e8d960534e2155dc54fdff8dce73851f21f031d2fb9c2cf757c121. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Show the splits with their column names and row counts.
dataset

DatasetDict({
    valid: Dataset({
        features: ['문장', '여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', 'clean', '개인지칭', 'labels'],
        num_rows: 3737
    })
    train: Dataset({
        features: ['문장', '여성/가족', '남성', '성소수자', '인종/국적', '연령', '지역', '종교', '기타 혐오', '악플/욕설', 'clean', '개인지칭', 'labels'],
        num_rows: 15005
    })
})

In [5]:
# Peek at the first training example (raw sentence plus per-class columns).
dataset["train"][0]

{'문장': '일안하는 시간은 쉬고싶어서 그런게 아닐까',
 '여성/가족': 0,
 '남성': 0,
 '성소수자': 0,
 '인종/국적': 0,
 '연령': 0,
 '지역': 0,
 '종교': 0,
 '기타 혐오': 0,
 '악플/욕설': 0,
 'clean': 1,
 '개인지칭': 0,
 'labels': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1]}

In [6]:
# Class names used for the model's id2label / label2id maps.
# NOTE(review): presumably in the same order as each example's 10-element
# `labels` vector (the first example is `clean` and has its 1 in the last slot) — confirm.
unsmile_labels = ["여성/가족","남성","성소수자","인종/국적","연령","지역","종교","기타 혐오","악플/욕설","clean"]

# Model

## Tokenizer : WordPiece

In [7]:
# Hub checkpoint used for both the tokenizer and the classifier.
model_name = 'beomi/kcbert-base'

In [8]:
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/250k [00:00<?, ?B/s]

In [9]:
# Sanity check: tokenize the first training sentence and inspect the encoding.
tokenizer(dataset["train"][0]['문장'])

{'input_ids': [2, 2458, 15751, 24930, 24351, 29278, 17038, 11631, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [10]:
def preprocessing(data):
  """Tokenize one example and attach its multi-label target as floats.

  Args:
    data: a single dataset row; reads the "문장" (sentence) text and the
      'labels' indicator list.

  Returns:
    The tokenizer output with an added 'labels' float tensor.
  """
  # truncation=True caps the sequence at the tokenizer's model_max_length, so an
  # over-long sentence cannot overflow the encoder's position embeddings.
  tokenized_sent = tokenizer(data["문장"], truncation=True)
  # Targets are stored as floats because the model is configured for
  # multi-label classification.
  tokenized_sent['labels'] = torch.tensor(data['labels'],dtype=torch.float)
  return tokenized_sent

In [11]:
# Tokenize every example of every split.
tokenized_dataset = dataset.map(preprocessing)
# Return torch tensors and expose only the columns the model consumes.
tokenized_dataset.set_format(type='torch',columns=['input_ids', 'labels', 'attention_mask', 'token_type_ids'])



  0%|          | 0/3737 [00:00<?, ?ex/s]

  0%|          | 0/15005 [00:00<?, ?ex/s]

In [12]:
# Check the first tokenized training example: tensors for labels and model inputs.
tokenized_dataset['train'][0]

{'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]),
 'input_ids': tensor([    2,  2458, 15751, 24930, 24351, 29278, 17038, 11631,     3]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1])}

## Data Collator

In [13]:
# Collator that pads the examples of a batch with this tokenizer when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
# Seed torch before the model is built: the classification head is not part of
# the pretrained checkpoint and is randomly initialised, so without a seed two
# runs of this notebook start from different weights.
torch.manual_seed(42)

num_labels=len(unsmile_labels) # number of labels

# problem_type="multi_label_classification" makes the model treat every label
# as an independent binary target.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    problem_type="multi_label_classification"
)
# Store index <-> class-name maps on the config so they are saved with the model.
model.config.id2label = dict(enumerate(unsmile_labels))
model.config.label2id = {label: i for i, label in enumerate(unsmile_labels)}

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [15]:
# Verify the class-name -> index mapping stored on the model config.
model.config.label2id

{'여성/가족': 0,
 '남성': 1,
 '성소수자': 2,
 '인종/국적': 3,
 '연령': 4,
 '지역': 5,
 '종교': 6,
 '기타 혐오': 7,
 '악플/욕설': 8,
 'clean': 9}

# Training

In [16]:
def compute_metrics(x):
  """Score one evaluation pass with label-ranking average precision.

  Args:
    x: object exposing `label_ids` (true labels) and `predictions` (model scores).

  Returns:
    A dict with the single key 'lrap'.
  """
  lrap_score = label_ranking_average_precision_score(x.label_ids, x.predictions)
  return {'lrap': lrap_score}

In [17]:
# Hyper-parameters passed to TrainingArguments: per-device batch size (train and eval)
# and learning rate.
batch_size = 32
lr = 1e-5

In [18]:
# Training configuration: evaluate and checkpoint once per epoch, and when
# training finishes reload the checkpoint with the highest 'lrap'
# (the key returned by compute_metrics).
args = TrainingArguments(
    output_dir="model_output",
    evaluation_strategy="epoch",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='lrap',
    greater_is_better=True,
)

# Wire the model, the tokenized splits, the metric and the padding collator together.
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=tokenized_dataset["train"], 
    eval_dataset=tokenized_dataset["valid"], 
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

In [None]:
# Fine-tune; evaluation on the `valid` split runs after every epoch.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Lrap
1,No log,0.153301,0.856528


In [None]:
# Persist the trained model; with no argument this writes to the Trainer's
# output_dir ("model_output").
trainer.save_model()

# 실행

In [None]:
# Inference pipeline over the fine-tuned model.
# - device: use the first GPU when one is visible, otherwise fall back to CPU (-1)
#   instead of failing on a CPU-only runtime.
# - return_all_scores=True: emit a score for every label, not only the top one.
# - function_to_apply='sigmoid': score each label independently (multi-label).
pipe = TextClassificationPipeline(
    model = model,
    tokenizer = tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    return_all_scores=True,
    function_to_apply='sigmoid'
    )

In [None]:
# Sentence to classify; left empty here — fill it in before running the pipeline on it.
test_text = ""

In [None]:
# Print one entry per label for the single test sentence.
for result in pipe(test_text)[0]:
    print(result)

# Evaluation

In [None]:
def get_predicated_label(output_labels, min_score):
    """Binarize per-label scores into a 0/1 indicator list.

    Args:
        output_labels: iterable of dicts, each carrying a 'score' entry.
        min_score: threshold; a label is 1 only when its score is strictly greater.

    Returns:
        List of ints (1 or 0) in the same order as `output_labels`.
    """
    return [1 if entry['score'] > min_score else 0 for entry in output_labels]

In [None]:
# Run the pipeline over every validation sentence and threshold each label's score at 0.5.
predicated_labels = [
    get_predicated_label(out, 0.5)
    for out in tqdm.tqdm(pipe(KeyDataset(dataset['valid'], '문장')))
]

In [None]:
# Per-class precision / recall / F1 on the validation split; target_names labels
# each row with its class name instead of a bare index.
print(classification_report(dataset['valid']['labels'], predicated_labels, target_names=unsmile_labels))