In [2]:
!git clone https://github.com/ZIZUN/korean-malicious-comments-dataset.git
import pandas as pd
import matplotlib.pyplot as plt

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
# Device selection: use the first CUDA GPU when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print("device:", device)

Cloning into 'korean-malicious-comments-dataset'...


device: cpu


In [3]:
# Load the comment data from a CSV file decoded with the cp949 codec.
# NOTE(review): the clone in the first cell creates ./korean-malicious-comments-dataset/,
# while this reads ./dataset.csv from the working directory — confirm this is the intended file.
df = pd.read_csv("./dataset.csv", encoding = 'cp949')
# Preview the first rows.
df.head()

Unnamed: 0,text,curse
0,Ï¢åÎ∞∞ ÍπåÎäîÍ±¥ „Öá„ÖÇ,1
1,ÏßëÏóê Î°± Ìå®Îî©Îßå ÏÑ∏ Í∞úÎã§. 10ÎÖÑ Îçî ÏûÖÏñ¥ÏïºÏßÄ „Öã„Öã,0
2,Í∞úÏÜåÎ¶¨Ïïº ÎãàÍ∞Ä Îπ®Í∞±Ïù¥Î•º ÏòπÌò∏ÌïòÍ≥† ÎìúÎ£®ÌÇπÏùÑ „Öá„ÖáÏßìÏù¥ÎùºÍ≥† ÎßêÎ™ªÌï¥ÏÑú ÏÇêÏßÑÍ±∞Ïïº Îπ®Í∞±ÏïÑ,1
3,ÏÑ∏ÌÉÅÏù¥ÎùºÍ≥† Î¥êÎèÑ ÎêúÎã§,0
4,Ïï†ÏÉàÎÅºÍ∞Ä Ï¥àÎî©ÎèÑ ÏïÑÎãàÍ≥† „Öã„Öã„Öã„Öã,1


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5824 entries, 0 to 5823
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5824 non-null   object
 1   curse   5824 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 91.1+ KB


In [5]:
# Hold-out split: draw a reproducible 80% random sample of the rows for
# training, then keep every row that was not drawn as the test set.
train_data = df.sample(frac=0.8, random_state=42)
test_data = df.drop(index=train_data.index)

In [6]:
# Identifier of the pretrained checkpoint; reused below when loading the classifier.
MODEL_NAME = "beomi/KcELECTRA-base"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [7]:
# Tokenize every training comment in a single tokenizer call.
tokenized_train_sentences = tokenizer(
    train_data["text"].tolist(),
    add_special_tokens=True,            # add the special tokens
    truncation=True,                    # truncate tokens beyond max_length
    max_length=128,                     # maximum token length
    padding=True,                       # enable zero-padding
    return_tensors="pt",                # return PyTorch tensors
)

In [8]:
# Inspect the first encoded training example: its summary repr, then its
# tokens, token ids and attention mask.
first_encoding = tokenized_train_sentences[0]
print(first_encoding)
for field in ("tokens", "ids", "attention_mask"):
    print(getattr(first_encoding, field))

Encoding(num_tokens=128, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['[CLS]', 'Ìä∏ÎüºÌîÑ', 'Ï°¥ÎÇò', '##Ï†ä', '##Ïñ¥', '##Î≥¥Ïù¥', '##ÎÑ§', 'ÎÇò', '##ÌôÄ', '##Î°ú', '##ÏßëÏóê', '##2', 'ÎÇòÏò§', '##Îçò', '##Îïå', '##ÎÉê', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [9]:
# Tokenize the held-out comments with the same settings as the training set.
tokenized_test_sentences = tokenizer(
    test_data["text"].tolist(),
    add_special_tokens=True,            # add the special tokens
    truncation=True,                    # truncate tokens beyond max_length
    max_length=128,                     # maximum token length
    padding=True,                       # enable zero-padding
    return_tensors="pt",                # return PyTorch tensors
)

In [10]:
class CurseDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to
            a per-example indexable container, e.g. a tensor or a list.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as a tensor that does not share storage with its source."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every call; the
            # recommended way to copy an existing tensor is clone().detach().
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return every encoding field for example ``idx`` plus its label under ``"labels"``."""
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = self._as_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

In [11]:
# Extract the target column of each split as a NumPy array, then pair it with
# the matching tokenizer output.
train_label = train_data["curse"].to_numpy()
test_label = test_data["curse"].to_numpy()

train_dataset = CurseDataset(encodings=tokenized_train_sentences, labels=train_label)
test_dataset = CurseDataset(encodings=tokenized_test_sentences, labels=test_label)

In [12]:
# Load the pretrained checkpoint as a sequence classifier configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# Move the model to the selected device; as the cell's last expression this also displays the module tree.
model.to(device)

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.out_proj.bias', 'classifier

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(50135, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0): ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [13]:
# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # --- schedule and batch sizes ---
    num_train_epochs=10,                # number of training epochs
    per_device_train_batch_size=8,      # batch size used for training
    per_device_eval_batch_size=64,      # batch size used for evaluation
    # --- saved results ---
    output_dir='./',                    # where training results are written
    save_total_limit=2,                 # maximum number of saved results to keep
    # --- logging ---
    logging_dir='./logs',               # where training logs are written
    logging_steps=500,                  # log every this many steps
)

In [14]:
def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores); the predicted class is the
            arg-max over the last axis of ``predictions``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Binary averaging; the result tuple is (precision, recall, f1, support).
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [15]:
# Wire the model, configuration, data and metric function into a Trainer.
trainer = Trainer(
    args=training_args,                  # the TrainingArguments defined above
    model=model,                         # the Transformers model to train
    compute_metrics=compute_metrics,     # evaluation metrics
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)

In [16]:
# Run training with the Trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 4659
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5830


  0%|          | 0/5830 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


KeyboardInterrupt: 

In [None]:
# Persist the whole model object to disk.
torch.save(model, "./model.pt")
# Evaluate on the tokenized CurseDataset. The raw `test_data` DataFrame is not a
# dataset of tokenized examples and cannot be consumed by Trainer.evaluate.
trainer.evaluate(eval_dataset=test_dataset)

NameError: name 'torch' is not defined

In [None]:
# Evaluate on the held-out dataset; the returned metrics dict is displayed as the cell output.
trainer.evaluate(eval_dataset = test_dataset)

***** Running Evaluation *****
  Num examples = 1165
  Batch size = 64
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.7572466731071472,
 'eval_accuracy': 0.9030042918454936,
 'eval_f1': 0.8636911942098914,
 'eval_precision': 0.8364485981308412,
 'eval_recall': 0.8927680798004988,
 'eval_runtime': 136.9388,
 'eval_samples_per_second': 8.507,
 'eval_steps_per_second': 0.139,
 'epoch': 10.0}

In [1]:
def sentence_predict(sent):
    """Classify one comment with the notebook's model and return a display label.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        sent: Comment text to classify.

    Returns:
        The label string for predicted class 1 or class 0; for any other
        arg-max value the raw prediction tensor is returned unchanged.
    """
    # Switch the model to evaluation mode.
    model.eval()

    # Tokenize the input sentence.
    encoded = tokenizer(
        sent,
        add_special_tokens=True,
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )

    # Move the encoded inputs to the device the model was placed on.
    encoded.to(device)

    # Predict without tracking gradients.
    with torch.no_grad():
        outputs = model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            token_type_ids=encoded["token_type_ids"],
        )

    # The first output element holds the logits; pick the arg-max class on the CPU.
    predicted = outputs[0].detach().cpu().argmax(-1)
    if predicted == 1:
        return " >> ÏïÖÏÑ±ÎåìÍ∏Ä üëø"
    if predicted == 0:
        return " >> Ï†ïÏÉÅÎåìÍ∏Ä üòÄ"
    return predicted
# Keep prompting for comments and printing the prediction; entering "0" exits.
prompt = "ÎåìÍ∏ÄÏùÑ ÏûÖÎ†•Ìï¥Ï£ºÏÑ∏Ïöî: "
sentence = input(prompt)
while sentence != "0":
    print(sentence_predict(sentence), sentence)
    print("\n")
    sentence = input(prompt)

NameError: name 'model' is not defined