# 사전 준비

**KcELECTRA 모델과 토크나이저 불러오기**

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel
  
# Load the tokenizer and the encoder from the same Hugging Face checkpoint.
checkpoint = "beomi/KcELECTRA-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/396k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraModel: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 텍스트 전처리

**전처리 함수 정의**

In [None]:
!pip install soynlp
!pip install emoji==1.7.0

In [None]:
import re
import emoji
from soynlp.normalizer import repeat_normalize

In [None]:
# Collect every emoji code point so the whitelist pattern below keeps emojis.
# NOTE(review): in the emoji 1.x releases (including the pinned 1.7.0)
# UNICODE_EMOJI appears to be keyed by language code
# ({"en": {emoji: name, ...}, ...}); joining its top-level keys would then only
# yield language codes and `pattern` would strip every emoji. The per-language
# tables are therefore flattened; a flat {emoji: name} mapping (older releases)
# is still handled by the fallback branch.
_emoji_table = emoji.UNICODE_EMOJI
if all(isinstance(entry, dict) for entry in _emoji_table.values()):
    _emoji_sequences = {seq for per_language in _emoji_table.values() for seq in per_language}
else:
    _emoji_sequences = set(_emoji_table)
# A character class only cares about individual code points, so multi-code-point
# emoji sequences are split; sorting keeps the compiled pattern deterministic.
emojis = ''.join(sorted({char for seq in _emoji_sequences for char in seq}))

# Matches runs of characters that are NOT in the whitelist: common punctuation,
# ASCII, Hangul jamo/syllables and emojis.
pattern = re.compile(f'[^ .,?!/@$%~％·∼()\x00-\x7Fㄱ-ㅣ가-힣{emojis}]+')
# Matches http/https URLs.
url_pattern = re.compile(
    r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)')

In [None]:
def clean(x):
    """Normalize one raw text string before tokenization.

    Applies, in order: replacement of every run of characters outside the
    whitelist in ``pattern`` with a single space, removal of URLs matched by
    ``url_pattern``, trimming of leading/trailing whitespace, and
    ``repeat_normalize`` with ``num_repeats=2`` to shorten repeated characters.

    Args:
        x: The raw text.

    Returns:
        The cleaned text.
    """
    without_disallowed = pattern.sub(' ', x)
    without_urls = url_pattern.sub('', without_disallowed)
    trimmed = without_urls.strip()
    return repeat_normalize(trimmed, num_repeats=2)

**데이터 불러오기**

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the pipe-separated dataset; the file has no header row, so the two
# columns are named explicitly.
dataset_path = "/content/drive/Othercomputers/내 컴퓨터/Curse-words_Detection/Curse-detection-data/dataset.txt"
curse_data = pd.read_table(
    dataset_path,
    sep="|",
    header=None,
    names=["text", "label"],
)
curse_data

Unnamed: 0,text,label
0,좌배 까는건 ㅇㅂ,1
1,집에 롱 패딩만 세 개다. 10년 더 입어야지 ㅋㅋ,0
2,개소리야 니가 빨갱이를 옹호하고 드루킹을 ㅇㅇ짓이라고 말못해서 삐진거야 빨갱아,1
3,세탁이라고 봐도 된다,0
4,애새끼가 초딩도 아니고 ㅋㅋㅋㅋ,1
...,...,...
5819,좌우 헬파이어 3개씩 6개 장착에 아파치보다 약하지만 20mm 기관포 장착임,0
5820,"세금 내놓으라고 데모질 중 ㅋㅋ간첩, 도둑놈 새끼들이 대통령 해처먹으니까 나도 같...",1
5821,너가 한 말 중에,0
5822,제갈대중 ㅇㅂ,0


**전처리 및 데이터 분할**

In [None]:
# train : validation : test = 5 : 1 : 2

# Contiguous, unshuffled split at 5/8 and 6/8 of the rows; every text is cleaned.
n_rows = curse_data.shape[0]
train_end, val_end, test_end = (int((n_rows / 8) * part) for part in (5, 6, 8))
raw_text = curse_data['text']

train_text = [clean(raw_text[idx]) for idx in range(train_end)]
val_text = [clean(raw_text[idx]) for idx in range(train_end, val_end)]
test_text = [clean(raw_text[idx]) for idx in range(val_end, test_end)]

In [None]:
# Labels are split at the same row boundaries (5/8 and 6/8 of the rows) as the texts.
n_rows = curse_data.shape[0]
train_end, val_end, test_end = (int((n_rows / 8) * part) for part in (5, 6, 8))
all_labels = curse_data['label']

train_label = [all_labels[idx] for idx in range(train_end)]
val_label = [all_labels[idx] for idx in range(train_end, val_end)]
test_label = [all_labels[idx] for idx in range(val_end, test_end)]

In [None]:
# Show one example before and after cleaning (the training split starts at
# row 0, so the same index addresses the same row in both).
sample_idx = 10
print("전처리 전:", curse_data['text'][sample_idx])
print("전처리 후:", train_text[sample_idx])

전처리 전: ㅋㅋㅋㅋㅋ
전처리 후: ㅋㅋ


**토크나이징**

In [None]:
# Tokenize every split with identical settings: truncation at 256 tokens,
# padding enabled, PyTorch tensors as output.
tokenize_options = dict(truncation=True, padding=True, max_length=256, return_tensors="pt")
train_input_token = tokenizer(train_text, **tokenize_options)
val_input_token = tokenizer(val_text, **tokenize_options)
test_input_token = tokenizer(test_text, **tokenize_options)

# 데이터 구축

**데이터셋**

In [None]:
import torch

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with labels.

    Args:
        inputs: Mapping from field name to an indexable of per-example values
            (anything exposing ``.items()``, e.g. the tokenizer's return value).
        labels: Indexable of per-example labels; its length is the dataset size.
    """

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    @staticmethod
    def _as_tensor_copy(value):
        """Return ``value`` as a tensor that owns its own storage."""
        # Calling torch.tensor() on an existing tensor emits a UserWarning for
        # every item; clone().detach() is the copy PyTorch recommends instead.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with one tensor per input field plus a ``labels`` tensor."""
        item = {key: self._as_tensor_copy(val[idx]) for key, val in self.inputs.items()}
        item['labels'] = self._as_tensor_copy(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (the number of labels)."""
        return len(self.labels)

In [None]:
# Wrap each tokenized split together with its labels.
train_dataset, val_dataset, test_dataset = (
    MyDataset(tokens, labels)
    for tokens, labels in (
        (train_input_token, train_label),
        (val_input_token, val_label),
        (test_input_token, test_label),
    )
)

**데이터로더**

In [None]:
from torch.utils.data import DataLoader

In [None]:
# Batches of 8 for every split; the train and validation loaders shuffle,
# the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=8)

In [None]:
# Sanity check: pull a single batch from the training loader and display it
next(iter(train_loader))

  import sys


{'input_ids': tensor([[    2, 21340,  1565,  ...,     0,     0,     0],
         [    2,  8136, 15029,  ...,     0,     0,     0],
         [    2, 16753, 10560,  ...,     0,     0,     0],
         ...,
         [    2,  9343, 21183,  ...,     0,     0,     0],
         [    2, 10041, 13360,  ...,     0,     0,     0],
         [    2,  8478,  4218,  ...,     0,     0,     0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 0, 1, 1, 0, 1, 0, 1])}

# 모델 학습

**모델 정의**

In [None]:
import torch.nn as nn

In [None]:
# Neural network that wraps an ELECTRA encoder
class ELECTRA_Model(torch.nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        pretrained_model: Object invoked as ``pretrained_model(**inputs)``; its
            result must expose ``last_hidden_state``.
        token_size: Input width of the linear classifier.
        num_labels: Output width of the linear classifier.
    """

    def __init__(self, pretrained_model, token_size, num_labels):
        super().__init__()
        self.pretrained_model = pretrained_model
        self.token_size = token_size
        self.num_labels = num_labels

        # Classification head
        self.classifier = torch.nn.Linear(in_features=token_size, out_features=num_labels)

    def forward(self, inputs):
        """Run the encoder on ``inputs`` and classify position 0 of its last hidden state."""
        # Feed the inputs to the encoder and collect its output
        encoded = self.pretrained_model(**inputs)
        # Keep only index 0 along the second axis (the CLS token, per the original author's note)
        first_token_state = encoded.last_hidden_state[:, 0, :]
        logits = self.classifier(first_token_state)
        return logits

In [None]:
# Build the classifier: two output labels, head input width taken from the encoder config.
electra = ELECTRA_Model(
    pretrained_model=model,
    token_size=model.config.hidden_size,
    num_labels=2,
)

**모델 파라미터 설정**

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
# Use CUDA when it is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_epochs = 5
learning_rate = 1e-5

# Optimize the whole classifier. `electra` stores `model` as a submodule, so
# electra.parameters() covers the encoder AND the linear head; passing
# model.parameters() would leave the classification head untrained.
optimizer = torch.optim.AdamW(electra.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Learning-rate schedule: linear decay with no warmup, spread over all epochs
# rather than a single one (otherwise the rate hits zero after the first epoch).
# NOTE(review): assumes the training loop calls scheduler.step() once per batch — confirm.
steps_per_epoch = len(train_loader)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_training_steps=steps_per_epoch * num_epochs,
                                            num_warmup_steps=0)

step = 0
# Number of training batches per epoch (previously hard-coded as 455)
eval_steps = steps_per_epoch

**학습 진행**