# 1. 준비

In [4]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import gluonnlp as nlp
import numpy as np
from tqdm import tqdm, tqdm_notebook
from kobert.utils import get_tokenizer
from kobert.pytorch_kobert import get_pytorch_kobert_model

# Load the KoBERT model and its vocabulary. Both are reused later in this
# notebook: `vocab` by the tokenizer and `bertmodel` by the classifier.
bertmodel, vocab = get_pytorch_kobert_model()

using cached model. /content/drive/MyDrive/종합설계/.cache/kobert_v1.zip
using cached model. /content/drive/MyDrive/종합설계/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


# 2. 데이터 준비

In [5]:
import pandas as pd
# Read the labelled corpus: column "0" holds the sentence, column "1" its label.
data = pd.read_csv("training_dataset_ver3.csv")

# Flatten the frame into a list of [sentence, label] pairs.
data_list = [[sen, lab] for sen, lab in zip(data["0"], data["1"])]

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the pairs for evaluation (train : test = 9 : 1); the fixed
# seed keeps the split reproducible.
train_set, test_set = train_test_split(
    data_list,
    test_size=0.1,
    random_state=0,
)

# Show one training pair as a quick sanity check.
train_set[0]

['그 동안 보빨러들이 얼마나 잘 해줬겠어ㅋㅋㅋ', 2]

In [7]:
# Load the two filter lexicons, in this order: abusive-language terms, then
# sexual-harassment terms.
abuse, sexual = (
    pd.read_csv(lexicon_path)
    for lexicon_path in ("폭언사전.csv", "성희롱사전.csv")
)

In [8]:
# Column "0" of each lexicon frame holds the terms; copy them into plain lists.
abuse_list = list(abuse["0"])
sexual_list = list(sexual["0"])

In [9]:
# Preview the first three terms of each lexicon.
abuse_list[:3], sexual_list[:3]

(['가난', '가난뱅이', '가두'], ['19금', '69', '69자세'])

# 3. 코버트 다중 분류기 로드

In [10]:
# Prefer the first CUDA GPU, but fall back to the CPU so the notebook still
# runs (model loading and inference included) on machines without a usable GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class BERTDataset(Dataset):
  """Dataset that pre-encodes every sentence into BERT inputs.

  Each row of ``dataset`` is indexed with ``sent_idx`` to get the raw sentence
  and with ``label_idx`` to get its label. Indexing the dataset returns the
  transform output for that sentence with the label appended as the last
  tuple element.
  """

  def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, max_len, pad, pair):
    # Transform that converts raw text into the inputs BERT expects.
    to_bert_input = nlp.data.BERTSentenceTransform(
        bert_tokenizer, max_len, pad=pad, pair=pair)

    # First pass: encode every sentence (the transform is given a one-item list).
    encoded = []
    for row in dataset:
      encoded.append(to_bert_input([row[sent_idx]]))
    self.sentences = encoded

    # Second pass: store the labels as 32-bit integers.
    targets = []
    for row in dataset:
      targets.append(np.int32(row[label_idx]))
    self.labels = targets

  def __len__(self):
    return len(self.labels)

  def __getitem__(self, i):
    # Tuple concatenation: (encoded fields...) + (label,). The prediction loop
    # in this notebook unpacks each batch as token ids, valid length,
    # segment ids and label, in that order.
    encoded, target = self.sentences[i], self.labels[i]
    return encoded + (target,)

# Hyperparameters. Only max_len and batch_size are referenced in the visible
# part of this notebook; the remaining values are training-time settings.
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate = 5e-5

# Build the tokenizer from KoBERT's get_tokenizer() and the vocab loaded earlier.
tokenizer = get_tokenizer()
tok = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

# Encode both splits with identical settings: sentence at index 0, label at
# index 1, padding enabled, single-sentence (non-pair) input.
dataset_kwargs = dict(
    sent_idx=0,
    label_idx=1,
    bert_tokenizer=tok,
    max_len=max_len,
    pad=True,
    pair=False,
)
data_train = BERTDataset(train_set, **dataset_kwargs)
data_test = BERTDataset(test_set, **dataset_kwargs)

class BERTClassifier(nn.Module):
  """BERT encoder followed by a single linear classification layer.

  Args:
    bert: Encoder callable. It is invoked with the keyword arguments
      ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must return
      a pair whose second element is the pooled sentence representation.
    hidden_size: Width of the pooled representation (input size of the head).
    num_classes: Number of output classes (3 labels by default).
    dr_rate: Dropout probability applied to the pooled representation.
      ``None`` or ``0`` disables dropout.
    params: Unused; kept so existing callers remain valid.
  """

  def __init__(self, bert, hidden_size=768, num_classes=3, dr_rate=None, params=None):
    super(BERTClassifier, self).__init__()
    self.bert = bert
    self.dr_rate = dr_rate

    # Classification head: hidden_size -> num_classes logits.
    self.classifier = nn.Linear(hidden_size, num_classes)

    # Dropout against overfitting; only created when a non-zero rate is given.
    if dr_rate:
      self.dropout = nn.Dropout(p=dr_rate)

  def gen_attention_mask(self, token_ids, valid_length):
    """Return a float mask shaped like ``token_ids``.

    For row ``i`` the first ``valid_length[i]`` positions are 1 and the rest
    (the padding) stay 0.
    """
    attention_mask = torch.zeros_like(token_ids)
    for i, v in enumerate(valid_length):
      attention_mask[i][:v] = 1

    return attention_mask.float()

  def forward(self, token_ids, valid_length, segment_ids):
    """Run encoder + head and return the class logits for each input row."""
    # The mask is created with zeros_like(token_ids), so it already lives on
    # the same device as token_ids.
    attention_mask = self.gen_attention_mask(token_ids, valid_length)

    # NOTE(review): in standard BERT implementations the pooled output is the
    # first ([CLS]) token's hidden state passed through a dense + tanh layer,
    # not a mean over token embeddings -- confirm for the KoBERT build in use.
    # NOTE(review): the tuple unpacking assumes the encoder returns a tuple;
    # verify against the installed transformers version.
    _, pooler = self.bert(
        input_ids=token_ids,
        token_type_ids=segment_ids.long(),
        attention_mask=attention_mask,
    )

    # Apply dropout only when configured. Without the fallback to `pooler`,
    # `out` would be unbound for dr_rate=None/0 and forward() would raise.
    out = self.dropout(pooler) if self.dr_rate else pooler

    return self.classifier(out)

# Build the classifier around the loaded encoder, move it to the target device,
# then restore the saved weights from disk (mapped onto that same device).
model = BERTClassifier(bertmodel, dr_rate=0.5)
model = model.to(device)
model_state_dict = torch.load("kobert_classifier.pth", map_location=device)
model.load_state_dict(model_state_dict)

using cached model. /content/drive/MyDrive/종합설계/.cache/kobert_news_wiki_ko_cased-1087f8699e.spiece


<All keys matched successfully>

# 4. predict

In [11]:
from kiwipiepy import Kiwi
# Shared Kiwi analyzer; predict() uses its tokenize() output for the lexicon check.
kiwi = Kiwi()

def predict(predict_sentence):
  """Classify one sentence and return its predicted label as a Python int.

  The sentence is first split into morphemes with Kiwi. If no morpheme form
  appears in ``sexual_list`` or ``abuse_list``, the classifier is skipped and
  0 is returned. Otherwise the sentence is encoded and run through ``model``,
  and the index of the largest logit is returned.

  Args:
    predict_sentence: Raw input sentence.

  Returns:
    int: Predicted class index (0 when the lexicon pre-filter finds no match).
  """
  # 0. Lexicon pre-filter: without any listed term, return label 0 right away.
  # NOTE(review): these are list lookups; for large lexicons, building a set
  # once at module level would make each membership test O(1).
  has_flagged_term = any(
      morph.form in sexual_list or morph.form in abuse_list
      for morph in kiwi.tokenize(predict_sentence)
  )
  if not has_flagged_term:
    return 0

  # 1. Wrap the sentence as a one-row dataset. '0' is only a placeholder label
  #    that BERTDataset requires; it is never used for the prediction.
  dataset_another = [[predict_sentence, '0']]

  # 2. Encode the row into BERT inputs and batch it.
  another_test = BERTDataset(dataset_another, 0, 1, tok, max_len, True, False)
  test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

  model.eval()

  # Inference only: disable autograd so no graph is built for the forward pass.
  with torch.no_grad():
    for token_ids, valid_length, segment_ids, _ in test_dataloader:
      token_ids = token_ids.long().to(device)
      segment_ids = segment_ids.long().to(device)

      # Forward pass; the batch holds exactly one sentence, so take row 0.
      out = model(token_ids, valid_length, segment_ids)
      logits = out[0].cpu().numpy()

      # Cast to a plain int so both return paths yield the same type.
      return int(np.argmax(logits))

In [12]:
# Run predict() on every held-out sentence (index 0 of each pair).
y_hat = [predict(pair[0]) for pair in test_set]

In [13]:
# Reference labels sit at index 1 of each [sentence, label] pair.
y = [pair[1] for pair in test_set]

In [14]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

# 1. Confusion matrix of reference labels (y) against predictions (y_hat).
cm = confusion_matrix(y, y_hat)
print("===Confusion Matrix===", cm, sep="\n")

===Confusion Matrix===
[[1933   39   27]
 [  62  906   43]
 [  55   42  763]]


In [15]:
# 2. Accuracy over all test sentences.
print("\n===Accuracy===")
print(accuracy_score(y, y_hat))

# 3-5. Precision, recall and F1, each computed with average='macro'.
for banner, metric in (
    ("\n===Precision===", precision_score),
    ("\n===Recall===", recall_score),
    ("\n===F1===", f1_score),
):
  print(banner)
  print(metric(y, y_hat, average='macro'))


===Accuracy===
0.930749354005168

===Precision===
0.9256087821740008

===Recall===
0.9167784091019585

===F1===
0.921023032425044
