# 설치

In [1]:
# Install the third-party packages imported further down (Korpora, tokenizers, transformers).
# NOTE(review): no versions are pinned, so a fresh run may pull different releases.
! pip install Korpora
! pip install tokenizers
! pip install transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# dataset download(청와대 국민청원)

In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write files under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from Korpora import Korpora
# Load the "korean_petitions" corpus through Korpora; later cells read its `.train` split.
corpus = Korpora.load("korean_petitions")


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : Hyunjoong Kim lovit@github
    Repository : https://github.com/lovit/petitions_archive
    References :

    청와대 국민청원 게시판의 데이터를 월별로 수집한 것입니다.
    청원은 게시판에 글을 올린 뒤, 한달 간 청원이 진행됩니다.
    수집되는 데이터는 청원종료가 된 이후의 데이터이며, 청원 내 댓글은 수집되지 않습니다.
    단 청원의 동의 개수는 수집됩니다.
    자세한 내용은 위의 repository를 참고하세요.

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `korean_petitions` is already installed at /root/Korpora/korean_petitions/petitions_2017-08
[Korpora] Corpus `korean_petitions` is already installed at /root/Korpora/korean_petitions/petitions_2017-09
[Korpora] Corpus `korean_petitions` is already installed at /root/Korpora/kore

##### text 전처리

In [4]:
import re

def clean_text(text):
  """Normalize sentence terminators and strip unsupported characters.

  Every '?' and '!' is first turned into '.'. Afterwards every character that
  is not a Hangul syllable, an ASCII letter or digit, a space, or one of
  '.', ',', '?', '!', '/' is removed.

  Args:
    text: Raw string to clean.

  Returns:
    The cleaned string.
  """
  normalized = text.replace('?', '.').replace('!', '.')
  disallowed = re.compile(r'[^가-힣0-9a-zA-Z.,?!/ ]')
  return disallowed.sub('', normalized)

In [5]:
# Number of items in the training split of the corpus.
len(corpus.train)

433631

In [6]:
def length_limit(lists):
  """Drop every sequence whose length is three or less.

  Args:
    lists: Iterable of sized items (for example sentences as strings).

  Returns:
    A new list holding, in the original order, only the items with
    `len(item) > 3`.
  """
  return [item for item in lists if len(item) > 3]

##### next sentence prediction (sentence pair, labels)

In [7]:
def nsp_datasets(corpus):
  """Build sentence pairs and order labels from the petition corpus.

  Each document in `corpus.train` is cleaned with `clean_text`, split into
  sentences on '. ' and filtered with `length_limit`. The surviving sentences
  are grouped into non-overlapping consecutive pairs (0, 1), (2, 3), ... and
  joined with ' [SEP] '. A trailing unpaired sentence is dropped.

  Documents whose index is at most half the corpus size keep the original
  order and are labelled 0. The remaining documents have the two sentences
  swapped and are labelled 1 (order swap only, following ALBERT's SOP loss).

  Args:
    corpus: Object exposing `train`, an indexable sequence whose items have a
      `text` attribute.

  Returns:
    A tuple `(sentence_pair, is_next)` of equal-length lists: the joined pair
    strings and their integer labels.
  """
  sentence_pair = []
  is_next = []
  num_documents = len(corpus.train)
  half_point = num_documents // 2
  for corpus_index in range(num_documents):
    sentence_list = clean_text(corpus.train[corpus_index].text).split('. ')
    sequence_list = length_limit(sentence_list)
    keep_order = corpus_index <= half_point
    # Visit complete pairs only. The earlier version looped over every index
    # and relied on a bare `except:` to skip the IndexError raised once the
    # pairs ran out, which would also have hidden any unrelated error.
    for first in range(0, len(sequence_list) - 1, 2):
      left = sequence_list[first]
      right = sequence_list[first + 1]
      if keep_order:  # is_next = True
        sentence_pair.append(left + ' [SEP] ' + right)
        is_next.append(0)
      else:  # is_next = False: same two sentences, order swapped
        sentence_pair.append(right + ' [SEP] ' + left)
        is_next.append(1)
  return sentence_pair, is_next


In [8]:
import pandas as pd

# Build the pair/label table in one step, then shuffle the rows and renumber the index.
s_pair, is_n = nsp_datasets(corpus)
data = pd.DataFrame({'sentence_pair': s_pair, 'is_next': is_n})
data = data.sample(frac=1).reset_index(drop=True)
data.head(10)

Unnamed: 0,sentence_pair,is_next
0,정말로 도로침하 때문에 안전적으로 위험해서 전력선포설공사를 못하시게 하는 것이 맞나...,0
1,"공무원이 국가, 국민돈을 먹튀했다면 그 값에서 1015의 범죄부가세를 도입하면 됩...",1
2,"안녕하십니까, 2019학년도 수능 응시 예정인 고3 학생입니다 [SEP] 이 글은 ...",0
3,11 더 자세한 것은 대한민국의 여성인권단체/목록의 목차 중 한국여성민우회 항목 참...,1
4,예수의 원수사랑은 오지 로마에 한정되었을 따름이었다 [SEP] 예수가 원수를 사랑하...,1
5,"국민혈세 마구 써제끼며, 선심쓰는 현정권도 파면해야 합니까 [SEP] 파면해야 할 ...",0
6,"저분 원금이 얼마셨는지는 모르겠지만, 20년동안 24억이라는 세금을 내시고, 36억...",1
7,국방부장관 또는 행정안전부장관은 제2항 또는 제3항에 해당하는 사유가 발생한 경우...,1
8,그리고 주인은 국민들을 위해 부품공장하나도 허용하지 않는다 [SEP] 주총에서 반대한다,0
9,감사합니다. [SEP] 모두가 함께 잘 살 수 있기를 바랍니다,1


In [9]:
# Count how many pairs carry each is_next label.
data.is_next.value_counts()

1    863655
0    803515
Name: is_next, dtype: int64

## wordpiece tokenizer

In [None]:
# Dump the sentence pairs, one per line, to the text file the tokenizer is trained on.
with open('/content/drive/MyDrive/Colab Notebooks/데이터사이언스 특론/실습/wiki.txt', mode='w', encoding='utf8') as corpus_file:
    joined_pairs = '\n'.join(data['sentence_pair'])
    corpus_file.write(joined_pairs)

In [None]:
from tokenizers import BertWordPieceTokenizer

# Train a WordPiece vocabulary (BertWordPieceTokenizer) on the sentence-pair text file.
data_file = '/content/drive/MyDrive/Colab Notebooks/데이터사이언스 특론/실습/wiki.txt'
vocab_size = 30000
limit_alphabet = 6000 # maximum number of initial (pre-merge) tokens allowed
min_frequency = 5

tokenizer = BertWordPieceTokenizer(
    strip_accents=False, # if True, accents are stripped
    lowercase=False, # whether to fold case; if True, upper and lower case are not distinguished
    wordpieces_prefix="##"
)

# Fit the vocabulary using the settings defined above.
tokenizer.train(files=data_file,
                vocab_size=vocab_size,
                limit_alphabet=limit_alphabet,
                min_frequency=min_frequency)

In [None]:
import numpy as np

# Destination of the vocabulary file (one token per line).
vocab_dir = '/content/drive/MyDrive/Colab Notebooks/데이터사이언스 특론/실습/vocab.txt'

# Fetch the token -> id mapping from the trained tokenizer.
vocab = tokenizer.get_vocab()

# Order the tokens by their id so that the line number equals the token id.
vocabulary = sorted(vocab, key=vocab.get)

# Save the vocabulary. The encoding is stated explicitly, as it is for the
# corpus dump, so the Hangul tokens do not depend on the platform default.
with open(vocab_dir, 'w', encoding='utf8') as lf:
    lf.write('\n'.join(vocabulary))

In [10]:
from transformers import BertTokenizer

# Load the saved vocab.txt into a transformers BertTokenizer, with do_basic_tokenize=False passed through.
# NOTE(review): recent transformers releases reportedly deprecate from_pretrained() on a single
# vocab file path; confirm, and consider BertTokenizer(vocab_file=...) if so.
wordpiece_tokenizer = BertTokenizer.from_pretrained('/content/drive/MyDrive/Colab Notebooks/데이터사이언스 특론/실습/vocab.txt', do_basic_tokenize=False)



In [11]:
# Number of entries in the loaded vocabulary.
len(wordpiece_tokenizer.vocab)

30000

##### tokenizer methods used below
* tokenizer.encode - returns the index ids of the tokens
* tokenizer.tokenize - returns the text split into tokens

In [None]:
# Index ids produced for one sample sentence pair.
wordpiece_tokenizer.encode(data['sentence_pair'].iloc[2])

In [None]:
# The same sample pair split into token strings.
wordpiece_tokenizer.tokenize(data['sentence_pair'].iloc[2])

# Inputs

##### BERT's inputs
* index tokens
* segments
* positions

In [11]:
# token ids
def index_tokens(sentence, tokenizer):
  """Encode a sentence into token ids.

  Args:
    sentence: Text to encode.
    tokenizer: Object providing an `encode(text)` method.

  Returns:
    Whatever `tokenizer.encode(sentence)` returns.
  """
  token_ids = tokenizer.encode(sentence)
  return token_ids

In [12]:
# segment ids
def segment_tokens(token_list, max_length, sep_token_id=3):
  """Build segment (token-type) ids of length `max_length`.

  Everything up to and including the first separator token is segment 0.
  Every later position, padding included, is segment 1.

  Args:
    token_list: Token ids of one encoded sentence pair.
    max_length: Length of the returned list. It is expected to be at least
      the index of the first separator plus one.
    sep_token_id: Id of the separator token. Defaults to 3, the value that
      was previously hard-coded.

  Returns:
    A list of 0/1 segment ids.

  Raises:
    ValueError: If `sep_token_id` does not occur in `token_list`.
  """
  # +1 keeps the first separator itself in segment 0. Using the bare index
  # pushed that separator into the second segment.
  first_length = token_list.index(sep_token_id) + 1
  second_length = max_length - first_length
  return [0] * first_length + [1] * second_length

In [13]:
# position ids
def position_tokens(token_list, max_length):
  """Return the position ids 0 .. max_length - 1.

  Args:
    token_list: Accepted for signature symmetry with the other input
      builders; its content is not read.
    max_length: Number of positions to generate.

  Returns:
    `[0, 1, ..., max_length - 1]` as a list.
  """
  return list(range(max_length))

In [14]:
# Encode every sentence pair once and record how many ids each encoding produced.
data['token_id'] = data['sentence_pair'].apply(lambda pair: index_tokens(pair, wordpiece_tokenizer))
data['length'] = data['token_id'].apply(len)
data.head()

Unnamed: 0,sentence_pair,is_next,token_id,length
0,정말로 도로침하 때문에 안전적으로 위험해서 전력선포설공사를 못하시게 하는 것이 맞나...,0,"[2, 10322, 7769, 3844, 3621, 6955, 7088, 6796,...",39
1,"공무원이 국가, 국민돈을 먹튀했다면 그 값에서 1015의 범죄부가세를 도입하면 됩...",1,"[2, 9698, 1, 6770, 10731, 1287, 4657, 12600, 2...",45
2,"안녕하십니까, 2019학년도 수능 응시 예정인 고3 학생입니다 [SEP] 이 글은 ...",0,"[2, 1, 16689, 8170, 3620, 8619, 12244, 10958, ...",29
3,11 더 자세한 것은 대한민국의 여성인권단체/목록의 목차 중 한국여성민우회 항목 참...,1,"[2, 7159, 745, 15061, 6889, 7319, 1, 1331, 369...",47
4,예수의 원수사랑은 오지 로마에 한정되었을 따름이었다 [SEP] 예수가 원수를 사랑하...,1,"[2, 25741, 3694, 16724, 18703, 3685, 13895, 11...",42


##### masked language model

In [15]:
import math
import random
from copy import deepcopy

def masked_LM(token_list, max_length, len_vocab=30000, mask_id=4, sep_id=3):
  """Create one masked-language-model sample from a list of token ids.

  About 15% of the positions are selected, never position 0 ([CLS]) and never
  a separator token. Of the selected positions, 80% (rounded) are replaced by
  `mask_id`, half of the rest (rounded up) by a random non-special token id,
  and the remainder is left unchanged. Both outputs are right-padded to
  `max_length`.

  Args:
    token_list: Token ids of one encoded sentence pair.
    max_length: Target length of the returned lists. Inputs longer than this
      are returned unpadded and untruncated.
    len_vocab: Vocabulary size; random replacements are drawn from
      `5 .. len_vocab - 1`. Defaults to the previously hard-coded 30000.
    mask_id: Id of the mask token (previously hard-coded 4).
    sep_id: Id of the separator token (previously hard-coded 3).

  Returns:
    `[input_ids, labels]`: the corrupted ids padded with 0, and the labels
    holding the original id at every selected position and -1 elsewhere.
  """
  total_length = len(token_list)

  # Select 15% of the sequence, skipping [CLS] (position 0) and separators.
  candidates = [i for i in range(1, total_length) if token_list[i] != sep_id]
  # Sample without replacement and cap at the number of candidates. The
  # earlier rejection loop never terminated when the target count was 0
  # (short inputs) or larger than the number of eligible positions.
  num_mask = min(int(total_length * 0.15), len(candidates))
  masked_index = sorted(random.sample(candidates, num_mask))

  corrupted = list(token_list)

  # 80% of the selected positions become the mask token.
  num_masked = int(round(num_mask * 0.8))
  mask_positions = set(random.sample(masked_index, num_masked))
  for position in mask_positions:
    corrupted[position] = mask_id

  # Half of what is left (rounded up) gets a random token; the rest stays.
  remaining = [position for position in masked_index if position not in mask_positions]
  num_random = int(math.ceil(len(remaining) / 2))
  for position in random.sample(remaining, num_random):
    # randint includes both ends, so the upper bound must be len_vocab - 1
    # to stay inside the vocabulary. Ids below 5 are special tokens.
    corrupted[position] = random.randint(5, len_vocab - 1)

  selected = set(masked_index)
  masked_label = [token if i in selected else -1
                  for i, token in enumerate(token_list)]

  # padding (a negative count yields no padding)
  pad_count = max_length - total_length
  masked_label_output = masked_label + [-1] * pad_count
  token_list_copy_output = corrupted + [0] * pad_count

  return [token_list_copy_output, masked_label_output]

In [16]:
from torch.utils.data import Dataset

class BertDataset(Dataset):
    """Pre-computed BERT pre-training samples built from a DataFrame.

    Rows whose `length` lies in [15, max_length] are kept. For each kept row
    the segment ids, position ids and one masked-LM sample are computed once
    in the constructor, so the masking does not change between epochs.

    Args:
      data: DataFrame with `token_id`, `length` and `is_next` columns.
      tokenizer: Accepted but not used by this class.
      max_length: Upper length bound and padding length of every sample.
    """

    def __init__(self, data, tokenizer, max_length):
      within_bounds = (data.length >= 15) & (data.length <= max_length)
      self.bert_data = data.loc[within_bounds]
      self.token_ids = self.bert_data.token_id.to_list()
      self.segment_ids = [segment_tokens(ids, max_length) for ids in self.token_ids]
      self.position_ids = [position_tokens(ids, max_length) for ids in self.token_ids]
      # Each entry is [masked input ids, MLM labels].
      self.masekd_token_id_label = [masked_LM(ids, max_length) for ids in self.token_ids]
      self.is_next_label = self.bert_data.is_next.to_list()

    def __len__(self):
      return len(self.token_ids)

    def __getitem__(self, idx):
      masked_ids, mlm_labels = self.masekd_token_id_label[idx][0], self.masekd_token_id_label[idx][1]
      sample = [
          self.token_ids[idx],          # original token ids
          self.segment_ids[idx],        # segment ids (sentence A / B)
          self.position_ids[idx],       # position ids
          masked_ids,                   # token ids after masking
          mlm_labels,                   # MLM task labels
          [self.is_next_label[idx]],    # NSP task label
      ]
      return sample


In [17]:
bert_data = BertDataset(data, wordpiece_tokenizer, 150)

### dataloader

In [18]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
import torch

def collate_fn(batch):
  """Stack the per-sample lists of a batch into tensors.

  Each sample is a 6-item sequence. Item 0 (the unmasked token ids) is not
  batched; items 1-5 are stacked in order.

  Args:
    batch: Sequence of samples whose items 1-5 are equally sized lists.

  Returns:
    A tuple `(segment_ids, position_ids, token_id_masked, label_masked,
    is_next_label)` of tensors.
  """
  def stack(column):
    return torch.tensor([sample[column] for sample in batch])

  return stack(1), stack(2), stack(3), stack(4), stack(5)

# Batches of 64 samples assembled by collate_fn; no shuffle argument is passed here.
input_dataloader = DataLoader(bert_data, collate_fn=collate_fn, batch_size=64)

In [19]:
# Shape of the first tensor (the segment ids) in the first batch.
next(iter(input_dataloader))[0].size()

torch.Size([64, 150])

# Model

## scaled dot product


In [20]:
from torch import nn

In [21]:
class scaled_dot_product(nn.Module):
  """Scaled dot-product attention over inputs whose last two axes are (length, depth).

  `forward` returns `(softmax(Q K^T / sqrt(d_k)) V, attention_map)`.
  """

  def __init__(self):
    super().__init__()
    self.softmax = nn.Softmax(dim=-1)

  def forward(self, query, key, value, mask, masked):
    """Compute the attention output and the attention weights.

    Args:
      query, key, value: 4-D tensors; axes 2 and 3 of `key` are swapped
        before the product with `query`.
      mask: Optional tensor; score positions where `mask == 0` are filled
        with -2**30 before the softmax. Pass None to skip.
      masked: If truthy, positions above the main diagonal of the score
        matrix are also replaced by -2**30 (causal masking).

    Returns:
      `(output, attention_map)`.
    """
    scale = torch.tensor(query.size(-1)).sqrt()
    scores = torch.matmul(query, key.transpose(2, 3).contiguous()) / scale

    if mask is not None:
      scores = scores.masked_fill(mask == 0, -2**30)

    if masked:
      # Large negative values strictly above the diagonal, zeros elsewhere.
      blocked = torch.triu(torch.ones_like(scores), diagonal=1) * (-2**30)
      # Ones on and below the diagonal mark the scores that are kept.
      visible = torch.tril(torch.ones_like(blocked))
      scores = torch.where(visible == 1, scores, blocked)

    attention_map = self.softmax(scores)
    return attention_map @ value, attention_map

## multihead attention

In [22]:
class multihead_attention(nn.Module):
  """Multi-head attention built on `scaled_dot_product`.

  Query, key and value are projected from `embedding_dims` to `model_dims`,
  split into `num_heads` heads, attended per head, concatenated and passed
  through an output projection.

  Args:
    embedding_dims: Size of the last axis of the inputs.
    model_dims: Size of the projected representation; must be divisible by
      `num_heads`.
    num_heads: Number of attention heads.

  Raises:
    ValueError: If `model_dims` is not divisible by `num_heads`.
  """

  def __init__(self, embedding_dims, model_dims, num_heads):
    super(multihead_attention, self).__init__()
    if model_dims % num_heads != 0:
      # Fail at construction; otherwise the reshape in split() fails later.
      raise ValueError('model_dims must be divisible by num_heads')

    self.num_heads = num_heads
    self.linear_q = nn.Linear(embedding_dims, model_dims)
    self.linear_k = nn.Linear(embedding_dims, model_dims)
    self.linear_v = nn.Linear(embedding_dims, model_dims)
    self.linear_output = nn.Linear(model_dims, model_dims)
    self.attention = scaled_dot_product()

  def split(self, tensor):
    """Reshape (batch, length, d_model) into (batch, heads, length, d_model // heads)."""
    batch_size, sequence_length, d_model = tensor.size()

    d_tensor = d_model // self.num_heads
    tensor = tensor.view(batch_size, sequence_length, self.num_heads, d_tensor).transpose(1, 2).contiguous()
    return tensor

  def cat(self, tensor):
    """Inverse of `split`: merge (batch, heads, length, d) into (batch, length, heads * d)."""
    # The head count is read into a local. Unpacking into self.num_heads, as
    # before, silently overwrote the module's configuration on every call.
    batch_size, num_heads, sequence_length, d_tensor = tensor.size()
    d_model = num_heads * d_tensor

    tensor = tensor.transpose(1, 2).contiguous().view(batch_size, sequence_length, d_model).contiguous()
    return tensor

  def forward(self, query, key, value, mask, masked):
    """Run attention and return the projected output (the attention map is discarded).

    Args:
      query, key, value: Tensors of shape (batch, length, embedding_dims).
      mask: Mask forwarded to the attention module, or None.
      masked: Causal-masking flag forwarded to the attention module.
    """
    query_split = self.split(self.linear_q(query))
    key_split = self.split(self.linear_k(key))
    value_split = self.split(self.linear_v(value))

    multihead_attn, attn_map = self.attention(query_split, key_split, value_split, mask, masked)
    attn_outputs1 = self.cat(multihead_attn)
    attn_outputs2 = self.linear_output(attn_outputs1)
    return attn_outputs2

## feed forward 

In [23]:
class feed_forward(nn.Module):
  """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

  Args:
    model_dims: Input and output feature size.
    ff_dims: Hidden feature size.

  The dropout probability is fixed at 0.1.
  """

  def __init__(self, model_dims, ff_dims):
    super().__init__()
    self.dropout_p = 0.1
    self.linear_inner = nn.Linear(model_dims, ff_dims)
    self.linear_outer = nn.Linear(ff_dims, model_dims)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(p=self.dropout_p)

  def forward(self, input):
    """Apply the block to `input` and return a tensor with the same last-axis size."""
    hidden = self.dropout(self.relu(self.linear_inner(input)))
    return self.linear_outer(hidden)


## layer norm

In [24]:
class layer_norm(nn.Module):
  """Thin wrapper around `nn.LayerNorm` over the last axis.

  Args:
    model_dims: Size of the normalized (last) axis.
    eps: Value added to the variance for numerical stability. It is now
      forwarded to `nn.LayerNorm`; it used to be accepted but ignored, so
      the library default was always used.
  """

  def __init__(self, model_dims, eps=1e-6):
    super(layer_norm, self).__init__()
    self.layernorm = nn.LayerNorm(model_dims, eps=eps)

  def forward(self, input):
    """Return `input` normalized over its last axis."""
    output = self.layernorm(input)
    return output

## sublayer connection

In [25]:
class sublayer_connection(nn.Module):
  """Residual connection followed by layer normalization (post-norm).

  Args:
    model_dims: Feature size handed to the layer norm.
    dropout_p: Dropout probability applied to the sublayer output.
  """

  def __init__(self, model_dims, dropout_p):
    super().__init__()
    self.layernorm = layer_norm(model_dims)
    self.dropout = nn.Dropout(dropout_p)

  def forward(self, input, output):
    """Return `layernorm(input + dropout(output))`.

    Args:
      input: Tensor that entered the sublayer (the residual branch).
      output: Tensor produced by the sublayer.
    """
    residual_sum = input + self.dropout(output)
    return self.layernorm(residual_sum)

## encoder module

In [26]:
class encoder_module(nn.Module):
  """One encoder layer: self-attention and feed-forward, each wrapped in a sublayer connection.

  Args:
    model_dims: Feature size inside the layer.
    ff_dims: Hidden size of the feed-forward block.
    dropout_p: Dropout probability of both sublayer connections.
    embedding_dims: Feature size of the layer input.
    num_heads: Number of attention heads.
  """

  def __init__(self, model_dims, ff_dims, dropout_p, embedding_dims, num_heads):
    super().__init__()
    self.multiheadattention_en = multihead_attention(embedding_dims, model_dims, num_heads)
    self.feedforward = feed_forward(model_dims, ff_dims)
    self.sublayerconnection1 = sublayer_connection(model_dims, dropout_p)
    self.sublayerconnection2 = sublayer_connection(model_dims, dropout_p)

  def forward(self, encoder_inputs, mask):
    """Apply self-attention (no causal masking) and then the feed-forward block.

    Args:
      encoder_inputs: Input tensor, used as query, key and value.
      mask: Mask forwarded to the attention module.
    """
    attended = self.multiheadattention_en(
        encoder_inputs, encoder_inputs, encoder_inputs, mask, masked=False)
    after_attention = self.sublayerconnection1(encoder_inputs, attended)
    transformed = self.feedforward(after_attention)
    return self.sublayerconnection2(after_attention, transformed)

## encoder

In [27]:
class encoder(nn.Module):
  """Sum of token, segment and position embeddings followed by a stack of encoder layers.

  Args:
    model_dims, ff_dims, num_heads, dropout_p, embedding_dims: Forwarded to
      every `encoder_module`.
    num_layers: Number of stacked encoder layers.
    len_vocab: Vocabulary size. The token embedding table has
      `len_vocab + 1` rows.
    max_length: Number of rows in the position embedding table.
  """

  def __init__(self, model_dims, ff_dims, num_heads, num_layers, embedding_dims, len_vocab, dropout_p, max_length):
    super().__init__()
    # NOTE(review): one row more than len_vocab, presumably so that an id
    # equal to len_vocab can still be looked up. Confirm it is still needed.
    self.embedding_tokens = nn.Embedding(len_vocab+1, embedding_dims)
    self.embedding_segments = nn.Embedding(2, embedding_dims)
    self.embedding_position = nn.Embedding(max_length, embedding_dims)
    layers = []
    for _ in range(num_layers):
      layers.append(encoder_module(model_dims=model_dims,
                                   ff_dims=ff_dims,
                                   num_heads=num_heads,
                                   dropout_p=dropout_p,
                                   embedding_dims=embedding_dims))
    self.encoder_layer = nn.ModuleList(layers)

  def forward(self, input_tokens, segment_tokens, position_tokens, mask):
    """Embed the three id tensors, add them, and run every layer in order.

    Args:
      input_tokens, segment_tokens, position_tokens: Integer id tensors of
        identical shape.
      mask: Mask handed to every encoder layer.
    """
    token_part = self.embedding_tokens(input_tokens)
    segment_part = self.embedding_segments(segment_tokens)
    position_part = self.embedding_position(position_tokens)
    hidden = token_part + segment_part + position_part
    for layer in self.encoder_layer:
      hidden = layer(hidden, mask)
    return hidden
    

## MLM layer

In [28]:
class mlm_layer(nn.Module):
  """Masked-LM head: Linear -> ReLU -> LayerNorm -> Linear to vocabulary logits.

  Args:
    model_dims: Feature size of the encoder output.
    len_vocab: Number of output logits per position.
  """

  def __init__(self, model_dims, len_vocab):
    super().__init__()
    self.layer1 = nn.Linear(model_dims, model_dims)
    self.layer2 = nn.Linear(model_dims, len_vocab)
    self.layernorm = layer_norm(model_dims)
    self.relu = nn.ReLU()

  def forward(self, inputs):
    """Return per-position vocabulary logits for `inputs`."""
    hidden = self.relu(self.layer1(inputs))
    return self.layer2(self.layernorm(hidden))

## BERT

In [29]:
class BERT(nn.Module):
  """Encoder with a sentence-pair classification head and a masked-LM head.

  Args:
    model_dims, ff_dims, num_heads, num_layers, embedding_dims, len_vocab,
      dropout_p, max_length: Forwarded to `encoder` (and `mlm_layer`).
    src_pad_idx: Token id treated as padding when building the attention mask.
  """

  def __init__(self, model_dims, ff_dims, num_heads, num_layers, embedding_dims, len_vocab, dropout_p, src_pad_idx, max_length):
    super(BERT, self).__init__()
    self.src_pad_idx = src_pad_idx
    self.encoder = encoder(model_dims, ff_dims, num_heads, num_layers, embedding_dims, len_vocab, dropout_p, max_length)
    self.mlmlayer = mlm_layer(model_dims, len_vocab)
    self.nsp_layer = nn.Linear(model_dims, 2)
    self.activation = torch.tanh

  def forward(self, input_tokens, segment_tokens, position_tokens):
    """Return `(cls, mlm)`: sentence-pair logits and per-position vocabulary logits.

    Args:
      input_tokens: (batch, length) token ids, already masked for MLM.
      segment_tokens: (batch, length) segment ids.
      position_tokens: (batch, length) position ids.
    """
    encoder_mask = self.make_pad_mask(input_tokens, input_tokens)
    encoder_outputs = self.encoder(input_tokens, segment_tokens, position_tokens, encoder_mask)
    # The sequence representation is read from position 0, where [CLS] sits.
    # Position 1, used previously, is the first ordinary token.
    # NOTE(review): tanh is applied after the linear layer, so both logits
    # are confined to (-1, 1). Confirm that this is intended.
    cls = self.activation(self.nsp_layer(encoder_outputs[:, 0, :].contiguous()))
    mlm = self.mlmlayer(encoder_outputs)
    return cls, mlm

  def make_pad_mask(self,q,k):
    """Build a (batch, 1, len_q, len_k) boolean mask that is True only where
    both the query token and the key token differ from `src_pad_idx`."""
    len_q,len_k = q.size(1),k.size(1)
    # Key validity, broadcast across all query rows.
    k = k.ne(self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    k = k.repeat(1,1,len_q,1)
    # Query validity, broadcast across all key columns.
    q = q.ne(self.src_pad_idx).unsqueeze(1).unsqueeze(3)
    q = q.repeat(1,1,1,len_k)
    mask = k & q
    return mask


# training

In [30]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [31]:
# Positional arguments, in order:
# model_dims, ff_dims, num_heads, num_layers, embedding_dims, len_vocab, dropout_p, src_pad_idx, max_length
# The instance is moved to the device selected earlier.
bert_model = BERT(150, 150, 3, 3, 150, 30000, 0.1, 0, 150).to(device)

In [32]:
# Training hyper-parameters.
epochs = 5
lr = 1e-4
# NOTE(review): Adam's weight_decay is added to the gradient (L2 penalty); if decoupled
# weight decay is intended, torch.optim.AdamW would be the usual choice. Confirm.
optimizer = torch.optim.Adam(bert_model.parameters(), lr=lr, weight_decay=0.01)
# MLM labels equal to -1 (unselected and padded positions) are ignored by this loss.
criterion_mlm = nn.CrossEntropyLoss(ignore_index=-1)
# Loss for the two-class sentence-pair task.
criterion_cls = nn.CrossEntropyLoss()

In [33]:
def train(model, optimizer, criterion1, criterion2, dataloader):
  """Run one pass over `dataloader`, updating `model` after every batch.

  Args:
    model: Module called as `model(token_ids, segment_ids, position_ids)` and
      returning `(cls_logits, mlm_logits)`.
    optimizer: Optimizer over the model's parameters.
    criterion1: Loss for the sentence-pair logits.
    criterion2: Loss for the MLM logits; it receives the logits with the
      class axis moved to position 1.
    dataloader: Iterable of `(segment_ids, position_ids, token_id_masked,
      label_masked, is_next_label)` tensor tuples.

  Returns:
    The mean of the per-batch summed losses as a float, or NaN when the
    dataloader yields no batch.
  """
  # Use the device the model lives on instead of a notebook-level global.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')

  model.train()
  losses = []
  for i, data in enumerate(dataloader):
    optimizer.zero_grad()

    segment_ids, position_ids, token_id_masked, label_masked, is_next_label = (
        tensor.to(target_device) for tensor in data)

    cls_logits, mlm_logits = model(token_id_masked, segment_ids, position_ids)
    # reshape(-1) keeps a 1-D target even for a batch of one sample, where a
    # bare squeeze() would collapse it to a 0-dim tensor.
    loss_cls = criterion1(cls_logits, is_next_label.reshape(-1))
    loss_mlm = criterion2(mlm_logits.transpose(1, 2), label_masked)
    loss = loss_cls + loss_mlm
    losses.append(loss.item())

    loss.backward()
    optimizer.step()
    if i % 100 == 0:
      print('batch = ', i, ' loss = ', sum(losses) / len(losses))

  if not losses:
    return float('nan')
  return sum(losses) / len(losses)



In [34]:
# Run the configured number of passes over the dataloader and report each pass's mean loss.
for epoch in range(epochs):
  avg_loss = train(bert_model, optimizer, criterion_cls, criterion_mlm, input_dataloader)
  print('epoch = ', epoch, ' avg_loss = ', avg_loss)

batch =  0  loss =  11.16948127746582
batch =  100  loss =  10.977680952242105
batch =  200  loss =  10.80120294959984
batch =  300  loss =  10.650647093687343
batch =  400  loss =  10.527114309277618
batch =  500  loss =  10.430378525556918
batch =  600  loss =  10.352009100445892
batch =  700  loss =  10.288632622799078
batch =  800  loss =  10.236796348133636
batch =  900  loss =  10.194970320385119
batch =  1000  loss =  10.158831773580728
batch =  1100  loss =  10.128700897327668
batch =  1200  loss =  10.103612844989659
batch =  1300  loss =  10.080373827811849
batch =  1400  loss =  10.060018546235808
batch =  1500  loss =  10.043376201474612
batch =  1600  loss =  10.02808511115699
batch =  1700  loss =  10.014936727471103
batch =  1800  loss =  10.003332691415027
batch =  1900  loss =  9.992277964110878
batch =  2000  loss =  9.982355090155117
batch =  2100  loss =  9.973446166498329
batch =  2200  loss =  9.964543417983032
batch =  2300  loss =  9.956702862755726
batch =  240

KeyboardInterrupt: ignored