## BERT 한국어 모델

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("klue/bert-base")



In [11]:
text = "나는 내일 야구를 관람할 예정입니다."

words = tokenizer.tokenize(text)
print(words)

['나', '##는', '내일', '야구', '##를', '관람', '##할', '예정', '##입니다', '.']


In [12]:
msk_idx = 3
words[msk_idx] = "[MASK]"
print(words)

['나', '##는', '내일', '[MASK]', '##를', '관람', '##할', '예정', '##입니다', '.']


In [13]:
import torch

word_ids = tokenizer.convert_tokens_to_ids(words)
word_tensor = torch.tensor([word_ids])
print(word_tensor)

tensor([[  717,  2259,  5420,     4,  2138,  5607,  2085,  3834, 12190,    18]])


In [14]:
from transformers import BertForMaskedLM

msk_model = BertForMaskedLM.from_pretrained("klue/bert-base")

msk_model.eval()



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [15]:
x = word_tensor
y = msk_model(x)
result = y[0]
print(result.size())


torch.Size([1, 10, 32000])


In [17]:
_, max_ids = torch.topk(result[0][msk_idx], k=5)
result_words = tokenizer.convert_ids_to_tokens(max_ids.tolist())
print(result_words)

['영화', '경기', '드라마', '전시', '전시회']


In [18]:
# 문장이 연속되어 있는지 판정
from transformers import BertForNextSentencePrediction

nsp_model = BertForNextSentencePrediction.from_pretrained("klue/bert-base")
nsp_model.eval()


Some weights of the model checkpoint at klue/bert-base were not used when initializing BertForNextSentencePrediction: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForNextSentencePrediction(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [19]:
def show_continuity(text1, text2):
    tokenized = tokenizer(text1, text2, return_tensors="pt")
    print("Tokenized:", tokenized)
    
    y = nsp_model(**tokenized)
    print("Result:", y)
    pred = torch.softmax(y.logits, dim=1)
    print(str(pred[0][0].item()*100) + "% 확률로 연속되는 문장입니다.")

In [29]:
text1 = "나는 내일 야구를 관람할 예정입니다."
text2 = "그냥 그렇다고요."
show_continuity(text1, text2)

Tokenized: {'input_ids': tensor([[    2,   717,  2259,  5420,  4878,  2138,  5607,  2085,  3834, 12190,
            18,     3,  4181,  3649,  4683,  2182,    18,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
Result: NextSentencePredictorOutput(loss=None, logits=tensor([[-1.0482,  1.4184]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
7.823008298873901% 확률로 연속되는 문장입니다.
