In [2]:
# !pip install transformers

In [1]:
import numpy as np
import tensorflow as tf
from transformers import TFBertForMaskedLM, AutoTokenizer



# transformers의 TFBertForMaskedLM 라이브러리 이용해서 빈칸 채우기

In [3]:
# bert 기본 소문자 모델
model = TFBertForMaskedLM.from_pretrained('bert-base-uncased')

All model checkpoint layers were used when initializing TFBertForMaskedLM.

All the layers of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


In [2]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [6]:
# word2ind와 ind2word
vocab = tokenizer.get_vocab()

print(len(vocab))
vocab

30522


{'detroit': 5626,
 'longtime': 11155,
 'onion': 20949,
 'transporting': 18276,
 '##見': 30474,
 'grins': 20237,
 'high': 2152,
 'rejection': 13893,
 'coordinate': 13530,
 'sucker': 26476,
 '340': 16029,
 'fen': 21713,
 'ponce': 21085,
 'theorists': 28442,
 '##mada': 23574,
 'bk': 23923,
 '[unused303]': 308,
 'pas': 14674,
 'nicola': 17388,
 'harp': 14601,
 'psychiatry': 18420,
 'cy': 22330,
 '##sin': 11493,
 'fowler': 14990,
 '[unused275]': 280,
 'grandpa': 15310,
 '₈': 1555,
 'occasional': 8138,
 '##mouth': 14359,
 'repression': 22422,
 'finalized': 23575,
 '##●': 30146,
 'schuster': 24253,
 'discontinued': 8944,
 'α': 1155,
 'warrior': 6750,
 'slot': 10453,
 'hancock': 13849,
 'pennsylvania': 3552,
 'hayes': 10192,
 'subsidiary': 7506,
 '##pw': 28400,
 'gotten': 5407,
 'universe': 5304,
 'noah': 7240,
 'cited': 6563,
 'adopt': 11092,
 'surveying': 19654,
 '##dier': 24612,
 'hyun': 21108,
 'ivy': 7768,
 'convergence': 19143,
 'flotilla': 17150,
 'citizenship': 9068,
 '##thy': 16921,
 '

In [15]:
vocab['[MASK]']

103

In [10]:
vocab['detroit']

5626

In [7]:
# index to word
ind2word = {i: word for word, i in vocab.items()}

In [9]:
ind2word[5626]

'detroit'

In [16]:
ind2word[103]

'[MASK]'

In [18]:
ind2word[102]

'[SEP]'

In [19]:
ind2word[101]

'[CLS]'

In [11]:
tokenizer.mask_token

'[MASK]'

In [20]:
# 문장 토큰화
inputs = tokenizer.encode('Pizza is my [MASK] food.', return_tensors='tf')

inputs

<tf.Tensor: shape=(1, 8), dtype=int32, numpy=
array([[  101, 10733,  2003,  2026,   103,  2833,  1012,   102]],
      dtype=int32)>

In [22]:
inputs.numpy()

array([[  101, 10733,  2003,  2026,   103,  2833,  1012,   102]],
      dtype=int32)

In [23]:
for word in inputs.numpy()[0]:
    print(ind2word[word])

[CLS]
pizza
is
my
[MASK]
food
.
[SEP]


In [None]:
# [MASK]는 4번 토큰 -> 4번 토큰을 예측하는 문제

In [24]:
# 단어 집합 개수
tokenizer.vocab_size

30522

In [25]:
# 특수 토큰 cls : 문장의 시작
print(tokenizer.cls_token_id)

101


In [26]:
# 특수 토큰 sep : 문장 구분
print(tokenizer.sep_token_id)

102


In [27]:
# 토큰화된 문장을 model에 넣기
result = model(inputs)
result

TFMaskedLMOutput(loss=None, logits=<tf.Tensor: shape=(1, 8, 30522), dtype=float32, numpy=
array([[[ -6.5892477,  -6.545541 ,  -6.550172 , ...,  -5.9745717,
          -5.7345476,  -4.015825 ],
        [ -7.989972 ,  -7.8295565,  -7.847025 , ...,  -7.580194 ,
          -6.8277597,  -6.4859447],
        [-12.688309 , -12.100912 , -12.303783 , ..., -10.958016 ,
          -9.478661 , -11.172879 ],
        ...,
        [-10.789201 , -11.01828  , -10.619027 , ...,  -9.310557 ,
          -7.95036  , -12.606779 ],
        [-11.8699875, -11.202811 , -11.640639 , ...,  -8.015509 ,
          -9.464375 ,  -7.8128185],
        [-16.216164 , -16.04058  , -16.043331 , ..., -16.037857 ,
         -14.760496 , -13.951068 ]]], dtype=float32)>, hidden_states=None, attentions=None)

In [29]:
# 문장의 로짓(=확률) 값
logits = result[0]
logits

<tf.Tensor: shape=(1, 8, 30522), dtype=float32, numpy=
array([[[ -6.5892477,  -6.545541 ,  -6.550172 , ...,  -5.9745717,
          -5.7345476,  -4.015825 ],
        [ -7.989972 ,  -7.8295565,  -7.847025 , ...,  -7.580194 ,
          -6.8277597,  -6.4859447],
        [-12.688309 , -12.100912 , -12.303783 , ..., -10.958016 ,
          -9.478661 , -11.172879 ],
        ...,
        [-10.789201 , -11.01828  , -10.619027 , ...,  -9.310557 ,
          -7.95036  , -12.606779 ],
        [-11.8699875, -11.202811 , -11.640639 , ...,  -8.015509 ,
          -9.464375 ,  -7.8128185],
        [-16.216164 , -16.04058  , -16.043331 , ..., -16.037857 ,
         -14.760496 , -13.951068 ]]], dtype=float32)>

In [30]:
logits[0][4]  # 0번째 문장, 4번 토큰(=MASK)의 단어 별 logits(=확률)

<tf.Tensor: shape=(30522,), dtype=float32, numpy=
array([-5.408284 , -5.7645416, -5.3103952, ..., -5.7365346, -4.585004 ,
       -4.6279488], dtype=float32)>

In [31]:
# 가장 확률값이 높은 단어 k개
k_words = tf.math.top_k(logits[0][4], k=5).indices.numpy()
k_words

array([ 5440,  8837,  7216,  6871, 18785], dtype=int32)

In [32]:
for i in range(0, len(k_words)):
    print(ind2word[k_words[i]])

favorite
favourite
comfort
preferred
staple


In [33]:
# 토큰 디코드(index2word)
tokenizer.decode(k_words)

'favorite favourite comfort preferred staple'

# transformers의 pipeline 이용해서 빈칸 채우기

In [34]:
from transformers import pipeline

In [35]:
# bert 모델 다운받아서 학습하기

In [36]:
pip = pipeline('fill-mask', model='bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [37]:
pip('Pizza is my [MASK] food')

[{'score': 0.9569321870803833,
  'token': 5440,
  'token_str': 'favorite',
  'sequence': 'pizza is my favorite food'},
 {'score': 0.032709430903196335,
  'token': 8837,
  'token_str': 'favourite',
  'sequence': 'pizza is my favourite food'},
 {'score': 0.0019699952099472284,
  'token': 7216,
  'token_str': 'comfort',
  'sequence': 'pizza is my comfort food'},
 {'score': 0.0017387170810252428,
  'token': 6871,
  'token_str': 'preferred',
  'sequence': 'pizza is my preferred food'},
 {'score': 0.0011286975350230932,
  'token': 2069,
  'token_str': 'only',
  'sequence': 'pizza is my only food'}]

In [113]:
# 원하는 모델과 토크나이저 지정해서 학습하기

In [38]:
from transformers import FillMaskPipeline

In [39]:
pip2 = FillMaskPipeline(model=model, tokenizer=tokenizer)

In [40]:
pip2('Pizza is my [MASK] food')

[{'score': 0.9569203853607178,
  'token': 5440,
  'token_str': 'favorite',
  'sequence': 'pizza is my favorite food'},
 {'score': 0.03270899876952171,
  'token': 8837,
  'token_str': 'favourite',
  'sequence': 'pizza is my favourite food'},
 {'score': 0.0019699616823345423,
  'token': 7216,
  'token_str': 'comfort',
  'sequence': 'pizza is my comfort food'},
 {'score': 0.001738694030791521,
  'token': 6871,
  'token_str': 'preferred',
  'sequence': 'pizza is my preferred food'},
 {'score': 0.0011286772787570953,
  'token': 2069,
  'token_str': 'only',
  'sequence': 'pizza is my only food'}]