In [1]:
pip install transformers


Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/7.7 MB[0m [31m4.4 MB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/7.7 MB[0m [31m25.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m7.1/7.7 MB[0m [31m69.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.7/7.7 MB[0m [31m73.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # Bert-base의 토크나이저


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
result = tokenizer.tokenize('Here is the sentence I want embeddings for.')
print(result)


['here', 'is', 'the', 'sentence', 'i', 'want', 'em', '##bed', '##ding', '##s', 'for', '.']


In [4]:
print(tokenizer.vocab['here'])


2182


In [5]:
print(tokenizer.vocab['embeddings'])


KeyError: ignored

In [6]:
print(tokenizer.vocab['em'])


7861


In [7]:
print(tokenizer.vocab['##bed'])


8270


In [8]:
print(tokenizer.vocab['##ding'])


4667


In [9]:
print(tokenizer.vocab['##s'])


2015


In [10]:
# BERT의 단어 집합을 vocabulary.txt에 저장
with open('vocabulary.txt', 'w') as f:
  for token in tokenizer.vocab.keys():
    f.write(token + '\n')


In [11]:
df = pd.read_fwf('vocabulary.txt', header=None)
df


Unnamed: 0,0
0,[PAD]
1,[unused0]
2,[unused1]
3,[unused2]
4,[unused3]
...,...
30517,##．
30518,##／
30519,##：
30520,##？


In [12]:
print('단어 집합의 크기 :',len(df))


단어 집합의 크기 : 30522


In [13]:
df.loc[4667].values[0]


'##ding'

In [14]:
df.loc[102].values[0]


'[SEP]'

In [15]:
from transformers import TFBertForMaskedLM
from transformers import AutoTokenizer


In [16]:
model = TFBertForMaskedLM.from_pretrained('bert-large-uncased') #BERT를 마스크드 언어 모델 형태로 로드
tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased") #해당 모델이 학습되었을 당시에 사용되었던 토크나이저를 로드


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForMaskedLM.

All the weights of TFBertForMaskedLM were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [17]:
inputs = tokenizer('Soccer is a really fun [MASK].', return_tensors='tf')


In [18]:
print(inputs['input_ids']) #정수 인코딩 결과 확인


tf.Tensor([[ 101 4715 2003 1037 2428 4569  103 1012  102]], shape=(1, 9), dtype=int32)


In [19]:
print(inputs['token_type_ids']) #세그먼트 인코딩 결과 확인


tf.Tensor([[0 0 0 0 0 0 0 0 0]], shape=(1, 9), dtype=int32)


In [20]:
print(inputs['attention_mask'])


tf.Tensor([[1 1 1 1 1 1 1 1 1]], shape=(1, 9), dtype=int32)


In [21]:
from transformers import FillMaskPipeline
pip = FillMaskPipeline(model=model, tokenizer=tokenizer)


In [23]:
pip('Soccer is a really fun [MASK].') #MASK]의 위치에 들어갈 수 있는 상위 5개의 후보 단어들을 출력


[{'score': 0.7621122598648071,
  'token': 4368,
  'token_str': 'sport',
  'sequence': 'soccer is a really fun sport.'},
 {'score': 0.20341946184635162,
  'token': 2208,
  'token_str': 'game',
  'sequence': 'soccer is a really fun game.'},
 {'score': 0.012208531610667706,
  'token': 2518,
  'token_str': 'thing',
  'sequence': 'soccer is a really fun thing.'},
 {'score': 0.0018630254780873656,
  'token': 4023,
  'token_str': 'activity',
  'sequence': 'soccer is a really fun activity.'},
 {'score': 0.0013354856055229902,
  'token': 2492,
  'token_str': 'field',
  'sequence': 'soccer is a really fun field.'}]

In [24]:
pip('The Avengers is a really fun [MASK].')


[{'score': 0.256289541721344,
  'token': 2265,
  'token_str': 'show',
  'sequence': 'the avengers is a really fun show.'},
 {'score': 0.17284125089645386,
  'token': 3185,
  'token_str': 'movie',
  'sequence': 'the avengers is a really fun movie.'},
 {'score': 0.11107686907052994,
  'token': 2466,
  'token_str': 'story',
  'sequence': 'the avengers is a really fun story.'},
 {'score': 0.0724899172782898,
  'token': 2186,
  'token_str': 'series',
  'sequence': 'the avengers is a really fun series.'},
 {'score': 0.07046637684106827,
  'token': 2143,
  'token_str': 'film',
  'sequence': 'the avengers is a really fun film.'}]

In [25]:
pip('I went to [MASK] this morning.')


[{'score': 0.35730746388435364,
  'token': 2147,
  'token_str': 'work',
  'sequence': 'i went to work this morning.'},
 {'score': 0.23304426670074463,
  'token': 2793,
  'token_str': 'bed',
  'sequence': 'i went to bed this morning.'},
 {'score': 0.12845073640346527,
  'token': 2082,
  'token_str': 'school',
  'sequence': 'i went to school this morning.'},
 {'score': 0.062305748462677,
  'token': 3637,
  'token_str': 'sleep',
  'sequence': 'i went to sleep this morning.'},
 {'score': 0.04695260152220726,
  'token': 2465,
  'token_str': 'class',
  'sequence': 'i went to class this morning.'}]

In [26]:
inputs = tokenizer('축구는 정말 재미있는 [MASK]다.', return_tensors='tf')


In [27]:
print(inputs['input_ids'])


tf.Tensor(
[[  101  1465 30014 30020 29991 30014 29992 30017 30021  1464 30008 30025
  29995 30006 30022   100   103  1457 30006  1012   102]], shape=(1, 21), dtype=int32)


In [28]:
print(inputs['token_type_ids'])


tf.Tensor([[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]], shape=(1, 21), dtype=int32)


In [29]:
print(inputs['attention_mask'])


tf.Tensor([[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]], shape=(1, 21), dtype=int32)


In [30]:
pip('축구는 정말 재미있는 [MASK]다.')


[{'score': 0.27445080876350403,
  'token': 100,
  'token_str': '[UNK]',
  'sequence': '축구는 정말 다.'},
 {'score': 0.26436376571655273,
  'token': 1011,
  'token_str': '-',
  'sequence': '축구는 정말 - 다.'},
 {'score': 0.21644115447998047,
  'token': 1012,
  'token_str': '.',
  'sequence': '축구는 정말. 다.'},
 {'score': 0.10372231155633926,
  'token': 1010,
  'token_str': ',',
  'sequence': '축구는 정말, 다.'},
 {'score': 0.028204716742038727,
  'token': 1024,
  'token_str': ':',
  'sequence': '축구는 정말 : 다.'}]

In [31]:
pip('어벤져스는 정말 재미있는 [MASK]다.')


[{'score': 0.36621204018592834,
  'token': 100,
  'token_str': '[UNK]',
  'sequence': '어벤져스는 정말 다.'},
 {'score': 0.21624815464019775,
  'token': 1011,
  'token_str': '-',
  'sequence': '어벤져스는 정말 - 다.'},
 {'score': 0.1678389459848404,
  'token': 1012,
  'token_str': '.',
  'sequence': '어벤져스는 정말. 다.'},
 {'score': 0.09259095788002014,
  'token': 1010,
  'token_str': ',',
  'sequence': '어벤져스는 정말, 다.'},
 {'score': 0.03354845568537712,
  'token': 1025,
  'token_str': ';',
  'sequence': '어벤져스는 정말 ; 다.'}]

In [32]:
pip('나는 오늘 아침에 [MASK]에 출근을 했다.')


[{'score': 0.8567402958869934,
  'token': 100,
  'token_str': '[UNK]',
  'sequence': '나는 오늘 아침에 에 출근을.'},
 {'score': 0.05547768250107765,
  'token': 1010,
  'token_str': ',',
  'sequence': '나는 오늘 아침에, 에 출근을.'},
 {'score': 0.02046068385243416,
  'token': 1012,
  'token_str': '.',
  'sequence': '나는 오늘 아침에. 에 출근을.'},
 {'score': 0.016539018601179123,
  'token': 30021,
  'token_str': '##ᆫ',
  'sequence': '나는 오늘 아침엔 에 출근을.'},
 {'score': 0.011898699216544628,
  'token': 30022,
  'token_str': '##ᆯ',
  'sequence': '나는 오늘 아침엘 에 출근을.'}]

In [33]:
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "pizza is eaten with the use of a knife and fork. In casual settings, however, it is cut into wedges to be eaten while held in the hand."


In [34]:
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')


In [35]:
print(encoding['input_ids'])


tf.Tensor(
[[  101  1999  3304  1010 10733  2366  1999  5337 10906  1010  2107  2004
   2012  1037  4825  1010  2003  3591  4895 14540  6610  2094  1012   102
  10733  2003  8828  2007  1996  2224  1997  1037  5442  1998  9292  1012
   1999 10017 10906  1010  2174  1010  2009  2003  3013  2046 17632  2015
   2000  2022  8828  2096  2218  1999  1996  2192  1012   102]], shape=(1, 58), dtype=int32)


In [36]:
print(tokenizer.cls_token, ':', tokenizer.cls_token_id)
print(tokenizer.sep_token, ':' , tokenizer.sep_token_id)


[CLS] : 101
[SEP] : 102


In [37]:
print(tokenizer.decode(encoding['input_ids'][0]))


[CLS] in italy, pizza served in formal settings, such as at a restaurant, is presented unsliced. [SEP] pizza is eaten with the use of a knife and fork. in casual settings, however, it is cut into wedges to be eaten while held in the hand. [SEP]


In [38]:
print(encoding['token_type_ids'])


tf.Tensor(
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
  1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]], shape=(1, 58), dtype=int32)


In [40]:
import tensorflow as tf

In [41]:
logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
softmax = tf.keras.layers.Softmax()
probs = softmax(logits)
print(probs)


tf.Tensor(
[[[1.4663878e-07 1.4792737e-07 1.3966401e-07 ... 2.9870520e-07
   3.0156926e-07 2.3302587e-06]
  [9.5902127e-13 2.0978334e-12 9.9812422e-13 ... 8.4172461e-12
   3.6944799e-13 6.0049652e-12]
  [1.5615549e-12 8.6735762e-13 8.6946785e-13 ... 1.7733931e-12
   5.3415979e-13 3.2660894e-12]
  ...
  [6.8529036e-16 1.3496586e-16 2.9930357e-16 ... 9.3405118e-16
   3.0745509e-16 2.6218306e-15]
  [4.4820939e-15 7.9676730e-15 5.1744290e-15 ... 3.9187081e-14
   4.1774043e-15 4.3610667e-14]
  [2.8086382e-11 2.2646348e-11 2.2386265e-11 ... 1.0814193e-10
   2.1653597e-12 9.6843186e-11]]], shape=(1, 58, 30522), dtype=float32)


In [42]:
print('최종 예측 레이블 :', tf.math.argmax(probs, axis=-1).numpy())


최종 예측 레이블 : [[ 1996  1999  3304  1010 10733  2366  1999  5337 10906  1010  2107  2004
   2012  1037  4825  1010  2003  2366  4895  1012  6610  3013  1012 10733
  10733  2003  8828  2007  1012  2224  1997  1010  5442  1010  9292  1012
   1999 10017 10906  1010  2174  1010  2009  2003  3013  2046 17632  1010
   2000  2022  8828  2096  2218  1010  1996  2192  1012 10733]]


In [43]:
# 상관없는 두 개의 문장
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
next_sentence = "The sky is blue due to the shorter wavelength of blue light."
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')

logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]

softmax = tf.keras.layers.Softmax()
probs = softmax(logits)
print('최종 예측 레이블 :', tf.math.argmax(probs, axis=-1).numpy())


최종 예측 레이블 : [[ 1996  1999  3304  1996 10733  2366  1999  5337 10906  1996  2107  2004
   2012  1037  4825  1010  2003  2366  4895  1012  6610  2317  1012  1999
   1996  3712  2003  2630  2138  1012  1012  7820 19934  1996  2630  2422
   1012  3011]]


In [44]:
# 이어지는 두 개의 문장
prompt = "2002년 월드컵 축구대회는 일본과 공동으로 개최되었던 세계적인 큰 잔치입니다."
next_sentence = "여행을 가보니 한국의 2002년 월드컵 축구대회의 준비는 완벽했습니다."
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')

logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]

softmax = tf.keras.layers.Softmax()
probs = softmax(logits)
print('최종 예측 레이블 :', tf.math.argmax(probs, axis=-1).numpy())


최종 예측 레이블 : [[ 1012  1012  1011 30010 30021  1463  1011 30022 29993 30017 30002 30008
  30024   100  1463 30019 30022 29996 30011 30021 29991 30012  1455 30011
  30025 29993 30011 30025 29999 30017 29994 30011   100   100  1466 30017
  30021  1464 30006 30021 30001 30019 29999 30019 30024 29992 30019  1012
  30006  1012   100  1463 30010 30005 30007 30025 29999 30017 30022  1455
  30006 29996 30011 29992 30019  1469 30006  1011 29991 30014 30020 29999
  30018  2526 29992 30010 30021  1463 30015 30022 29993 30017 30002 30008
  30024   100  1464 30014 30021 29996 30019 29992 30017 30021   100  1012
   1012]]


In [45]:
# 상관없는 두 개의 문장
prompt = "2002년 월드컵 축구대회는 일본과 공동으로 개최되었던 세계적인 큰 잔치입니다."
next_sentence = "극장가서 로맨스 영화를 보고싶어요"
encoding = tokenizer(prompt, next_sentence, return_tensors='tf')

logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]

softmax = tf.keras.layers.Softmax()
probs = softmax(logits)
print('최종 예측 레이블 :', tf.math.argmax(probs, axis=-1).numpy())


최종 예측 레이블 : [[ 1996  2526  1012 30010 30021  1463 30015 30022 29993 30017 29994 30008
  30024   100  1463 30019 30022 29996 30011 30021 29991 30012  1455 30011
  30025 29993 30011 30025 29999 30017 29994 30011   100   100  1466 30017
  30021  1464 30006 30021 30001 30019 29999 30019 30024 29992 30019  1012
  30006  1012  2201  1455 30017 30020 30000 30006 30025 29991 30006 29997
  30017 29994 30011 29995 30017 30021 29997 30017  1463 30010 30025 30005
  30012 29994 30017 30022   100  1012]]
