# 패키지 설치

In [None]:
!pip install ratsnlp

# 토크나이저 초기화

[Beomi의 KcBERT](https://github.com/Beomi/KcBERT)를 사용한다.


In [2]:
from transformers import BertTokenizer

In [3]:
# WordPiece를 기반으로한 BERT 토크나이저를 구성한다.
tokenizer = BertTokenizer.from_pretrained(
    "beomi/kcbert-base",
    do_lower_case=False,
)

Downloading:   0%|          | 0.00/250k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/619 [00:00<?, ?B/s]

# 모델 초기화

In [4]:
from transformers import BertConfig, BertModel

In [5]:
# BERT 모델을 인스턴스화하고 모델 아키텍처를 정의하는 데 사용한다.
pretrained_model_config = BertConfig.from_pretrained(
    "beomi/kcbert-base"
)

# 사전학습시의 모델을 초기화한뒤 체크포인트를 읽어들인다.
model = BertModel.from_pretrained(
    "beomi/kcbert-base",
    config=pretrained_model_config,
)

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/kcbert-base were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


pretrained_model_config 내용 확인

In [6]:
pretrained_model_config

BertConfig {
  "_name_or_path": "beomi/kcbert-base",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 300,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30000
}

# 모델 입력값 만들기

In [7]:
sentences = ["안녕하세요", "처음뵙겠습니다"]
features = tokenizer(
    sentences,
    max_length=10,
    padding="max_length",
    truncation=True,
)

features의 내용 확인

In [8]:
features.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

토큰 인덱스로 변환한 결과

In [9]:
features['input_ids']

[[2, 19017, 8482, 3, 0, 0, 0, 0, 0, 0],
 [2, 8793, 6758, 9560, 3, 0, 0, 0, 0, 0]]

패딩인 토큰의 위치를 확인

In [10]:
features['attention_mask']

[[1, 1, 1, 1, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]

세그먼트 정보 확인

In [11]:
features['token_type_ids']

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

# BERT 임베딩 추출

features를 Tensor로 변환 후 모델에 입력

In [12]:
import torch

In [13]:
features = {k: torch.tensor(v) for k, v in features.items()}

outputs = model(**features)

BERT 마지막 레이어의 단어 수준 벡터들을 확인

In [18]:
outputs.last_hidden_state

tensor([[[-0.6969, -0.8248,  1.7512,  ..., -0.3732,  0.7399,  1.1907],
         [-1.4803, -0.4398,  0.9444,  ..., -0.7405, -0.0211,  1.3064],
         [-1.4299, -0.5033, -0.2069,  ...,  0.1285, -0.2611,  1.6057],
         ...,
         [-1.4406,  0.3431,  1.4043,  ..., -0.0565,  0.8450, -0.2170],
         [-1.3625, -0.2404,  1.1757,  ...,  0.8876, -0.1054,  0.0734],
         [-1.4244,  0.1518,  1.2920,  ...,  0.0245,  0.7572,  0.0080]],

        [[ 0.6129, -0.2066,  1.0601,  ...,  0.5807,  0.1389,  1.0418],
         [ 1.0833,  0.9731, -0.4916,  ...,  0.3177, -0.5573,  2.6693],
         [ 1.1806,  0.8949,  0.2606,  ..., -0.6885,  0.1624,  0.9208],
         ...,
         [ 0.2076,  0.9641,  0.0618,  ...,  0.5240, -0.0198,  0.7647],
         [ 1.9015,  1.4405,  0.2099,  ..., -0.5322,  0.9371,  1.2724],
         [ 0.1634,  1.8192, -0.1976,  ...,  0.4398,  0.1274, -0.2931]]],
       grad_fn=<NativeLayerNormBackward0>)

In [19]:
outputs.last_hidden_state.shape

# 2개의 문장, 시퀀스 길이 10, 768차원의 벡타(hidden_size = 768)

torch.Size([2, 10, 768])

BERT 마지막 레이어의 문서 수준 벡터를 확인

In [20]:
outputs.pooler_output

tensor([[-0.1594,  0.0547,  0.1101,  ...,  0.2684,  0.1596, -0.9828],
        [-0.3123,  0.2789, -0.3831,  ..., -0.3077, -0.0855, -0.9758]],
       grad_fn=<TanhBackward0>)

In [22]:
outputs.pooler_output.shape

# 1개의 문서(2개의 문장), 768차원의 벡타(hidden_size = 768)

torch.Size([2, 768])