In [1]:
# Turn off Jedi for the IPython tab completer.
%config Completer.use_jedi = False

# Data

## Tokenizer

In [3]:
import os
from pathlib import Path
from tempfile import gettempdir
from zipfile import ZipFile

import boto3
from botocore import UNSIGNED
from botocore.client import Config


def _download(bucket: str, key: str, target) -> str:
    """Fetch an S3 object into a local cache and return the cached file's path.

    The object is stored at ``<system temp dir>/kobert/<target>``. When that
    file already exists it is reused and nothing is downloaded. Requests are
    sent unsigned, so no AWS credentials are involved.

    The payload is first written to a sibling ``.part`` file and renamed to its
    final name only after the transfer has finished. An interrupted download
    therefore never leaves a truncated file behind that a later call would
    mistake for a valid cache entry.

    :param bucket: name of the S3 bucket
    :param key: key of the object inside the bucket
    :param target: file name (relative to the cache directory) to store it under
    :return: path of the cached file, as a string
    :raises Exception: any error raised while downloading is propagated after
        the partial file has been removed
    """
    cache_path = Path(gettempdir()) / "kobert" / target

    # Cache hit: nothing to do.
    if cache_path.exists():
        return str(cache_path)

    # exist_ok avoids the check-then-create race of a manual exists() test.
    cache_path.parent.mkdir(parents=True, exist_ok=True)

    s3 = boto3.client(
        "s3",
        aws_access_key_id=None,
        aws_secret_access_key=None,
        config=Config(signature_version=UNSIGNED),
    )

    partial_path = cache_path.with_name(cache_path.name + ".part")
    try:
        with open(partial_path, "wb") as f:
            s3.download_fileobj(bucket, key, f)
        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, cache_path)
    except BaseException:
        # Do not leave a truncated download lying around.
        if partial_path.exists():
            partial_path.unlink()
        raise
    return str(cache_path)


def download_tokenizer():
    """Download the KoBERT SentencePiece tokenizer model, or reuse the cached copy.

    :return: whatever ``_download`` returns for the cached
        ``kobert-news-wiki.spiece`` file (its local path)
    """
    return _download(
        "skt-lsl-nlp-model",
        "KoBERT/tokenizers/kobert_news_wiki_ko_cased-1087f8699e.spiece",
        "kobert-news-wiki.spiece",
    )


# Fetch the tokenizer model (reused from the cache when already present) and
# show where it is stored locally.
tokenizer_path = download_tokenizer()
print("Downloaded Tokenizer Path:", tokenizer_path)

Downloaded Tokenizer Path: /tmp/kobert/kobert-news-wiki.spiece


## Load a SentencePiece Model

In [5]:
import sentencepiece as stp

# The raw SentencePiece processor gets its own name. The GluonNLP tokenizer
# created in a later cell is bound to `tokenizer`, and sharing that name would
# make the notebook's behavior depend on which cell happened to run last.
sp_tokenizer = stp.SentencePieceProcessor(model_file=tokenizer_path)

sample_text = "치킨은 맛있다"

# Encode and EncodeAsIds both yield piece ids; EncodeAsPieces yields the pieces.
print("Encode         :", sp_tokenizer.Encode(sample_text))
print("EncodeAsIds    :", sp_tokenizer.EncodeAsIds(sample_text))
print("EncodeAsPieces :", sp_tokenizer.EncodeAsPieces(sample_text))
# Round trip: ids back to text.
print("Decode         :", sp_tokenizer.Decode(sp_tokenizer.Encode("치킨")))

Encode         : [4617, 7576, 7086, 1967, 7143]
EncodeAsIds    : [4617, 7576, 7086, 1967, 7143]
EncodeAsPieces : ['▁치', '킨', '은', '▁맛', '있다']
Decode         : 치킨


## Load a SentencePiece Model with GluonNLP

In [9]:
import gluonnlp as gnlp

# Build the vocabulary from the same SentencePiece model file, with "[PAD]" as
# the padding token.
vocab = gnlp.vocab.BERTVocab.from_sentencepiece(
    tokenizer_path, padding_token="[PAD]"
)
# GluonNLP tokenizer backed by that model file and vocabulary.
# NOTE(review): lower=False presumably disables lower-casing of the input
# (the model file name contains "cased") — confirm against the GluonNLP docs.
tokenizer = gnlp.data.BERTSPTokenizer(tokenizer_path, vocab, lower=False)

In [10]:
# Text -> pieces -> ids, then ids -> pieces again through the vocabulary.
tokens = tokenizer("치킨은 맛있다")
ids = tokenizer.convert_tokens_to_ids(tokens)
# vocab.to_tokens maps each id back to its piece; it does not join the pieces
# into a sentence (see the printed output).
decodes = vocab.to_tokens(ids)

print("EncodeAsPieces:", tokens)
print("EncodeAsIds   :", ids)
print("Decode        :", decodes)

EncodeAsPieces: ['▁치', '킨', '은', '▁맛', '있다']
EncodeAsIds   : [4617, 7576, 7086, 1967, 7143]
Decode        : ['▁치', '킨', '은', '▁맛', '있다']


# Transforms

Reference — GluonNLP data API: https://nlp.gluon.ai/api/modules/data.html

## BERTSentenceTransform

In [87]:
# Sentence -> model inputs. The printed output below shows the result for one
# sentence: ids starting with [CLS], ending with [SEP], then [PAD] up to 64.
transform = gnlp.data.BERTSentenceTransform(
    tokenizer,  # gluonnlp.data.transforms.BERTSPTokenizer
    max_seq_length=64,  # length of the sentence (the output is padded to it)
    pad=True,
    pair=False,
)

text = "하나님이 세상을 이처럼 사랑하사 독생자를 주셨으니 이는 그를 믿는 자마다 멸망하지 않고 영생을 얻게 하려 하심이라"
# The transform takes a one-element list and returns three values.
token_ids, valid_length, segment_ids = transform([text])
print('[token_ids]\n', token_ids)
print('\n[valid_length]\n', valid_length)
print('\n[segment_ids]\n', segment_ids)
# Map the ids back to pieces to make the special tokens and padding visible.
print('\n[id -> token]\n', tokenizer.vocab.to_tokens(token_ids.tolist()))


[token_ids]
 [   2 4928 5778 7096 2812 3748 2590 7782 6493 1725 6542 7158 4213 6604
 7076 3658 1185 6116  517 6266 5760 3886 6142  517 6202 6165 7819 3149
 3376 6542 7088  517 6869 5400  517 7806 4924 6745 7101    3    1    1
    1    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1    1    1    1    1    1    1]

[valid_length]
 40

[segment_ids]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]

[id -> token]
 ['[CLS]', '▁하나', '님', '이', '▁세상을', '▁이처럼', '▁사랑', '하', '사', '▁독', '생', '자를', '▁주', '셨', '으니', '▁이는', '▁그', '를', '▁', '믿', '는', '▁자', '마다', '▁', '멸', '망', '하지', '▁않고', '▁영', '생', '을', '▁', '얻', '게', '▁', '하려', '▁하', '심', '이라', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']


# Model

In [18]:
import torch
from transformers import BertModel


def download_kobert(ctx: str = "cpu"):
    """Download the KoBERT weights and load them as a ``BertModel`` in eval mode.

    The archive ``kobert_v1.zip`` is fetched through ``_download`` and
    extracted next to the downloaded file on every call; the model is then
    loaded from the extracted ``kobert_from_pretrained`` directory.

    :param ctx: device string passed to ``torch.device``, e.g. "cpu" or "cuda:0"
    :return: the loaded ``BertModel``, moved to ``ctx`` and switched to eval mode
    """
    device = torch.device(ctx)
    bucket = "skt-lsl-nlp-model"
    key = "KoBERT/models/kobert_v1.zip"
    zip_path = Path(_download(bucket, key, "kobert_v1.zip"))

    # The context manager closes the archive's file handle after extraction.
    with ZipFile(zip_path) as zipf:
        zipf.extractall(path=zip_path.parent)

    model_path = zip_path.parent / "kobert_from_pretrained"
    # return_dict=False makes the model return plain tuples.
    bertmodel = BertModel.from_pretrained(model_path, return_dict=False)
    bertmodel.to(device)
    bertmodel.eval()
    return bertmodel


# Load KoBERT on the default device ("cpu") and display the model's type.
bert = download_kobert()
type(bert)

transformers.models.bert.modeling_bert.BertModel

In [117]:
import torch
import torch.nn as F

# Device that predict() moves its input tensors to.
device = torch.device("cpu")

def predict(text):
    """Run KoBERT on one sentence and return the model's second output.

    The tokenization, the ids actually fed to the model, and the valid length
    are printed along the way so they can be inspected.

    Uses the notebook-level ``tokenizer``, ``transform``, ``bert`` and
    ``device`` objects.

    :param text: one-element list holding the sentence, e.g. ``["some text"]``
    :return: the second element of the tuple returned by ``bert``
        (presumably the pooled sentence embedding with a leading batch
        dimension of 1 — TODO confirm), computed with autograd disabled
    """
    # Tokenize once and reuse the pieces for both debug prints.
    pieces = tokenizer(text[0])
    print('Text          :', pieces)
    print('expected token:', tokenizer.convert_tokens_to_ids(pieces))

    token_ids, valid_length, segment_ids = transform(text)
    valid_length = int(valid_length)

    # Build integer tensors directly (no detour through float32) and add the
    # batch dimension the model expects.
    token_ids = torch.as_tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    segment_ids = torch.as_tensor(segment_ids, dtype=torch.long).unsqueeze(0).to(device)
    print('actual   token:', token_ids.tolist())
    print('valid_length  :', valid_length)
    print()

    # 1 for the first `valid_length` positions, 0 for the rest.
    attention_mask = torch.zeros_like(token_ids, dtype=torch.float)
    attention_mask[0, :valid_length] = 1

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        _, out = bert(
            input_ids=token_ids,
            token_type_ids=segment_ids,
            attention_mask=attention_mask,
        )
    return out


# NOTE: in this notebook `F` is an alias for `torch.nn` (not
# `torch.nn.functional`), so this creates a CosineSimilarity module.
cos_f = F.CosineSimilarity()

# Two sentences on unrelated topics.
a = predict(["손흥민은 대한민국 국적의 토트넘 홋스퍼 FC 소속 축구선수"])
b = predict(["최상위 제품인 라이젠 9 7950X3D는 16코어, 32스레드로 작동하며 최대 작동 클록은 5.7GHz다"])

print("similarity:", cos_f(a, b).item())
# Mean (not sum) of the squared differences, so the value matches its "MSE" label.
print("MSE :", ((b - a) ** 2).mean().item())

Text          : ['▁손흥민', '은', '▁대한민국', '▁국', '적', '의', '▁토', '트', '넘', '▁', '홋', '스', '퍼', '▁', 'FC', '▁소속', '▁축구', '선수']
expected token: [2866, 7086, 1683, 1132, 7202, 7095, 4737, 7659, 5698, 517, 0, 6664, 7706, 517, 286, 2837, 4562, 6562]
actual   token: [[2, 2866, 7086, 1683, 1132, 7202, 7095, 4737, 7659, 5698, 517, 0, 6664, 7706, 517, 286, 2837, 4562, 6562, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
valid_length  : 20

Text          : ['▁최', '상', '위', '▁제품', '인', '▁', '라이', '젠', '▁9', '▁', '79', '50', 'X', '3', 'D', '는', '▁16', '코', '어', '▁', ',', '▁32', '스', '레드', '로', '▁작동', '하며', '▁최대', '▁작동', '▁클', '록', '은', '▁5', '▁', '.', '▁7', 'G', 'H', 'z', '다']
expected token: [4519, 6527, 7044, 4158, 7119, 517, 6011, 7241, 627, 517, 218, 176, 359, 142, 278, 5760, 545, 7533, 6855, 517, 46, 597, 6664, 6051, 6079, 3934, 7810, 4527, 3934, 4689, 6083, 7086, 611, 517, 54, 621, 290, 294, 459, 5782]
actua

In [109]:
# Short input: inspect the tokenization and padding printed by predict().
a = predict(["소떡 맛나 치킨"])

Text          : ['▁소', '떡', '▁맛', '나', '▁치', '킨']
expected token: [2822, 5970, 1967, 5655, 4617, 7576]
actual   token: [[2, 2822, 5970, 1967, 5655, 4617, 7576, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
valid_length  : 8


In [103]:
# Same kind of check with a space between every syllable, to see how the
# tokenization changes.
a = predict(["간 장 치 킨 피 자"])

Text          : ['▁간', '▁장', '▁치', '▁', '킨', '▁피', '▁자']
expected token: [777, 3954, 4617, 517, 7576, 4909, 3886]
actual   token: [[2, 777, 3954, 4617, 517, 7576, 4909, 3886, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
valid_length  : 9
