### Default Setting

In [5]:
import os
import pandas as pd 
import numpy as np 
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

In [2]:
default_path = os.getcwd()

In [19]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

### Hugging Face Embedding

In [41]:
model_name = 'kakaobank/kf-deberta-base'

In [42]:
sentences = ["새로운 세상"]

In [43]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [44]:
len(tokenizer.vocab)

130000

In [45]:
tokenizer

BertTokenizerFast(name_or_path='kakaobank/kf-deberta-base', vocab_size=130000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [46]:
# Tokenize sentences
encoded_input = tokenizer("새로운 세상", padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("새로운 세상")
print(sentence_embeddings[0][:5], np.shape(sentence_embeddings))

새로운 세상
tensor([-0.5292,  0.8925, -1.7507, -0.4305,  0.0954]) torch.Size([1, 768])


In [27]:
tokenized_sequence = tokenizer.tokenize(sentences[0])
tokenized_sequence

['새로운', '세상']

In [28]:
encoded_token = tokenizer.encode(sentences[0])
encoded_token

[2, 750, 1168, 3]

In [29]:
tokenizer.convert_ids_to_tokens(encoded_token)

['[CLS]', '새로운', '세상', '[SEP]']

In [30]:
tokenizer.decode(encoded_token)

'[CLS] 새로운 세상 [SEP]'

### Sentence Transformers Embedding

In [31]:
# sentences = ["한국어 문장 임베딩을 위한 버트 모델입니다."]

model = SentenceTransformer(model_name)
embeddings = model.encode(sentences)
print(embeddings[0][:5])

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/kakaobank_kf-deberta-base. Creating a new one with MEAN pooling.


[-0.5291525   0.8925469  -1.7507113  -0.4304517   0.09544729]


### Llama Index

In [37]:
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.embeddings import InstructorEmbedding

model_name = 'kakaobank/kf-deberta-base'
embed_model = HuggingFaceEmbedding(model_name=model_name)
# embed_model = InstructorEmbedding(model_name=model_name)

In [38]:
embeddings = embed_model.get_text_embedding("새로운 세상")
print(len(embeddings), np.shape(embeddings))
print(embeddings)

768 (768,)
[-0.022232957184314728, 0.031181715428829193, -0.05263470858335495, 0.01744307205080986, 0.0280600693076849, 0.031108316034078598, 0.0047212145291268826, -0.005315764807164669, 0.02909689024090767, -0.008604301139712334, 0.059841595590114594, -0.013541297987103462, -0.060027141124010086, 0.038263797760009766, 0.010539392940700054, -0.06019851937890053, 0.013333876617252827, 0.008127684704959393, -0.01922658644616604, 0.011793178506195545, 0.010151952505111694, 0.0355193130671978, 0.06154009699821472, 0.03740737587213516, -0.024221191182732582, -0.044349364936351776, -0.056973155587911606, -0.02125147357583046, 0.05791948363184929, -0.00566586060449481, -0.049670640379190445, -0.05035970360040665, 0.019648635759949684, -0.01324515137821436, 0.025804325938224792, 0.007650724146515131, 0.06902992725372314, 0.041770245879888535, -0.014580021612346172, 0.03781870752573013, -0.022282419726252556, -0.02575654909014702, 0.03679138422012329, -0.042037222534418106, 0.01446717418730259

### langchain

In [35]:
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index import ServiceContext

embed_model = HuggingFaceBgeEmbeddings(model_name=model_name)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
service_context

No sentence-transformers model found with name /root/.cache/torch/sentence_transformers/kakaobank_kf-deberta-base. Creating a new one with MEAN pooling.


ValueError: 
******
Could not load OpenAI model. If you intended to use OpenAI, please check your OPENAI_API_KEY.
Original error:
No API key found for OpenAI.
Please set either the OPENAI_API_KEY environment variable or openai.api_key prior to initialization.
API keys can be found or created at https://platform.openai.com/account/api-keys

To disable the LLM entirely, set llm=None.
******

In [None]:
embeddings = embed_model.embed_query(sentences[0])
np.shape(embeddings), embeddings[:5]