### Default Setting

In [1]:
import os
import pandas as pd 
import numpy as np 
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer, util

In [2]:
default_path = os.getcwd()

In [3]:
# Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

### Hugging Face Embedding

In [15]:
model_name = 'upskyy/kf-deberta-multitask'

In [16]:
sentences = ["금리가 물가에 미치는 영향을 설명해주세요"]

In [17]:
# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/741M [00:00<?, ?B/s]

In [18]:
len(tokenizer.vocab)

130000

In [19]:
tokenizer

BertTokenizerFast(name_or_path='upskyy/kf-deberta-multitask', vocab_size=130000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [20]:
# Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling. In this case, mean pooling.
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

print("Sentence embeddings:")
print(sentence_embeddings[0][:5])

Sentence embeddings:
tensor([-0.7254, -0.8106, -0.3931, -1.5228,  0.5842])


In [21]:
tokenized_sequence = tokenizer.tokenize(sentences[0])
tokenized_sequence

['금리',
 '##가',
 '물가',
 '##에',
 '미치',
 '##는',
 '영향',
 '##을',
 '설명',
 '##해',
 '##주',
 '##세요']

In [22]:
encoded_token = tokenizer.encode(sentences[0])
encoded_token

[2,
 505,
 126001,
 1500,
 125999,
 1962,
 125998,
 608,
 126000,
 541,
 126021,
 126042,
 3516,
 3]

In [23]:
tokenizer.convert_ids_to_tokens(encoded_token)

['[CLS]',
 '금리',
 '##가',
 '물가',
 '##에',
 '미치',
 '##는',
 '영향',
 '##을',
 '설명',
 '##해',
 '##주',
 '##세요',
 '[SEP]']

In [24]:
tokenizer.decode(encoded_token)

'[CLS] 금리가 물가에 미치는 영향을 설명해주세요 [SEP]'

### Sentence Transformers Embedding

In [25]:
sentences = ["한국어 문장 임베딩을 위한 버트 모델입니다."]

model = SentenceTransformer(model_name)
embeddings = model.encode(sentences)
print(embeddings[0][:5])

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/6.64k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/118 [00:00<?, ?B/s]

(…)imilarity_evaluation_sts-dev_results.csv:   0%|          | 0.00/1.71k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/741M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/741M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)milarity_evaluation_sts-test_results.csv:   0%|          | 0.00/302 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.09M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

[ 0.93365926 -2.138992   -0.08778528 -1.0112844   0.3569459 ]


### Llama Index

In [None]:
from llama_index.embeddings import HuggingFaceEmbedding

model_name = 'upskyy/kf-deberta-multitask'
embed_model = HuggingFaceEmbedding(model_name=model_name)

### langchain

In [32]:
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index import ServiceContext

embed_model = HuggingFaceBgeEmbeddings(model_name=model_name)
service_context = ServiceContext.from_defaults(embed_model=embed_model)
service_context

ServiceContext(llm_predictor=LLMPredictor(system_prompt=None, query_wrapper_prompt=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>), prompt_helper=PromptHelper(context_window=4096, num_output=256, chunk_overlap_ratio=0.1, chunk_size_limit=None, separator=' '), embed_model=LangchainEmbedding(model_name='upskyy/kf-deberta-multitask', embed_batch_size=10, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fc3bd927370>), transformations=[SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fc3bd927370>, id_func=<function default_id_func at 0x7fc36d91da20>, chunk_size=1024, chunk_overlap=200, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')], llama_logger=<llama_index.logger.base.LlamaLogger object at 0x7fc36d79ee30>, callback_manager=<llama_index.callbacks.base.CallbackManager object at 0x7fc3bd927370>)

In [37]:
embeddings = embed_model.embed_query(sentences[0])
np.shape(embeddings), embeddings[:5]

((768,),
 [1.020362138748169,
  -1.7872436046600342,
  -0.18053998053073883,
  -0.7021831274032593,
  0.3475668728351593])