# Sentence-BERT Without Fine-Tuning

In [33]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity

In [43]:
# Pre-trained embedding checkpoint, used as-is (no fine-tuning).
# Author's note: chosen as faster and good enough for product matching.
# Other checkpoints listed as candidates:
#   - snunlp/KR-SBERT-V40K-klueNLI-augSTS
#   - BAAI/bge-small-en
#   - BM-K/KoSimCSE-roberta   (currently selected)
#   - snunlp/KR-BERT
model_name = "BM-K/KoSimCSE-roberta"

# The tokenizer and the encoder are loaded from the same checkpoint name so
# they stay in sync; the tuple is evaluated left to right, so the tokenizer is
# still loaded first.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

In [60]:
# Example product names whose embeddings are compared.
product1 = '신라면'
product2 = '농심 너구리'

texts = [product1, product2]

# Tokenize both texts as a single batch. The returned attention mask is used
# below to tell real tokens apart from padding.
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get embeddings. no_grad: this is inference only, so no autograd graph is
# needed.
with torch.no_grad():
    outputs = model(**inputs)
    # Mean pooling: average the token vectors of each text, counting only the
    # positions where the attention mask is 1.
    # NOTE(review): assumes last_hidden_state is (batch, seq_len, hidden) and
    # attention_mask is (batch, seq_len) -- confirm for the chosen checkpoint.
    attention_mask = inputs['attention_mask']
    token_embeddings = outputs.last_hidden_state
    # Add a trailing axis, broadcast the mask to the embeddings' shape, and
    # cast it to the embeddings' dtype so the multiply and the division below
    # stay in one floating-point dtype instead of relying on int/float
    # promotion.
    input_mask_expanded = (
        attention_mask.unsqueeze(-1)
        .expand(token_embeddings.size())
        .to(token_embeddings.dtype)
    )
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
    # Clamp the per-text token count away from zero: a row whose mask is all
    # zeros would otherwise produce 0/0 = NaN. Rows with at least one real
    # token are unaffected because their count is >= 1.
    sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
    embeddings = sum_embeddings / sum_mask


# Compute cosine similarity between the two pooled vectors. Each is 1-D after
# indexing the batch axis, hence dim=0; .item() converts the result to a
# Python float.
similarity = cosine_similarity(embeddings[0], embeddings[1], dim=0).item()
print(similarity)

0.7095173597335815
