# Overview

The SentenceTransformers library pulls in a lot of dependencies: a container image that includes it is almost 9.8GB. So, let's look at another way to compute the embeddings.

In [1]:
%%capture
!pip install transformers==4.39.1
!pip install sentence-transformers==2.6.0

In [2]:
import torch
from transformers import AutoTokenizer, AutoModel

# Choose a transformer model (e.g., BERT)
model_name = "bert-base-uncased"

# Run on the GPU when one is available, otherwise fall back to the CPU so the
# cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

# Example sentence
sentence = "The weather is so cold in Melbourne today."

# Tokenization (inputs must live on the same device as the model)
input_ids = tokenizer.encode(sentence, return_tensors="pt").to(device)

# Transformer encoding; no_grad because we only need a forward pass
with torch.no_grad():
    outputs = model(input_ids)
    last_hidden_states = outputs.last_hidden_state

# Sentence embedding: plain mean over the token axis.
# NOTE: this ignores the attention mask, which is only acceptable here because a
# single sentence is encoded without padding; padded batches need a masked mean.
sentence_embedding = torch.mean(last_hidden_states, dim=1)

print(sentence_embedding.shape)  # Output: torch.Size([1, 768])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

torch.Size([1, 768])


# SentenceTransformers

In [3]:
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import normalize_embeddings

# Run on the GPU when one is available, otherwise fall back to the CPU so the
# cell does not crash on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the encoder directly onto the selected device.
encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", device=device)

# normalize_embeddings=True L2-normalizes the returned vector, so it can be
# compared with other unit-length embeddings via a dot product.
embeddings = encoder.encode(
    sentence,
    show_progress_bar=True,
    convert_to_tensor=True,
    normalize_embeddings=True,
).to(device)  # no-op when the tensor is already on `device`

# A single input sentence yields a 1-D tensor (no batch dimension).
print(embeddings.shape)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

torch.Size([768])


# Transformers

First, we pass our input through the transformer model; then we have to apply the right pooling operation on top of the contextualized word embeddings.

In [4]:
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, skipping masked-out positions.

    Args:
        model_output: Indexable model result whose element 0 holds the token
            embeddings.
        attention_mask: Mask tensor; positions with value 0 contribute nothing
            to the average.

    Returns:
        The sum of the mask-weighted token embeddings along dimension 1 divided
        by the summed mask (clamped to at least 1e-9 to avoid division by zero).
    """
    hidden_states = model_output[0]
    # Broadcast the mask across the embedding dimension so it can weight every feature.
    mask_weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * mask_weights).sum(dim=1)
    # A fully masked row would otherwise divide by zero.
    mask_total = mask_weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / mask_total



# Checkpoint shared by the tokenizer and the model (kept in one place so they cannot drift apart)
mpnet_model_name = 'sentence-transformers/all-mpnet-base-v2'

# Run on the GPU when one is available, consistent with the other cells;
# otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained(mpnet_model_name)
model = AutoModel.from_pretrained(mpnet_model_name).to(device)

# Tokenize sentences and move the tensors to the model's device
encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt').to(device)

# Compute token embeddings (forward pass only, so gradients are disabled)
with torch.no_grad():
    model_output = model(**encoded_input)

# Perform pooling; the attention mask keeps padding tokens out of the average
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings to unit L2 norm along the feature dimension
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

print("Sentence embeddings:")
print(sentence_embeddings.shape)


Sentence embeddings:
torch.Size([1, 768])
