# Text to Embedding with ModernBERT

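This notebook turns sentences into embeddings with ModernBERT: each text is tokenized, passed through the model, and represented by the hidden state of its first ([CLS]) token. Sentences are then compared with cosine similarity.

Paper: "Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference" — https://arxiv.org/abs/2412.13663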

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F
import torch

In [2]:
# Checkpoint identifier; the same value is passed to both loaders below so the
# tokenizer and the model always come from the same checkpoint.
model_id = "answerdotai/ModernBERT-base"

# Load the tokenizer that matches the checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the base model (AutoModel, i.e. no task-specific head is requested here)
model = AutoModel.from_pretrained(model_id)

In [3]:
# Three greetings that paraphrase one another ...
greetings = [
    "It's such a pleasure to see you, my friend!",
    "I'm so happy to see you, buddy!",
    "Great to see you again, pal!",
]
# ... and one sentence on a different topic, used as a contrast.
unrelated_sentence = "This includes setting up effective monitoring and alerting systems."

# Order matters: later cells index into this list (texts[0] is the reference sentence).
texts = greetings + [unrelated_sentence]

In [4]:
# Tokenize every sentence in a single batch.
#   padding=True        -> shorter sequences are padded so all rows share one length
#   return_tensors="pt" -> the result holds PyTorch tensors
encoded_input = tokenizer(texts, padding=True, return_tensors="pt")

In [9]:
# Inspect the batch: `input_ids` holds the token ids and `attention_mask` is 0
# at the padded positions of the shorter sentences (see the output below).
encoded_input

{'input_ids': tensor([[50281,  1147,   434,   824,   247, 11284,   281,   923,   368,    13,
           619,  3331,     2, 50282],
        [50281,    42,  1353,   594,  5211,   281,   923,   368,    13, 29517,
             2, 50282, 50283, 50283],
        [50281, 15611,   281,   923,   368,   969,    13,  5796,     2, 50282,
         50283, 50283, 50283, 50283],
        [50281,  1552,  3797,  4758,   598,  3576,  8667,   285, 10028,   272,
          2718,    15, 50282, 50283]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [5]:
# Pass the tokenized inputs through the model to obtain the embeddings.
# Gradient tracking is switched off because this is inference only; the **
# unpacking forwards the tokenizer's fields (input_ids, attention_mask) as
# keyword arguments.
with torch.no_grad():
    model_output = model(**encoded_input)

In [6]:
# Each sentence is represented by the hidden state at sequence position 0,
# which is where the [CLS] token sits.
final_hidden_states = model_output.last_hidden_state
cls_embeddings = final_hidden_states[:, 0]

In [7]:
# Sanity check: one embedding vector per input sentence (4 sentences here,
# 768 values each according to the recorded output).
cls_embeddings.shape

torch.Size([4, 768])

In [8]:
# Compare the first sentence with every other sentence using the cosine
# similarity of their [CLS] embeddings.
#
# A loop replaces three copy-pasted print statements, and the message is built
# with a single f-string. The original applied `%` to an f-string that already
# contained the interpolated texts, so a literal '%' inside any sentence would
# have been parsed as a format directive and broken or corrupted the output.
#
# Note: in the recorded run even the unrelated sentence scores above 0.9, so
# read these values relative to each other rather than as absolute scores.
anchor_embedding = cls_embeddings[0].unsqueeze(0)  # keep a batch dimension of 1
for other_idx in range(1, len(texts)):
    similarity = F.cosine_similarity(
        anchor_embedding, cls_embeddings[other_idx].unsqueeze(0)
    )
    print(f"'{texts[0]}'\n and '{texts[other_idx]}'\n similarity: {similarity}\n")

'It's such a pleasure to see you, my friend!'
 and 'I'm so happy to see you, buddy!'
 similarity: tensor([0.9883])

'It's such a pleasure to see you, my friend!'
 and 'Great to see you again, pal!'
 similarity: tensor([0.9861])

'It's such a pleasure to see you, my friend!'
 and 'This includes setting up effective monitoring and alerting systems.'
 similarity: tensor([0.9139])

