<a href="https://colab.research.google.com/github/CogNetSys/ModernBERT/blob/main/QuickStart_Tutorial_04.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required dependencies
!pip install torch transformers numpy sklearn

In [None]:
!pip install --upgrade transformers

In [None]:
# Import necessary libraries
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Checkpoint identifier handed to `from_pretrained` below
model_name = "lightonai/modernbert-embed-large"

# Prefer a CUDA GPU when one is present; otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the tokenizer and the model weights for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Place the model on the chosen device (as the last expression, this also
# echoes the model as the cell output)
model.to(device)


In [None]:
# Example sentences to embed; the first one is the reference that the
# remaining sentences are compared against further down.
sentences = [
    "ModernBERT is a powerful language model.",
    "This is an example sentence.",
    "Embeddings can capture semantic meaning.",
    "ModernBERT can handle long text sequences efficiently.",
    "Another example sentence for demonstration.",
]


In [None]:
# modernbert-embed-large is trained with task prefixes ("search_query: " /
# "search_document: "); feeding raw text without one degrades embedding quality.
# All sentences are treated as documents here so they share one embedding space.
DOCUMENT_PREFIX = "search_document: "
prefixed_sentences = [DOCUMENT_PREFIX + sentence for sentence in sentences]

# Tokenize as one padded batch of PyTorch tensors, truncating anything longer
# than 512 tokens.
encoded = tokenizer(
    prefixed_sentences,
    padding=True,
    truncation=True,
    return_tensors="pt",
    max_length=512,
)

# Move every input tensor to the same device as the model (GPU if available,
# otherwise CPU)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}


In [None]:
with torch.no_grad():  # Disable gradient calculation for efficiency
    outputs = model(**inputs)

# modernbert-embed-large is trained with mean pooling, so a sentence embedding is
# the average of its token embeddings, not the first ([CLS]) token's vector.
# The attention mask zeroes out padding positions so they do not dilute the mean.
token_embeddings = outputs.last_hidden_state  # (batch, seq_len, hidden)
padding_mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
token_counts = padding_mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
sentence_embeddings = (token_embeddings * padding_mask).sum(dim=1) / token_counts

print(f"Shape of sentence embeddings: {sentence_embeddings.shape}")


In [None]:
# Compare the first sentence's embedding against every other sentence.
# Both operands are moved to the CPU before being handed to scikit-learn.
reference_embedding = sentence_embeddings[0].cpu().reshape(1, -1)  # 2D: a single row
other_embeddings = sentence_embeddings[1:].cpu()
similarity_scores = cosine_similarity(reference_embedding, other_embeddings)

# Report one score per remaining sentence; numbering starts at 2 because
# sentence 1 is the reference itself.
print("Similarity scores with the first sentence:")
for sentence_number, score in enumerate(similarity_scores[0], start=2):
    print(f"  Sentence {sentence_number}: {score:.4f}")
