Calculating Similarity

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# 1. Pick the checkpoint, then pull its encoder weights and matching tokenizer
#    from the Hugging Face Hub (NVIDIA material often uses 'e5-large' or 'nv-embed').
model_id = "intfloat/e5-large-v2"
model = AutoModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

def get_embedding(text):
    """Return L2-normalized sentence embedding(s) for ``text``.

    Tokenizes the input, runs the encoder without gradient tracking, averages
    the token vectors of each sequence, and scales every resulting vector to
    unit length so that a dot product between two outputs equals their cosine
    similarity.

    Args:
        text: A single string, or a list of strings to embed as one batch.

    Returns:
        A 2-D tensor with one unit-length row per input text.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)

    token_vectors = outputs.last_hidden_state
    # Mean pooling must skip padding: with padding=True, shorter texts in a
    # batch are filled with pad tokens whose vectors would otherwise be
    # averaged in and skew the embedding. For a single text the mask is all
    # ones, so this equals a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
    summed = (token_vectors * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = summed / token_counts

    # CRITICAL: Normalize vectors to length 1 so Dot Product == Cosine Similarity
    return F.normalize(embeddings, p=2, dim=1)

# 2. Define Sentences
# E5 checkpoints are trained with role prefixes: "query: " for search queries
# and "passage: " for the documents being searched. Omitting them degrades
# retrieval quality, so they are added explicitly here.
query = get_embedding("query: How do I reset my password?")
doc1  = get_embedding("passage: To recover your account, click 'Forgot Password' on the login page.")
doc2  = get_embedding("passage: NVIDIA H100 GPUs perform fp8 quantization.")

# 3. Calculate Similarity (Dot Product)
# Each embedding is a (1, hidden) unit vector, so query @ doc.T is a 1x1
# matrix holding the cosine similarity; .item() extracts it as a Python float.
score1 = torch.mm(query, doc1.T).item()
score2 = torch.mm(query, doc2.T).item()

# NOTE: E5 cosine scores are compressed into a narrow band (roughly 0.7-1.0),
# so judge relevance by the ranking/gap between scores, not by absolute value.
print(f"Similarity to Help Doc: {score1:.4f}")  # Higher (relevant)
print(f"Similarity to GPU Doc:  {score2:.4f}")  # Lower (unrelated)

1. The Open Standard (Sentence-Transformers)
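NVIDIA released NV-Embed-v2, which reached the top of the MTEB leaderboard at its release (rankings change over time, so check the current leaderboard). To use it correctly, queries must carry a task-specific instruction prefix, while documents are embedded as raw text without one. This "Instruction Tuning" is a key exam concept.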

Scenario: Local development or preprocessing on a single GPU.

In [None]:
from sentence_transformers import SentenceTransformer
import torch.nn.functional as F

# 1. Load NVIDIA's SOTA Embedding Model
# 'nvidia/NV-Embed-v2' is a massive 7B parameter embedding model.
# For lighter tests, use 'intfloat/e5-large-v2' (note: E5 uses "query: " /
# "passage: " prefixes instead of the instruction format shown below).
model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)

# The sequence-length limit is a property of the model object; it is not an
# argument of SentenceTransformer.encode(). NV-Embed supports up to 32768
# tokens; 4096 is used here to keep memory usage modest.
model.max_seq_length = 4096
# The NV-Embed-v2 model card's Sentence-Transformers recipe uses right padding.
model.tokenizer.padding_side = "right"

# 2. Define Inputs with "Instruction Prefixes" (Exam Critical)
# NV-Embed is instruction-tuned. You must tell it *what* the task is.
# Failure to add "Instruct: ... \nQuery: " degrades performance significantly.

task_instruction = "Given a question, retrieve passages that answer the question"
query_prefix = f"Instruct: {task_instruction}\nQuery: "

queries = [
    "How do I enable fp8 in TensorRT-LLM?",
]

documents = [
    # Documents usually do NOT need instructions, just the raw content
    "FP8 quantization is enabled via the --use_fp8_context_fmha flag in trtllm-build.",
    "TensorRT-LLM supports INT8 smoothquant for A100 GPUs.",
    "The weather in Bengaluru is 24 degrees.",
]

# 3. Generate Embeddings
# The model handles tokenization and pooling internally.
# Per the model card, each input is terminated with the tokenizer's EOS token,
# and the instruction is supplied through `prompt=` so it is applied to the
# queries only.
eos = model.tokenizer.eos_token
query_embeddings = model.encode(
    [q + eos for q in queries],
    prompt=query_prefix,
    normalize_embeddings=True,
)
doc_embeddings = model.encode(
    [d + eos for d in documents],
    normalize_embeddings=True,
)

# 4. Calculate Similarity (Dot Product)
# Since we normalized (normalize_embeddings=True), Dot Product == Cosine Similarity
# scores[i][j] is the similarity between query i and document j.
scores = model.similarity(query_embeddings, doc_embeddings)

print(f"Similarity Score (FP8 Doc): {scores[0][0]:.4f}") # High
print(f"Similarity Score (Weather): {scores[0][2]:.4f}") # Low

2. The Enterprise Standard (NVIDIA NeMo Retriever / NIM)
In the exam, if the question mentions "deploying a scalable retrieval microservice" or "using NVIDIA AI Enterprise," you do not use the code above. You use NVIDIA NIM (NVIDIA Inference Microservices), which packages NeMo Retriever embedding models as ready-to-deploy containers.

The architecture changes: You don't load weights. You hit an API.

Scenario: Production RAG pipeline connecting to a Vector DB.

In [None]:
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings

import os

# 1. Initialize the Client
# In production, this points to your self-hosted Triton/NeMo container
# URL would be: "http://localhost:8000/v1"
# The API key is read from the NVIDIA_API_KEY environment variable so that no
# secret is ever written into the notebook; a missing variable fails fast
# with a KeyError instead of an authentication error on the first request.
embedder = NVIDIAEmbeddings(
    model="nvidia/nv-embed-v1",
    base_url="https://integrate.api.nvidia.com/v1",  # or your local NIM IP
    nvidia_api_key=os.environ["NVIDIA_API_KEY"],
)

# 2. Generate Embeddings (Optimized)
# The NIM container handles batching, TensorRT optimization, and hardware scaling automatically.

# A. Embed Query (Automatically adds the 'query' instruction prefix)
query_vec = embedder.embed_query("How to optimize Llama 3 for inference?")

# B. Embed Documents (Automatically uses the 'passage' instruction)
doc_vecs = embedder.embed_documents([
    "Use TensorRT-LLM with in-flight batching.",
    "Use quantization to reduce memory footprint.",
])

# 3. Output
# query_vec is a plain list of floats, so its length is the embedding size.
print(f"Embedding Dimension: {len(query_vec)}")
# Output: 4096 (Standard for NV-Embed)