In [1]:
from transformers import AutoModelForMaskedLM, AutoTokenizer

model_id = 'naver/splade-cocondenser-ensembledistil'

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForMaskedLM.from_pretrained(model_id)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
text = (
    "Orangutans are native to the rainforests of "
    "Indonesia and Malaysia"
)

tokens = tokenizer(text, return_tensors='pt')
output = model(**tokens)
output

MaskedLMOutput(loss=None, logits=tensor([[[ -5.9553,  -8.3626,  -7.5742,  ...,  -7.9341,  -7.5938,  -5.1898],
         [ -9.6737, -10.0490,  -9.3391,  ..., -10.1951,  -9.5888,  -7.9000],
         [ -8.2968,  -9.1556,  -8.2258,  ...,  -9.0471,  -8.1802,  -6.8503],
         ...,
         [ -6.2040,  -8.6663,  -8.0194,  ...,  -8.1207,  -7.8005,  -5.0678],
         [ -7.2947,  -9.0419,  -8.0300,  ...,  -8.5999,  -8.2968,  -6.4665],
         [-20.3195, -16.7864, -16.4392,  ..., -16.2584, -15.5940, -17.5674]]],
       grad_fn=<ViewBackward0>), hidden_states=None, attentions=None)

In [5]:
output.logits.shape  # Check the shape of the output logits

torch.Size([1, 16, 30522])

In [6]:
import torch

vec = torch.max(
    torch.log(
        1 + torch.relu(output.logits)
    ) * tokens.attention_mask.unsqueeze(-1),
dim=1)[0].squeeze()

vec.shape

torch.Size([30522])

In [10]:
indices = vec.nonzero().squeeze().cpu().tolist() 
weights = vec[indices].cpu().tolist()

sparse_vector = dict(zip(indices, weights))
sparse_vector

{2013: 0.160134419798851,
 2024: 0.3913775384426117,
 2027: 0.4508352279663086,
 2030: 1.2656912803649902,
 2078: 0.4573279917240143,
 2141: 0.0972718670964241,
 2179: 0.25103187561035156,
 2314: 0.14648033678531647,
 2427: 1.0769829750061035,
 2444: 0.666903018951416,
 2479: 0.18931779265403748,
 2555: 0.07044675201177597,
 2634: 0.3164989650249481,
 2660: 0.11532483249902725,
 2688: 0.26555970311164856,
 2859: 0.40288352966308594,
 2900: 0.12337708473205566,
 3088: 0.6084758639335632,
 3128: 1.4293752908706665,
 3224: 1.1159083843231201,
 3290: 0.01179188396781683,
 3295: 0.6689419150352478,
 3392: 0.2691112160682678,
 3562: 0.29504308104515076,
 3899: 0.016659900546073914,
 4021: 0.42494305968284607,
 4111: 0.2776368260383606,
 4290: 0.10944975167512894,
 4743: 0.31513169407844543,
 4761: 0.8566420674324036,
 5202: 0.16360189020633698,
 5430: 0.2713300883769989,
 5532: 0.14111532270908356,
 5654: 1.554614543914795,
 5917: 0.5991864204406738,
 6027: 1.4550087451934814,
 6239: 1.37072

In [12]:
# Let's get human readable plaintext tokens
idx_to_token = {
    idx: token for token, idx in tokenizer.get_vocab().items()
}

sparse_dict_tokens = {
    idx_to_token[idx]: round(weight, 2) for idx, weight in sparse_vector.items()
}

# Sort the sparse vector by weights to see most relevant tokens
sparse_dict_tokens = dict(
    sorted(sparse_dict_tokens.items(), key=lambda item: item[1], reverse=True)
)
sparse_dict_tokens

{'##uta': 1.65,
 '##ang': 1.55,
 'malaysia': 1.46,
 'native': 1.43,
 'indonesia': 1.37,
 'rainforest': 1.34,
 'or': 1.27,
 'habitat': 1.24,
 'forest': 1.12,
 'species': 1.08,
 'indonesian': 0.9,
 'malaysian': 0.87,
 'origin': 0.86,
 'live': 0.67,
 'location': 0.67,
 'africa': 0.61,
 'tribe': 0.6,
 'elephant': 0.55,
 'indigenous': 0.49,
 'breed': 0.47,
 '##n': 0.46,
 'they': 0.45,
 'asia': 0.42,
 'china': 0.4,
 'are': 0.39,
 'tiger': 0.39,
 'monkey': 0.38,
 'india': 0.32,
 'bird': 0.32,
 'belong': 0.32,
 'genus': 0.3,
 'nationality': 0.29,
 'subspecies': 0.29,
 'animal': 0.28,
 'geography': 0.28,
 'museum': 0.27,
 'tree': 0.27,
 'cave': 0.27,
 'borneo': 0.27,
 'zoo': 0.26,
 'found': 0.25,
 'thailand': 0.24,
 'snake': 0.24,
 'bali': 0.22,
 'island': 0.19,
 'from': 0.16,
 'dragon': 0.16,
 'fauna': 0.16,
 'river': 0.15,
 'volcano': 0.15,
 'desert': 0.14,
 'darwin': 0.13,
 'australia': 0.12,
 'japan': 0.12,
 'frog': 0.12,
 'kong': 0.11,
 'born': 0.1,
 'madagascar': 0.09,
 'continent': 0.08,

## Comparing vectors

In [13]:
texts = [
   "Programmed cell death (PCD) is the regulated death of cells within an organism",
   "How is the scheduled death of cells within a living thing regulated?",
   "Photosynthesis is the process of storing light energy as chemical energy in cells"
]

In [14]:
tokens = tokenizer(
    texts, return_tensors='pt',
    padding=True, truncation=True
)
output = model(**tokens)
# aggregate the token-level vecs and transform to sparse
vecs = torch.max(
    torch.log(1 + torch.relu(output.logits)) * tokens.attention_mask.unsqueeze(-1), dim=1
)[0].squeeze().detach().cpu().numpy()
vecs.shape

(3, 30522)

In [20]:
import numpy as np
# Find the cosine similarity between the vectors

sim = np.zeros((vecs.shape[0], vecs.shape[0]))

for i, vec in enumerate(vecs):
    sim[i, :] = np.dot(vec, vecs.T) / (
        np.linalg.norm(vec) * np.linalg.norm(vecs, axis=1)
    )   

In [21]:
sim

array([[0.99999988, 0.54609436, 0.20535855],
       [0.54609436, 1.        , 0.2041188 ],
       [0.20535856, 0.2041188 , 1.        ]])

In [None]:
{
    "qid": "query_id",
    "query": "query text",
    'passages':[{"docid": "document_id", "text": "Document score", "score": "teacher or reranker score"}, ...],
}

# Lets get dimensions of the encoded text

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
sentences = ['search_query: Who is Laurens van Der Maaten?']
embeddings = model.encode(sentences)
print(embeddings.shape)

<All keys matched successfully>


(1, 768)


In [3]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/nomic-embed-text-v1-unsupervised", trust_remote_code=True)
sentences = ['search_query: Who is Laurens van Der Maaten?']
embeddings = model.encode(sentences)
print(embeddings.shape)

<All keys matched successfully>


(1, 768)


In [4]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/modernbert-embed-base", trust_remote_code=True)
sentences = ['search_query: Who is Laurens van Der Maaten?']
embeddings = model.encode(sentences)
print(embeddings.shape)

(1, 768)


In [6]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("nomic-ai/modernbert-embed-base-unsupervised", trust_remote_code=True)
sentences = ['search_query: Who is Laurens van Der Maaten?']
embeddings = model.encode(sentences)
print(embeddings.shape)

No sentence-transformers model found with name nomic-ai/modernbert-embed-base-unsupervised. Creating a new one with mean pooling.


(1, 768)
