In [3]:
import numpy as np
import json
import torch

In [24]:
from sklearn.metrics import ndcg_score

In [12]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.1


In [13]:
from sentence_transformers import SentenceTransformer, util

  from tqdm.autonotebook import tqdm, trange


In [4]:
def dcg_at_k(relevance_scores, k):
    relevance_scores = np.array(relevance_scores)[:k]
    return np.sum(relevance_scores / np.log2(np.arange(2, relevance_scores.size + 2)))


In [5]:
def ndcg_at_k(relevance_scores, k):
    actual_dcg = dcg_at_k(relevance_scores, k)

    ideal_relevance_scores = sorted(relevance_scores, reverse=True)
    ideal_dcg = dcg_at_k(ideal_relevance_scores, k)

    return actual_dcg / ideal_dcg if ideal_dcg > 0 else 0

In [14]:
small_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
large_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [15]:
with open("/content/drive/MyDrive/qrels.json", "r") as f:
    qrels = json.load(f)

with open("/content/drive/MyDrive/corpus.json", "r") as f:
    corpus = json.load(f)

with open("/content/drive/MyDrive/queries.json", "r") as f:
    queries = json.load(f)

In [16]:
large_corpus_embeddings = torch.load('/content/drive/MyDrive/large_corpus_embeddings.pt', map_location=torch.device('cpu'))
small_corpus_embeddings = torch.load('/content/drive/MyDrive/small_corpus_embeddings.pt', map_location=torch.device('cpu'))

  large_corpus_embeddings = torch.load('/content/drive/MyDrive/large_corpus_embeddings.pt', map_location=torch.device('cpu'))
  small_corpus_embeddings = torch.load('/content/drive/MyDrive/small_corpus_embeddings.pt', map_location=torch.device('cpu'))


In [23]:
def retrieve_candidates(query, model, corpus_embeddings, top_k=10):
    query_embedding = model.encode(query, convert_to_tensor=True)
    scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)
    top_results = torch.topk(scores, k=top_k)
    return top_results

In [21]:
def evaluate_ndcg_at_k(top_k=10):
    ndcg_scores = []

    corpus_ids = list(corpus.keys())

    # Loop over all queries
    for query_id, query_text in queries.items():
        # Retrieve top-k candidates using the small model (you can switch to large model)
        top_results = retrieve_candidates(query_text, small_model, small_corpus_embeddings, top_k=top_k)

        # Extract retrieved document ids and their scores
        retrieved_doc_ids = [corpus_ids[idx.item()] for idx in top_results[1][0]]
        retrieved_scores = [score.item() for score in top_results[0][0]]

        # Get ground truth relevance scores for the query
        true_relevance_scores = np.zeros(len(corpus))  # Initialize with zeros for all docs
        if query_id in qrels:
            for doc_id, relevance in qrels[query_id].items():
                if doc_id in corpus_ids:
                    true_relevance_scores[corpus_ids.index(doc_id)] = relevance

        # Reorder the true relevance scores based on the retrieved docs
        true_relevance_for_top_k = [true_relevance_scores[corpus_ids.index(doc_id)] for doc_id in retrieved_doc_ids]

        # Calculate NDCG@k for the query
        ndcg = ndcg_score([true_relevance_for_top_k], [retrieved_scores], k=top_k)
        ndcg_scores.append(ndcg)

    # Return the mean NDCG@k
    return np.mean(ndcg_scores)

In [25]:
ndcg_at_10 = evaluate_ndcg_at_k(top_k=10)

In [26]:
print(f"Mean NDCG@10: {ndcg_at_10}")

Mean NDCG@10: 0.4853570836940615
