# # Install BEIR

In [None]:
pip install beir


Collecting beir
  Downloading beir-2.0.0.tar.gz (53 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/53.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers (from beir)
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Collecting pytrec_eval (from beir)
  Downloading pytrec_eval-0.5.tar.gz (15 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting faiss_cpu (from beir)
  Downloading faiss_cpu-1.8.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.7 kB)
Collecting elasticsearch==7.9.1 (from beir)
  Downloading elasticsearch-7.9.1-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting datasets (from beir)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from dat

# # Installing Required Libraries for NLP and Search-Based Tasks

In [None]:
pip install torch transformers sentence-transformers beir faiss-cpu datasets sklearn


Collecting sklearn
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


# # Downloading and Unzipping BEIR Dataset for Benchmarking

In [None]:
from beir import util

# Dataset to benchmark on. Choosing from nq, hotpotqa, fiqa
dataset_name = "nq"

# Local target directory and remote archive are both derived from the dataset name.
out_dir = f"./datasets/{dataset_name}"
url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset_name}.zip"

# Fetch the archive and extract it; the helper's return value is kept as data_path.
data_path = util.download_and_unzip(url, out_dir)


./datasets/nq/nq.zip:   0%|          | 0.00/475M [00:00<?, ?iB/s]

# # Tokenizing Query and Passage Using Pre-trained Tokenizer

In [None]:
from transformers import AutoTokenizer

# Example query / passage pair used throughout the notebook
query = "What is the capital of France?"
passage = "Paris is the capital and most populous city of France."

# Tokenizer matching the sentence-embedding model
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

# Encode the query first, then the passage, with identical settings:
# PyTorch tensors, truncated to at most 512 tokens.
query_tokens, passage_tokens = (
    tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    for text in (query, passage)
)

# Show both encodings, one per line
print(query_tokens, passage_tokens, sep="\n")


{'input_ids': tensor([[ 101, 2054, 2003, 1996, 3007, 1997, 2605, 1029,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}
{'input_ids': tensor([[  101,  3000,  2003,  1996,  3007,  1998,  2087, 20151,  2103,  1997,
          2605,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}




# # Generating Embeddings for Query and Passages Using SentenceTransformer

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Placeholder corpus. It is defined in this cell so the cell runs on a fresh
# kernel (Restart & Run All) instead of relying on `passages` having been
# assigned by a cell executed out of order.
passages = [
    'Passage 1 text here.',
    'Passage 2 text here.',
    'Passage 3 text here.'
]

# Bi-encoder used to embed both the query and the passages
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# `query` comes from the tokenization cell above
query_embedding = model.encode(query, convert_to_tensor=True)
passage_embeddings = model.encode(passages, convert_to_tensor=True)


In [None]:
# Sanity check: display the passages list to confirm it is populated.
print("Passages:", passages)


Passages: ['Passage 1 text here.', 'Passage 2 text here.', 'Passage 3 text here.']


# # FAISS-based Passage Retrieval Using Sentence Embeddings

In [None]:
import faiss
from sentence_transformers import SentenceTransformer


# Placeholder corpus to retrieve from
passages = [
    'Passage 1 text here.',
    'Passage 2 text here.',
    'Passage 3 text here.'
]

# Printing the passages to verify they are populated
print("Passages:", passages)

model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the passages
passage_embeddings = model.encode(passages, convert_to_tensor=True)


print("passage_embeddings shape: ", passage_embeddings.shape)

# FAISS expects 2-D (n_vectors, dim) arrays, so promote a single vector to a batch of one.
# `query_embedding` is produced by the embedding cell above.
if len(query_embedding.shape) == 1:
    query_embedding = query_embedding.unsqueeze(0)

if len(passage_embeddings.shape) == 1:
    passage_embeddings = passage_embeddings.unsqueeze(0)

# Validate inputs before touching the index, so a mismatch fails early and loudly
assert len(passage_embeddings) == len(passages), "Mismatch between passages and embeddings"
assert query_embedding.shape[1] == passage_embeddings.shape[1], "Query and passage embedding dimensions differ"

# Initializing FAISS index using the dimension of the vectors that are actually stored in it
index = faiss.IndexFlatL2(passage_embeddings.shape[1])

index.add(passage_embeddings.cpu().numpy())

# Perform FAISS search to retrieve top k passages.
# k is capped at the index size: when asked for more neighbours than it holds,
# FAISS pads the result with label -1, and passages[-1] would silently repeat
# the last passage.
k = min(10, index.ntotal)
_, retrieved_indices = index.search(query_embedding.cpu().numpy(), k)

# Drop any -1 padding labels defensively before mapping indices back to text
retrieved_passages = [passages[i] for i in retrieved_indices[0] if i != -1]

print("Retrieved Passages: ", retrieved_passages)


Passages: ['Passage 1 text here.', 'Passage 2 text here.', 'Passage 3 text here.']




passage_embeddings shape:  torch.Size([3, 384])
Retrieved Passages:  ['Passage 2 text here.', 'Passage 1 text here.', 'Passage 3 text here.', 'Passage 3 text here.', 'Passage 3 text here.', 'Passage 3 text here.', 'Passage 3 text here.', 'Passage 3 text here.', 'Passage 3 text here.', 'Passage 3 text here.']


# # Re-ranking Retrieved Passages Using Cross-Encoder

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Cross-encoder that scores each (query, passage) pair jointly
cross_encoder = AutoModelForSequenceClassification.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")
tokenizer = AutoTokenizer.from_pretrained("cross-encoder/ms-marco-MiniLM-L-12-v2")

# Pair the query with every retrieved passage; truncate so long passages cannot
# exceed the model's maximum input length.
inputs = tokenizer(
    [query] * len(retrieved_passages),
    retrieved_passages,
    return_tensors='pt',
    padding=True,
    truncation=True,
)

# Inference only: skip autograd bookkeeping
with torch.no_grad():
    scores = cross_encoder(**inputs).logits

# squeeze(-1) removes only the trailing logit dimension, so a single retrieved
# passage still yields a 1-D tensor (a bare squeeze() would collapse it to 0-d,
# which cannot be iterated).
sorted_indices = scores.squeeze(-1).argsort(descending=True)

# Highest-scoring passage first; convert to plain ints for list indexing
reranked_passages = [retrieved_passages[i] for i in sorted_indices.tolist()]


# # Calculating NDCG@10 for Query-Passage Ranking

In [None]:
from sklearn.metrics import ndcg_score

# Toy example for a single query with 4 candidate passages.
# Ground-truth graded relevance, one value per passage:
true_relevances = [3, 2, 1, 0]
# Scores predicted for the same passages, in the same order (similarity values):
predicted_relevances = [0.9, 0.8, 0.6, 0.3]

# ndcg_score takes one row per query, hence the extra list nesting.
# k=10 is larger than the 4 scores supplied here.
ndcg = ndcg_score(
    y_true=[true_relevances],
    y_score=[predicted_relevances],
    k=10,
)

print(f"NDCG@10: {ndcg}")


NDCG@10: 1.0
