In [None]:
import numpy as np
import pickle

## Example Dataset: PubMedQA
PubMedQA is used here as an example dataset of medical information.
- Each context passage is short enough to be indexed as-is, so no chunking is needed.

In [31]:
from datasets import load_dataset

In [3]:
# Load the "pqa_labeled" configuration of PubMedQA and keep its "train" split.
ds_human = load_dataset("qiaojin/PubMedQA", "pqa_labeled", split="train")

In [5]:
# Flatten the per-question passage lists into one retrieval corpus: every
# entry of item['contexts'] becomes its own document.
contexts_human = []
for item in ds_human['context']:
    contexts_human.extend(item['contexts'])
print(len(contexts_human))

3358


## BM25

In [6]:
from rank_bm25 import BM25Okapi
# Whitespace tokenisation only: tokens keep their case and any attached
# punctuation, so queries have to be split the same way before scoring.
tokenized_corpus_human = [passage.split(" ") for passage in contexts_human]
bm25 = BM25Okapi(tokenized_corpus_human)

In [15]:
# Score every passage against the first labeled question with BM25 and show
# the five best matches.
query = ds_human['question'][0]
tokenized_query = query.split(" ")  # same whitespace split as the corpus
doc_scores = bm25.get_scores(tokenized_query)

# Rank from the scores computed above instead of calling bm25.get_top_n,
# which would score the whole corpus a second time.
top_5_indices = np.argsort(doc_scores)[::-1][:5]
results = [contexts_human[doc_id] for doc_id in top_5_indices]

print("Query:", query)
print("Top 5 results:")
for rank, doc in enumerate(results, start=1):
    print(f"{rank}: {doc}")



Query: Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
Top 5 results:
1: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.
2: The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and ce

## Sentence Encoder

In [8]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss

### Bi-Encoder

In [10]:
# Bi-encoder used to embed passages and queries. No device is hard-coded:
# SentenceTransformer picks an available accelerator itself and otherwise
# uses the CPU, so the cell also runs on machines without the "mps" backend.
model = SentenceTransformer("google/embeddinggemma-300m")

In [None]:
# Embed every passage once and store the vectors in a flat L2 FAISS index.
embedding_human = model.encode(contexts_human)
dimension = embedding_human.shape[-1]
index = faiss.IndexFlatL2(dimension)
index.add(np.asarray(embedding_human, dtype='float32'))
# Persist the index to disk.
faiss.write_index(index, "faiss_pubmedqa_labeled.index")

In [12]:
# Embed the first labeled question and cast it to float32 for the FAISS search.
# NOTE(review): `embedding_duery` is a typo for "query"; the name is kept
# because other cells read it.
query = ds_human[0]['question']
embedding_duery = model.encode(query).astype(np.float32)

In [13]:
# Nearest-neighbour lookup for the single query vector. The search takes a
# 2-D batch of queries, so a leading batch axis is added.
k = 5
distances, indices = index.search(embedding_duery[np.newaxis, :], k)

In [14]:
# Print each retrieved passage together with its distance to the query.
print("Query: ", query)
for rank, (doc_id, dist) in enumerate(zip(indices[0], distances[0]), start=1):
    print(f"Result {rank}: {contexts_human[doc_id]} (Distance: {dist})")

Query:  Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
Result 1: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. (Distance: 0.7628254890441895)
Result 2: The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in ear

### Cross-Encoder

In [None]:
# Cross-encoder used for reranking. No device is hard-coded, so the library
# can choose an available one instead of requiring the "mps" backend.
# NOTE(review): this rebinds `model`, which held the bi-encoder until now;
# re-run the bi-encoder cell before embedding anything again.
model = CrossEncoder("BSC-NLP4BIA/Medprocner-CE-Reranker")

In [21]:
# Score (query, passage) pairs with the cross-encoder.
# Only the passages returned by the FAISS search are scored, to save time.
query = ds_human[0]['question']

pairs = []
for doc_id in indices[0]:
    pairs.append([query, contexts_human[doc_id]])
scores = model.predict(pairs)

In [23]:
# Show the candidates ordered by cross-encoder score, best first.
top_k = 5
top_k_indices = np.argsort(scores)[-top_k:][::-1]
print("Top 5 Reranked Results:")
# `position` indexes into `scores` and `indices[0]`. It is deliberately not
# called `index`: that name is bound to the FAISS index, and rebinding it to an
# integer would break re-running the search cell.
for rank, position in enumerate(top_k_indices):
    doc_index = indices[0][position]
    print(f"Rank {rank+1}: {contexts_human[doc_index]} (Score: {scores[position]})")

Top 5 Reranked Results:
Rank 1: Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants. (Score: 0.9535653591156006)
Rank 2: The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD), and cells in late stages of PCD (LPCD). Window stage leav

## Hybrid Search

In [1]:
from datasets import load_dataset
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer, CrossEncoder
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the "pqa_artificial" configuration and flatten its per-question
# passage lists into one corpus.
ds = load_dataset("qiaojin/PubMedQA", "pqa_artificial", split="train")
contexts = []
for item in ds['context']:
    contexts.extend(item['contexts'])

In [6]:
# BM25 index over the whitespace-tokenised corpus (case and punctuation are
# kept as-is).
tokenized_corpus = [passage.split(" ") for passage in contexts]
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Dense embeddings for the whole corpus, cached on disk so the corpus only has
# to be encoded once.
# No device is hard-coded, so the library can choose an available one instead
# of requiring CUDA.
model = SentenceTransformer("google/embeddinggemma-300m")
embedding_cache_path = "embedding_pubmedqa_artificial.pkl"
try:
    # SECURITY: pickle.load can execute arbitrary code. Only load cache files
    # written by this notebook.
    with open(embedding_cache_path, "rb") as f:
        embedding = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # Missing or unreadable cache: encode once and store the result. Other
    # errors (interrupts, programming mistakes) are left to surface.
    embedding = model.encode(contexts)
    with open(embedding_cache_path, "wb") as f:
        pickle.dump(embedding, f)

In [15]:
# Use the first question of the dataset as the example query.
query = ds[0]['question']
print("Query:", query)

Query: Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?


In [None]:
# Stage 1 - lexical retrieval: keep the top_n passages with the highest BM25
# score, best first.
tokenized_query = query.split(" ")
doc_scores = bm25.get_scores(tokenized_query)
top_n = 1000
top_n_indices_bm25 = np.argsort(doc_scores)[::-1][:top_n]

In [None]:
# Stage 2 - dense retrieval restricted to the BM25 candidates.
embedding_duery = model.encode(query).astype(np.float32)
# The index is built from the BM25 candidates only, so the positions returned
# by the search refer to top_n_indices_bm25, not directly to `contexts`.
dimension = embedding.shape[-1]
faiss_index = faiss.IndexFlatL2(dimension)
faiss_index.add(np.asarray(embedding[top_n_indices_bm25], dtype='float32'))
k = 50

distances, indices = faiss_index.search(embedding_duery[np.newaxis, :], k)

In [16]:
# Cross-encoder for the final reranking stage. No device is hard-coded, so the
# library can choose an available one instead of requiring CUDA.
cross_model = CrossEncoder("BSC-NLP4BIA/Medprocner-CE-Reranker")

In [None]:
# Stage 3 - rerank the FAISS hits with the cross-encoder and print the best
# top_k of them.
top_k = 5

# The FAISS positions refer to the BM25 candidate list, so they are mapped back
# through top_n_indices_bm25. Negative positions (FAISS's placeholder when it
# returns fewer hits than requested) are skipped, because indexing with -1
# would silently pick the last BM25 candidate.
docs = [contexts[top_n_indices_bm25[pos]] for pos in indices[0] if pos >= 0]
pairs = [[query, doc] for doc in docs]
scores = cross_model.predict(pairs)
top_k_indices = np.argsort(scores)[-top_k:][::-1]
print("Top 5 Reranked Results:")
for rank, doc_pos in enumerate(top_k_indices):
    print(f"Rank {rank+1}: {docs[doc_pos]} (Score: {scores[doc_pos]})")