In [1]:
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# ✅ Full Corrected Hybrid RAG Pipeline in Google Colab (Using BGEM3FlagModel)

# Step 0: Install Everything
!apt-get install -y poppler-utils
!pip install pymupdf
!pip install langchain langchain-community faiss-cpu transformers sentence-transformers
!pip install -U FlagEmbedding

# Step 1: Import Libraries
import faiss
import torch
from FlagEmbedding import BGEM3FlagModel
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import CrossEncoder
import numpy as np
import pandas as pd

# Step 2: Load PDF using PyMuPDFLoader
pdf_path = '/content/drive/My Drive/ProstateCancer_2025.pdf'  # <-- upload your PDF to Colab
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

print(f"Loaded {len(documents)} pages from PDF")

# Step 3: Smart Split and Keep Page Number
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
all_chunks = []

for doc in documents:
    page_number = doc.metadata.get('page_number', None)
    splits = splitter.split_text(doc.page_content)
    for split in splits:
        all_chunks.append({
            'text': split,
            'page_number': page_number
        })

print(f"Total {len(all_chunks)} smart chunks created.")

# Step 4: Embed Chunks (Dense + Sparse) with Correct Model
embed_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

texts_for_embedding = [chunk['text'] for chunk in all_chunks]

outputs = embed_model.encode(
    texts_for_embedding,
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=False
)

dense_embeds = outputs["dense_vecs"]         # (n_chunks, 1024)

sparse_embeds = outputs["lexical_weights"]

# Step 5: Build FAISS Index for Dense
embedding_dim = dense_embeds.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
faiss_index.add(dense_embeds)

# Save sparse vectors separately
sparse_index = sparse_embeds

# Step 6: Load Cross-Encoder Reranker
reranker = CrossEncoder('BAAI/bge-reranker-large')




Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 34 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.7 [186 kB]
Fetched 186 kB in 1s (137 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 126101 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.7_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.7) ...
Setting up poppler-utils (22.02.0-2ubuntu0.7) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinu

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

bm25.jpg:   0%|          | 0.00/132k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/15.8k [00:00<?, ?B/s]

colbert_linear.pt:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

.DS_Store:   0%|          | 0.00/6.15k [00:00<?, ?B/s]

mkqa.jpg:   0%|          | 0.00/608k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/485k [00:00<?, ?B/s]

others.webp:   0%|          | 0.00/21.0k [00:00<?, ?B/s]

long.jpg:   0%|          | 0.00/127k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

nqa.jpg:   0%|          | 0.00/158k [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/725k [00:00<?, ?B/s]

model.onnx_data:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

Constant_7_attr__value:   0%|          | 0.00/65.6k [00:00<?, ?B/s]

miracl.jpg:   0%|          | 0.00/576k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/698 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.27G [00:00<?, ?B/s]

sparse_linear.pt:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

pre tokenize: 100%|██████████| 8/8 [00:00<00:00, 62.22it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 8/8 [00:04<00:00,  1.71it/s]


config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [4]:
# Step 7: Load DeepSeek LLM

model_name = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True
)

# Step 8: Define Query and Embed
user_query = "What are the treatment for prostate cancer?"

query_outputs = embed_model.encode(
    [user_query],
    return_dense=True,
    return_sparse=True,
    return_colbert_vecs=False
)

query_dense = query_outputs["dense_vecs"]
query_sparse = query_outputs["lexical_weights"][0]  # Only one query

# Step 9: Dense Retrieval
D, I = faiss_index.search(query_dense, k=10)  # top-10 by dense

# Step 10: Sparse Retrieval (Simple Dot Product)
def sparse_dot(a, b):
    score = 0.0
    for token_id, weight in a.items():
        if token_id in b:
            score += weight * b[token_id]
    return score

sparse_scores = []
for idx in range(len(sparse_index)):
    score = sparse_dot(query_sparse, sparse_index[idx])
    sparse_scores.append(score)

# Step 11: Combine Dense + Sparse
hybrid_scores = []

for idx in I[0]:
    dense_score = -D[0][list(I[0]).index(idx)]  # FAISS returns distance (L2), so invert it
    sparse_score = sparse_scores[idx]
    hybrid_score = 0.5 * dense_score + 0.5 * sparse_score  # simple average
    hybrid_scores.append((idx, hybrid_score))

# Sort by hybrid score
hybrid_sorted = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)

final_top_chunks = [all_chunks[idx] for idx, _ in hybrid_sorted[:3]]  # Pick top 3

# Step 12: Rerank with Cross-Encoder
pairs = [(user_query, chunk['text']) for chunk in final_top_chunks]
scores = reranker.predict(pairs)
sorted_indices = np.argsort(scores)[::-1]
final_reranked_chunks = [final_top_chunks[i] for i in sorted_indices]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 8 files:   0%|          | 0/8 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not in

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [5]:
# Step 13: Build Evaluation Inputs
# You need queries dict, corpus dict, relevant_docs dict

# For simplicity, let's define a synthetic query set for testing
# test_queries = [
#     "What are the five clinical/pathologic features required to classify a prostate cancer patient as very low risk?",
#     "Which imaging modalities are considered acceptable alternatives to conventional bone scans for detecting micrometastatic disease?",
#     "What is the standard PSA threshold used to define biochemical recurrence after EBRT?",
#     "Which therapies are recommended for mCRPC patients with pathogenic BRCA1 or BRCA2 mutation?",
#     "What updates were made regarding the use of Lu-177–PSMA-617?",
#     "When is active surveillance not recommended for favorable intermediate-risk prostate cancer?",
#     "Summarize the criteria that distinguish high-risk from very-high-risk prostate cancer.",
#     "In patients with PSA persistence or recurrence after radical prostatectomy, how is management stratified?"
# ]

query_id_mapping = {}     # qid -> query text
relevant_docs = {}        # qid -> set of reranked top document ids
# Step 1: Retrieve and store texts
retrieved_texts_per_query = {}


# Load ground truth cleaned pairs
ground_truth_df = pd.read_csv("/content/drive/My Drive/batch_outputs/cleaned_ground_truth_pairs.csv")

# Prepare test queries
test_queries = ground_truth_df['question'].tolist()
ground_truth_answers = ground_truth_df['answers'].tolist()


for idx, user_query in enumerate(test_queries):
    qid = f"q{idx+1}"
    query_id_mapping[qid] = user_query

    # Embed query
    query_outputs = embed_model.encode(
        [user_query],
        return_dense=True,
        return_sparse=True,
        return_colbert_vecs=False
    )

    query_dense = query_outputs["dense_vecs"]
    query_sparse = query_outputs["lexical_weights"][0]  # only one query

    # Dense retrieval
    D, I = faiss_index.search(query_dense, k=20)  # Retrieve (20) before rerank

    # Sparse scoring
    sparse_scores = []
    for idx2 in range(len(sparse_index)):
        score = sparse_dot(query_sparse, sparse_index[idx2])
        sparse_scores.append(score)

    # Hybrid scoring
    hybrid_scores = []
    for idx2 in I[0]:
        dense_score = -D[0][list(I[0]).index(idx2)]  # FAISS returns L2 distance
        sparse_score = sparse_scores[idx2]
        hybrid_score = 0.5 * dense_score + 0.5 * sparse_score
        hybrid_scores.append((idx2, hybrid_score))

    # Sort by hybrid
    hybrid_sorted = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)

    # Prepare pairs for reranker
    rerank_candidates = [all_chunks[idx2]['text'] for idx2, _ in hybrid_sorted]

    pairs = [(user_query, passage) for passage in rerank_candidates]
    rerank_scores = reranker.predict(pairs)
    reranked = sorted(zip(hybrid_sorted, rerank_scores), key=lambda x: x[1], reverse=True)

    # After reranking
    top_k = 3
    final_top_reranked = reranked[:top_k]  # list of (hybrid_score, rerank_score)

    # Extract both corpus IDs and actual texts
    final_top_indices = [str(idx2_score[0][0]) for idx2_score in final_top_reranked]

    final_top_texts = [all_chunks[idx2_score[0][0]]['text'] for idx2_score in final_top_reranked]
    retrieved_texts_per_query[qid] = final_top_texts

    # Save
    relevant_docs[qid] = set(final_top_indices)  # <-- for ID-matching evaluation



In [6]:
#from here we are using query-passage matching pairs
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

similarity_model = SentenceTransformer('all-MiniLM-L6-v2')

def match_rank(retrieved_texts, ground_truth_text, threshold=0.7):
    embeddings = similarity_model.encode([ground_truth_text] + retrieved_texts)
    ground_emb = embeddings[0]
    retrieved_embs = embeddings[1:]
    sims = cosine_similarity([ground_emb], retrieved_embs)[0]

    for idx, sim in enumerate(sims):
        if sim >= threshold:
            return idx + 1   # 1-based indexing
    return None


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# top_k = 5  # final top-k reranked passages to consider
# hybrid_top_n = 20  # how many to hybrid retrieve before rerank
# threshold = 0.7  # cosine similarity threshold
gold_query_passage_pairs = [
    {
        "query": row["question"],
        "answer_excerpt": row["answers"]
    }
    for _, row in ground_truth_df.iterrows()
]

# Step 2: Evaluate using saved retrieval
recall_at_k = []
precision_at_k = []
reciprocal_ranks = []

for idx, pair in enumerate(gold_query_passage_pairs):
    user_query = pair["query"]
    true_answer_excerpt = pair["answer_excerpt"]

    final_retrieved_texts = retrieved_texts_per_query[f"q{idx+1}"]  # or match qid

    rank = match_rank(final_retrieved_texts, true_answer_excerpt)

    if rank is not None:
        recall_at_k.append(1)
        reciprocal_ranks.append(1.0 / rank)
        precision_at_k.append(1 / rank)
    else:
        recall_at_k.append(0)
        reciprocal_ranks.append(0.0)
        precision_at_k.append(0.0)






In [11]:
mean_recall_at_k = np.mean(recall_at_k)
mean_precision_at_k = np.mean(precision_at_k)
mean_mrr_at_k = np.mean(reciprocal_ranks)

print(f"Recall@{top_k}: {mean_recall_at_k:.4f}")
print(f"Precision@{top_k}: {mean_precision_at_k:.4f}")
print(f"MRR@{top_k}: {mean_mrr_at_k:.4f}")


Recall@3: 0.5000
Precision@3: 0.4821
MRR@3: 0.4821


In [12]:
from sklearn.metrics import ndcg_score

all_y_true = []
all_y_scores = []

for idx, pair in enumerate(gold_query_passage_pairs):
    user_query = pair["query"]
    true_answer_excerpt = pair["answer_excerpt"]

    final_retrieved_texts = retrieved_texts_per_query[f"q{idx+1}"]

    ground_emb = similarity_model.encode(true_answer_excerpt)
    retrieved_embs = similarity_model.encode(final_retrieved_texts)

    sims = cosine_similarity([ground_emb], retrieved_embs)[0]
    threshold=0.7
    # Mark as 1 if sim >= threshold, else 0 (binary relevance)
    relevance = [1 if sim >= threshold else 0 for sim in sims]
    all_y_true.append(relevance)

    # Use similarity scores directly as ranking scores
    all_y_scores.append(list(sims))

# Now compute NDCG@k
k = 5  # or whatever top-k you want
mean_ndcg_at_k = ndcg_score(all_y_true, all_y_scores, k=k)

print(f"NDCG@{k}: {mean_ndcg_at_k:.4f}")


NDCG@5: 0.5000
