In [None]:
# Mount Google Drive at /content/drive/ so later cells can read files from it
# (the PDF path used further down lives under this mount point).
from google.colab import drive

drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# ✅ Full Corrected Hybrid RAG Pipeline in Google Colab (Using BGEM3FlagModel)

# Step 0: Install Everything
!apt-get install -y poppler-utils
!pip install pymupdf
!pip install langchain langchain-community faiss-cpu transformers sentence-transformers
!pip install -U FlagEmbedding

# Step 1: Import Libraries
import faiss
import torch
from FlagEmbedding import BGEM3FlagModel
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import CrossEncoder
import numpy as np

# Step 2: Load PDF using PyMuPDFLoader
# The path points into the Google Drive mount, so Drive must already be mounted
# and the PDF must exist at this location before this step runs.
pdf_path = '/content/drive/My Drive/ProstateCancer_2025.pdf'  # <-- location of your PDF on the mounted Drive
loader = PyMuPDFLoader(pdf_path)
documents = loader.load()

# Each loaded document is treated as one PDF page by the rest of the pipeline.
print(f"Loaded {len(documents)} pages from PDF")

# Step 3: Smart Split and Keep Page Number
# Every page is split into ~500-character chunks with a 50-character overlap.
# Each chunk is stored as {'text': ..., 'page_number': ...} so the final prompt
# can cite the page an excerpt came from.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
all_chunks = []

for doc in documents:
    metadata = doc.metadata
    # PyMuPDFLoader records the page under the key 'page' as a 0-based index;
    # it does not set 'page_number', which is why every citation used to come
    # out as "(Page None)". Convert to the 1-based number a reader expects.
    if isinstance(metadata.get('page'), int):
        page_number = metadata['page'] + 1
    else:
        # Fall back to an explicit 'page_number' entry if some other loader
        # provided one; otherwise the page stays unknown (None).
        page_number = metadata.get('page_number', None)

    all_chunks.extend(
        {'text': split, 'page_number': page_number}
        for split in splitter.split_text(doc.page_content)
    )

print(f"Total {len(all_chunks)} smart chunks created.")

# Step 4: Embed Chunks (Dense + Sparse) with Correct Model
# A single BGE-M3 encode call yields both representations; ColBERT multi-vectors
# are explicitly switched off because nothing downstream uses them.
embed_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

texts_for_embedding = [item['text'] for item in all_chunks]

outputs = embed_model.encode(texts_for_embedding, return_dense=True, return_sparse=True, return_colbert_vecs=False)

# dense_embeds: 2-D array with one row per chunk (its second dimension is used
#               as the FAISS index dimension in Step 5).
# sparse_embeds: one lexical-weight mapping per chunk, in the same order.
dense_embeds, sparse_embeds = outputs["dense_vecs"], outputs["lexical_weights"]

# Step 5: Build FAISS Index for Dense
# Exact (brute-force) L2 index over the dense chunk embeddings. Row i of the
# index corresponds to all_chunks[i].
embedding_dim = dense_embeds.shape[1]
faiss_index = faiss.IndexFlatL2(embedding_dim)
# FAISS operates on contiguous float32 data. The embedding model is loaded with
# use_fp16=True, so the vectors may arrive as float16 — convert explicitly
# rather than relying on an implicit cast (which not every FAISS build does).
faiss_index.add(np.ascontiguousarray(dense_embeds, dtype=np.float32))
# Index positions are used later to look up all_chunks, so the counts must agree.
assert faiss_index.ntotal == len(all_chunks), "FAISS index and chunk list are out of sync"

# Save sparse vectors separately
# (plain alias: sparse_index[i] is the lexical-weight mapping of all_chunks[i])
sparse_index = sparse_embeds

# Step 6: Load Cross-Encoder Reranker
# Used in Step 12, where reranker.predict(...) scores (query, chunk text) pairs.
reranker = CrossEncoder('BAAI/bge-reranker-large')

# Step 7: Load the generator LLM (Zephyr-7B-beta)
# Weights are loaded in float16 and placed automatically across the available
# devices (device_map="auto").

model_name = "HuggingFaceH4/zephyr-7b-beta"

# trust_remote_code is deliberately NOT enabled: it allows Python code shipped
# in the model repository to run locally, and this checkpoint loads with the
# model/tokenizer classes built into transformers. Only turn it on for a model
# that genuinely requires custom code, after reviewing that code.
tokenizer = AutoTokenizer.from_pretrained(model_name)
llm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Step 8: Define Query and Embed
# The query goes through the same BGE-M3 encoder (same flags) as the chunks so
# that its dense and sparse representations are comparable with the index.
user_query = "What are the treatment for prostate cancer?"

query_outputs = embed_model.encode([user_query], return_dense=True, return_sparse=True, return_colbert_vecs=False)

# The encoder was given a one-element batch: keep the dense result as a batch
# (FAISS search takes a 2-D array) and unwrap the single sparse mapping.
query_dense, query_sparse = query_outputs["dense_vecs"], query_outputs["lexical_weights"][0]

# Step 9: Dense Retrieval
# Ask for at most 20 neighbours, but never more than the index holds: when k
# exceeds the number of stored vectors FAISS pads the result with index -1,
# which downstream list indexing would silently treat as "the last chunk".
top_k = min(20, faiss_index.ntotal)
# FAISS expects a contiguous float32 query batch (the fp16 encoder may return float16).
D, I = faiss_index.search(np.ascontiguousarray(query_dense, dtype=np.float32), k=top_k)

# Step 10: Sparse Retrieval (Simple Dot Product)
def sparse_dot(a, b):
    """Return the dot product of two sparse vectors given as mappings.

    Both arguments map a token id to its weight. Only token ids present in
    both mappings contribute to the result. Membership in ``b`` is tested with
    ``in`` before indexing, so ``b`` is never indexed with a missing key.

    Args:
        a: mapping of token id -> weight (iterated over).
        b: mapping of token id -> weight (looked up).

    Returns:
        The sum of ``a[t] * b[t]`` over shared token ids; ``0.0`` if none are shared.
    """
    shared_products = (
        weight * b[token_id]
        for token_id, weight in a.items()
        if token_id in b
    )
    return sum(shared_products, 0.0)

# Lexical score of the query against every chunk; sparse_scores[i] belongs to
# the chunk at position i (same ordering as sparse_index / all_chunks).
sparse_scores = [sparse_dot(query_sparse, chunk_weights) for chunk_weights in sparse_index]

# Step 11: Combine Dense + Sparse
# For each dense candidate, blend a dense similarity with the lexical score and
# keep the best few chunks for reranking.
DENSE_WEIGHT = 0.5   # contribution of the dense similarity
SPARSE_WEIGHT = 0.5  # contribution of the lexical (sparse) score
TOP_N = 3            # number of chunks passed on to the reranker

hybrid_scores = []

# Walk indices and distances together instead of searching the index list for
# every candidate (the old list(...).index(idx) lookup was quadratic).
for idx, sq_dist in zip(I[0], D[0]):
    idx = int(idx)
    if idx < 0:
        # FAISS uses -1 for "no result"; indexing with it would pick the last chunk.
        continue
    # IndexFlatL2 reports squared L2 distance. For unit-length vectors
    # ||q - d||^2 = 2 - 2*cos(q, d), so this recovers cosine similarity and puts
    # the dense score on a similarity scale comparable with the lexical score
    # (a raw negated distance is on a different scale and over-weights the dense side).
    # NOTE(review): assumes the dense embeddings are L2-normalised, which is
    # believed to be the BGE-M3 encoder default — confirm. If they are not, this
    # is still a monotone decreasing function of distance, just not exactly cosine.
    dense_score = 1.0 - float(sq_dist) / 2.0
    # float(...) keeps low-precision numpy scalar types out of the arithmetic.
    sparse_score = float(sparse_scores[idx])
    hybrid_score = DENSE_WEIGHT * dense_score + SPARSE_WEIGHT * sparse_score
    hybrid_scores.append((idx, hybrid_score))

# Sort by hybrid score (best first)
hybrid_sorted = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)

final_top_chunks = [all_chunks[idx] for idx, _ in hybrid_sorted[:TOP_N]]  # Pick top 3

# Step 12: Rerank with Cross-Encoder
# Score each (query, chunk text) pair jointly, then reorder the shortlisted
# chunks from highest to lowest reranker score.
pairs = [(user_query, candidate['text']) for candidate in final_top_chunks]
scores = reranker.predict(pairs)
# argsort is ascending; reverse it so the best-scoring chunk comes first.
sorted_indices = np.argsort(scores)[::-1]
final_reranked_chunks = [final_top_chunks[position] for position in sorted_indices]


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.7).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Loaded 223 pages from PDF
Total 1895 smart chunks created.


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

pre tokenize: 100%|██████████| 8/8 [00:00<00:00, 19.52it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 8/8 [00:12<00:00,  1.50s/it]


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



===== FINAL ANSWER =====


You are a medical research assistant.
Use the following report excerpts to answer the question concisely and accurately. Always cite page numbers.

Report Excerpts:
(Page None) Radiopharmaceutical Therapy
• Radiopharmaceutical therapies for prostate cancer are suitable options for improving survival and/or PFS in select patients with advanced castration-
resistant disease. Due to prior therapy exposure, specific targets, and hematologic effects of these therapies, careful selection and sequencing strategy
with other therapies is important. This section discusses the two currently FDA-approved agents in use (Ra-223, Lu-177–PSMA-617).

(Page None) routine primary therapy for localized prostate cancer due to lack of long-
term data comparing these treatments to radiation or radical 
prostatectomy. At this time, the panel recommends only cryosurgery and 
high-intensity focused ultrasound (HIFU; category 2B) as local therapy 
options for RT recurrence in the abse

In [None]:

# Step 13: Build Final Prompt with Citations
# Each reranked excerpt is prefixed with its page label and followed by a blank
# line; the excerpts are concatenated in reranked order.
context = "".join(
    f"(Page {excerpt['page_number']}) {excerpt['text']}\n\n"
    for excerpt in final_reranked_chunks
)

# Instruction + excerpts + question, ending with "Answer:" for the model to continue.
final_prompt = f"""
You are a medical research assistant.
Use the following report excerpts to answer the question concisely and accurately. Always cite page numbers.

Report Excerpts:
{context}

Question: {user_query}
Answer:
"""

# Step 14: Generate Answer with the loaded LLM
# Keep the full tokenizer output (input_ids AND attention_mask) and move it to
# the device the model actually lives on, instead of hard-coding "cuda" — the
# model was loaded with device_map="auto".
inputs = tokenizer(final_prompt, return_tensors="pt").to(llm.device)
input_ids = inputs["input_ids"]

# Passing the attention mask and an explicit pad token id removes the
# "attention mask ... not set" / "Setting pad_token_id" warnings and the
# unreliable behaviour they warn about. The result is stored under its own name
# so it does not overwrite the `outputs` produced by the embedding step.
generated_ids = llm.generate(
    **inputs,
    max_new_tokens=300,
    pad_token_id=tokenizer.eos_token_id,
)

# The generated sequence starts with the prompt tokens; drop them so `answer`
# holds only the model's continuation rather than echoing the whole prompt.
prompt_length = input_ids.shape[-1]
answer = tokenizer.decode(generated_ids[0][prompt_length:], skip_special_tokens=True).strip()
print("\n===== FINAL ANSWER =====\n")
print(answer)