In [2]:
### Biomedical

In [None]:
                ┌────────────────────┐
                │   User Query       │
                └────────┬───────────┘
                         │
                         ▼
                ┌────────────────────┐
                │   Embed Query      │ ◄─ Using dense embedding model (e.g., SBERT, OpenAI)
                └────────┬───────────┘
                         │
                         ▼
                ┌────────────────────┐
                │ Vector Search (kNN)│ ◄─ In vector store (e.g., FAISS, Pinecone)
                └────────┬───────────┘
                         │
                         ▼
                ┌────────────────────┐
                │ Retrieve Top-k Docs│
                └────────┬───────────┘
                         │
                         ▼
                ┌────────────────────┐
                │  Format Context    │
                └────────┬───────────┘
                         │
                         ▼
                ┌────────────────────┐
                │  Prompt LLM (RAG)  │ ◄─ Append query + context
                └────────┬───────────┘
                         │
                         ▼
                ┌────────────────────┐
                │   Generated Answer │
                └────────────────────┘


In [8]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
import os

# End-to-end retrieval-augmented generation (RAG) demo:
# load a text file -> split into chunks -> embed into a FAISS index ->
# retrieve the most similar chunks for a query -> ask a local Ollama model
# to answer from those chunks.

# 1. Load and split your documents
loader = TextLoader("data/brca1_cancer_links.txt")  # or use DirectoryLoader
documents = loader.load()

# Overlapping chunks keep some shared context across chunk boundaries.
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# 2. Embed documents and create vector store
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(chunks, embeddings)

# 3. Define the retriever
# The number of chunks to return must be passed through `search_kwargs`;
# a bare `k=4` keyword is not a retriever field and never reaches the
# similarity search, so the store's default would be used instead.
retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},
)

# 4. Setup Ollama LLM
# temperature=0 requests the least-random decoding the model offers.
llm = Ollama(model="llama3:instruct", temperature=0)

# 5. Create RAG chain
# return_source_documents=True adds the retrieved chunks to the result
# under "source_documents" so the answer can be traced back to its context.
rag_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

# 6. Ask a question
query = "How does BRCA1 mutation create a therapeutic opportunity for cancer treatment?"
result = rag_chain.invoke(query)

# 7. Show result
print("Answer:")
print(result["result"])

print("\n📚 Source Chunks Used:")
for doc in result["source_documents"]:
    print(f"- {doc.page_content[:150]}...")  # truncate for readability


Answer:
According to the text, a BRCA1 mutation creates a therapeutic opportunity by impairing homologous recombination repair and leading to genomic instability. This vulnerability can be exploited with PARP inhibitors, such as Olaparib, which accumulate DNA damage in BRCA1-deficient cells, resulting in cell death.

📚 Source Chunks Used:
- BRCA1 is a tumor suppressor gene that plays a critical role in the repair of DNA double-strand breaks through homologous recombination repair. Mutatio...
- The ATM gene is another DNA damage response gene that works upstream of BRCA1. Mutations in ATM can also sensitize tumors to DNA-damaging agents.

Che...
- PARP inhibitors, such as Olaparib, are targeted cancer therapies that exploit synthetic lethality. In BRCA1-deficient cells, inhibition of PARP leads ...
- RAD51 is recruited by BRCA2 during homologous recombination repair and plays a key role in strand invasion and exchange. A disruption in BRCA2 impairs...
