In [None]:
"""
- Load chunked documents from data/processed/chunks.jsonl
- Convert chunks to LangChain Document objects with metadata (page + chunk id)
- Compute embeddings using a local SentenceTransformer model
- Build a FAISS vector index for fast similarity search (top-k retrieval)
- Save the index to disk for reuse, and validate retrieval quality via test queries
 
 """

In [5]:
#Load chunks from disk

import json
from langchain_core.documents import Document

CHUNKS_PATH = "../data/processed/chunks.json"

# The file is parsed as JSON Lines: one JSON object per line, each providing
# "text", "chunk_id" and (optionally) a "metadata" dict.
docs = []
with open(CHUNKS_PATH, "r", encoding="utf-8") as f:
    for line in f:
        # Tolerate blank lines (e.g. a trailing newline at EOF) instead of
        # letting json.loads raise on an empty string.
        if not line.strip():
            continue
        r = json.loads(line)
        docs.append(
            Document(
                page_content=r["text"],
                # `or {}` also covers records whose "metadata" is null.
                metadata={**(r.get("metadata") or {}), "chunk_id": r["chunk_id"]},
            )
        )

# Fail with an explicit message rather than an IndexError on docs[0] below.
if not docs:
    raise ValueError(f"No chunks found in {CHUNKS_PATH}")

print("Loaded chunks as Documents:", len(docs))
print("Example metadata:", docs[0].metadata)
print(docs[0].page_content[:250])


Loaded chunks as Documents: 16634
Example metadata: {'producer': 'Atop CHM to PDF Converter', 'creator': 'Atop CHM to PDF Converter', 'creationdate': '2012-06-15T05:44:40+00:00', 'moddate': '2014-04-21T07:53:19+10:00', 'title': 'The Merck Manual of Diagnosis & Therapy, 19th Edition', 'source': '../data/raw/The_Merck_Manual.pdf', 'total_pages': 4114, 'page': 2, 'page_label': 'iii', 'chunk_id': 0}
Table of Contents
1
Front  
  ................................................................................................................................................................................................................
1
Cover  
 


In [None]:
#Note: No OpenAI key needed when using Ollama, but keep the cell here, in case of future integration of `OPENAI_API_KEY`

#Load API key from .env
#import os
#from dotenv import load_dotenv

#load_dotenv()
#assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not found. Check your .env file."
#print("OPENAI_API_KEY loaded")

In [6]:
# Create embeddings + build FAISS index

from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embed every chunk with a sentence-transformers model and index the
# resulting vectors in FAISS.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)

# Embedding all documents happens inside this call.
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)
print("FAISS index built with:", embedding_model_name)


  from .autonotebook import tqdm as notebook_tqdm


FAISS index built with: sentence-transformers/all-MiniLM-L6-v2


In [7]:
import os
INDEX_DIR = "../data/index/faiss_merck_hf"
# Derive the parent directory from INDEX_DIR so the two paths cannot drift
# apart if INDEX_DIR is ever changed.
os.makedirs(os.path.dirname(INDEX_DIR), exist_ok=True)

# Persist the in-memory index so later notebooks can reload it without
# re-embedding every chunk.
vectorstore.save_local(INDEX_DIR)
print("Saved FAISS index to:", INDEX_DIR)


Saved FAISS index to: ../data/index/faiss_merck_hf


In [8]:
# Test retrieval

# Smoke-test the index: fetch the 5 nearest chunks for a sample question and
# print where each one came from plus the start of its text.
query = "approach to a patient with anemia"
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

hits = retriever.invoke(query)

for rank, hit in enumerate(hits, start=1):
    meta = hit.metadata
    page = meta.get("page", "?")
    chunk_id = meta.get("chunk_id")
    print(f"\n--- Hit {rank} (page {page}, chunk_id {chunk_id}) ---")
    print(hit.page_content[:400])




--- Hit 1 (page 1056, chunk_id 4399) ---
unless iron or other essential nutrients are depleted.
Evaluation of Anemia
Anemia is not a diagnosis; it is a manifestation of an underlying disorder. Thus, even mild, asymptomatic
anemia should be investigated so that the primary problem can be diagnosed and treated.
Acute or chronic blood loss is the first consideration. The diagnosis usually is based on history,
examination, and a stool test f

--- Hit 2 (page 2802, chunk_id 11923) ---
hemoglobinopathies (using hemoglobin electrophoresis). If these tests are nondiagnostic and there is
no response to empiric treatment, consultation with a hematologist is usually warranted.
• For macrocytic anemias: Evaluation includes serum folate and B
12
 levels.
Treatment
Treatment is directed at reversing the anemia. Transfusion is usually indicated for any anemia if severe
constitutional sym

--- Hit 3 (page 1069, chunk_id 4462) ---
Treatment depends on the cause. For treatment of folate and vitamin B
1