In [5]:
import pathlib

# Folder holding the raw PDFs; every "*.pdf" directly inside it is collected
# in sorted order so later steps always see the files in the same sequence.
PDF_DIR = pathlib.Path("data/raw")
pdf_paths = list(PDF_DIR.glob("*.pdf"))
pdf_paths.sort()
# Fail immediately when the glob matched nothing.
assert len(pdf_paths) > 0, "No PDFs found - check path"

from langchain_community.document_loaders import UnstructuredPDFLoader, PyPDFLoader
from langchain.docstore.document import Document
from tqdm import tqdm

# Accumulates one Document per successfully read PDF page, across all files.
source_docs = []

for path in tqdm(pdf_paths, desc="Reading PDFs"):
    try:
        page_docs = [*PyPDFLoader(str(path)).lazy_load()]
    except Exception as e:
        # Best effort: report the unreadable file and move on to the next one.
        print(f"Failed at {path.name}: {e}")
    else:
        for page_doc in page_docs:
            # Replace "source" with the short file stem and copy the loader's
            # "page" value into "page_num" (None when the loader set no "page").
            page_doc.metadata.update(
                source=path.stem,
                page_num=page_doc.metadata.get("page"),
            )
        source_docs += page_docs

Reading PDFs: 100%|██████████| 13/13 [00:10<00:00,  1.22it/s]


In [6]:
import re

def _normalize_whitespace(text: str) -> str:
    """Tidy the whitespace of one page of extracted text.

    Steps, in order:
      1. drop whitespace that sits directly before a line break,
      2. collapse runs of three or more newlines into exactly two
         (i.e. keep at most one blank line between paragraphs),
      3. strip leading/trailing whitespace of the whole text.

    Args:
        text: Raw page text.

    Returns:
        The cleaned text.
    """
    # Match only NON-newline whitespace before "\n". The previous pattern
    # r"\s+\n" was greedy over newlines too, so "\n\n\n" became "\n": every
    # blank line disappeared and the 3+-newline rule below could never fire.
    text = re.sub(r"[^\S\n]+\n", "\n", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


# Clean every loaded page in place (the Document objects are reused downstream).
for page_doc in source_docs:
    page_doc.page_content = _normalize_whitespace(page_doc.page_content)

In [7]:
from transformers import AutoTokenizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter whose length function is the "thenlper/gte-small" tokenizer, so
# chunk_size / chunk_overlap are measured with that tokenizer rather than in
# characters. add_start_index records each chunk's offset in its source page
# under metadata["start_index"].
splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    AutoTokenizer.from_pretrained("thenlper/gte-small"),
    chunk_size=200,
    chunk_overlap=20,
    add_start_index=True,
    strip_whitespace=True,
)
# Backward-compatible alias: the variable used to be spelled `spliiter`.
spliiter = splitter

print("Splitting and deduplicating...")

# `seen` holds the text of every chunk kept so far. A chunk whose text was
# already seen (in this page or any earlier one) is dropped, so only the first
# occurrence - with its metadata - survives.
docs_processed, seen = [], set()

for page_doc in tqdm(source_docs):
    for chunk in splitter.split_documents([page_doc]):
        if chunk.page_content in seen:
            continue
        seen.add(chunk.page_content)
        docs_processed.append(chunk)

Splitting and deduplicating...


100%|██████████| 432/432 [00:03<00:00, 137.12it/s]


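The cell above splits the cleaned page text into token-limited chunks and drops exact duplicates, producing the list of `Document` objects that is later embedded and indexed with FAISS.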

In [8]:
## SANITY CHECK - CONFIRM THAT THE SPLITTING STEP ACTUALLY PRODUCED DOCUMENT CHUNKS

# Look at one arbitrary chunk (index 15): its class, its metadata and at most
# the first 1000 characters of its text, each printed on its own.
doc = docs_processed[15]
for shown in (type(doc), doc.metadata, doc.page_content[:1000]):
    print(shown)

<class 'langchain_core.documents.base.Document'>
{'producer': 'iLovePDF', 'creator': 'Acrobat PDFMaker 11 for Word', 'creationdate': '2023-12-06T15:01:50+05:30', 'author': 'Amol Dighe', 'company': '', 'sourcemodified': 'D:20231206092752', 'subject': 'A Roadmap prepared by the Indian Nuclear Physics Communitywith TIFR, Mumbai as the Nodal Scientific Institution', 'title': 'Mega Science Vision – 2035   Nuclear Physics', 'moddate': '2024-01-24T10:12:18+00:00', 'source': 'DST - MSV2035-NP-Final', 'total_pages': 140, 'page': 14, 'page_label': '15', 'page_num': 14, 'start_index': 0}
MEGA SCIENCE VISION – 2035   NUCLEAR PHYSICS
3
THE DRAFTING AND WO RKING GROUPS
Director TIFR, Mumbai –
Dr. Jayaram Chengalur / Dr. S. Ramakrishnan / Dr. Sandip Trivedi Chairperson
Members from the D rafting Group
Dr. Alphonsa Joseph Palakkel, IPR, Gandhinagar Member
Dr. Aradhana Srivastava, BARC, Mumbai Member
Dr. Bedangadas Mohanty, NISER, Bhubaneswar Member
Dr. Rudrajyoti Palit, TIFR, Mumbai Member
Other exper

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.vectorstores.utils import DistanceStrategy

# Embedding model used to vectorise every chunk (same model name as the
# tokenizer that sized the chunks).
embed = HuggingFaceEmbeddings(model_name="thenlper/gte-small")

# Embed all processed chunks and build the FAISS store from them.
vector_db = FAISS.from_documents(
    documents=docs_processed,
    embedding=embed,
    distance_strategy=DistanceStrategy.COSINE,
)

# Persist the index to disk so it can be reloaded without re-embedding.
vector_db.save_local("data/processed/faiss_index")

  embed = HuggingFaceEmbeddings(model_name="thenlper/gte-small")


In [33]:
from rapidfuzz import fuzz, process  # lightweight fuzzy search
from spellchecker import SpellChecker
spell = SpellChecker()

def spellfix(text: str, checker=None) -> str:
    """Return ``text`` with misspelled words replaced by the checker's suggestion.

    The text is split on whitespace and re-joined with single spaces. For each
    token, non-alphanumeric characters at either end (e.g. "?", ",", quotes,
    brackets) are set aside, the remaining core word is checked, and the
    punctuation is put back afterwards - so "for?" is looked up as "for" and
    keeps its question mark.

    A token is left untouched when its core word
      * is empty (the token is pure punctuation),
      * is all upper-case (treated as an acronym such as "ISRO" or "DST"),
      * contains a digit (e.g. "2035", "GSLV-F10"),
      * is known to the checker, or
      * has no suggestion from the checker.

    Args:
        text: Free-form text, typically a user query.
        checker: Object supporting ``word in checker`` and
            ``checker.correction(word)``. Defaults to the module-level
            ``spell`` instance.

    Returns:
        The corrected text.
    """
    if checker is None:
        checker = spell

    fixed_tokens = []
    for token in text.split():
        # Locate the core word by skipping non-alphanumerics at both ends.
        start, end = 0, len(token)
        while start < end and not token[start].isalnum():
            start += 1
        while end > start and not token[end - 1].isalnum():
            end -= 1
        core = token[start:end]

        keep_as_is = (
            not core
            or core.isupper()
            or any(ch.isdigit() for ch in core)
            or core.lower() in checker
        )
        if keep_as_is:
            fixed_tokens.append(token)
            continue

        suggestion = checker.correction(core)
        if suggestion:
            # Re-attach whatever punctuation surrounded the original word.
            fixed_tokens.append(token[:start] + suggestion + token[end:])
        else:
            fixed_tokens.append(token)
    return " ".join(fixed_tokens)


class FuzzyRetrieverTool(RetrieverTool):
    """Retriever tool that spell-corrects the query before searching.

    NOTE(review): ``RetrieverTool`` must already exist when this class
    statement runs, but in this notebook it is defined in a later cell, so a
    fresh top-to-bottom run would fail here - confirm the intended cell order.
    NOTE(review): ``RetrieverTool.forward`` in this notebook also passes the
    query through ``spellfix``, so on this path the correction is applied
    twice - confirm that is intended.
    """

    def forward(self, query: str):
        """Spell-fix ``query`` with ``spellfix`` and delegate to the parent's ``forward``."""
        corrected_query = spellfix(query)
        return super().forward(corrected_query)


In [35]:
from smolagents import Tool
from langchain.vectorstores import VectorStore

class RetrieverTool(Tool):
    """smolagents tool that runs a similarity search against a vector store.

    The query is spell-corrected with ``spellfix`` and then passed to the
    store's ``similarity_search``; the matching passages are returned as one
    formatted string.
    """

    name = "retriever"
    description = """
        Using semantic similarity, retrieves some documents from the knowledge base that have the closest embeddings to the input query.
        Always call this tool **before** answering any question that might be answered using documents from the DST / ISRO PDF knowledge base.
        Return the 3-7 most relevant passages.
        """

    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, vector_db: VectorStore, k: int = 7, **kwargs):
        """Store the vector store to search and the number of passages to return.

        Args:
            vector_db: Vector store queried by ``forward``.
            k: Number of passages requested per query (default 7, the value
                that used to be hard-coded).
            **kwargs: Forwarded unchanged to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        self.vector_db = vector_db
        self.k = k

    def forward(self, query: str):
        """Return the ``k`` passages most similar to ``query`` as one string.

        Args:
            query: Search text; must be a ``str``.

        Returns:
            A string starting with "Retrieved documents:" followed by one
            "===== Document i =====" section per passage, with sections
            separated by a blank line.

        Raises:
            AssertionError: If ``query`` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"
        cleaned_query = spellfix(query)
        docs = self.vector_db.similarity_search(
            query=cleaned_query,
            k=self.k,
        )
        passages = [
            f"===== Document {i} =====\n{doc.page_content}"
            for i, doc in enumerate(docs)
        ]
        # Separate passages with a blank line; joining with "" glued each
        # header directly onto the end of the previous passage's text.
        return "\nRetrieved documents:\n" + "\n\n".join(passages)

In [40]:
from smolagents import LiteLLMModel, ToolCallingAgent
import os

# LLM backend; the API key is read from the GEMINI_API_KEY environment variable
# so that no secret is stored in the notebook.
model = LiteLLMModel(
    model_id="gemini/gemini-2.0-flash-lite",
    api_key=os.getenv("GEMINI_API_KEY"),
)

# Expose the vector store to the agent as its only tool.
retriever_tool = RetrieverTool(vector_db)

# The prompt is built from adjacent string literals, one line each. The former
# triple-quoted version carried a stray '"', the code's indentation and doubled
# line breaks (an escaped "\n" plus the real newline) into the prompt.
agent = ToolCallingAgent(
    tools=[retriever_tool],
    model=model,
    instructions=(
        "You are an answer-bot specialised in the Indian space-science PDFs I just gave you.\n"
        "Workflow:\n"
        "1. ALWAYS call the `retriever` tool with the user's query.\n"
        "2. Read its passages, then craft a concise answer.\n"
        "3. Call `final_answer` with that answer.\n"
    ),
)

In [41]:
# Run the agent end-to-end on one question and keep its answer in `output`.
# NOTE(review): "himalyan" is misspelled - presumably on purpose, to exercise
# the retriever's spell-fix step; confirm before "fixing" the query.
output = agent.run("What are himalyan heights most suitable for?")

In [24]:
def semantic_search(query: str, k: int = 4, preview_chars: int = 200, store=None):
    """Query a vector store directly and return a compact view of the hits.

    Args:
        query: Search text, passed to the store unchanged.
        k: Number of hits to request.
        preview_chars: Maximum number of characters of each hit's text to keep
            (default 200, the value that used to be hard-coded).
        store: Object providing ``similarity_search_with_score(query, k=...)``.
            Defaults to the notebook-level ``vector_db``.

    Returns:
        A list with one ``(text_preview, score, metadata)`` tuple per hit, in
        the order the store returned them.
    """
    if store is None:
        store = vector_db
    hits = store.similarity_search_with_score(query, k=k)
    return [
        (hit.page_content[:preview_chars], score, hit.metadata)
        for hit, score in hits
    ]

# Query the vector store directly (no agent involved) with the same misspelled
# question, to inspect the raw matches, their scores and their metadata.
print(semantic_search("What are himalyan heights most suitable for?"))

[("MENU\nHome \xa0>>\xa0Himalayan heights potentially perfect for India's 'Quantum Leap' to space: Study\nHimalayan heights potentially perfect for India's 'QuantumLeap' to space: Study\nIn a pioneering study for", np.float32(0.35979202), {'producer': 'Skia/PDF m133', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'creationdate': '2025-02-25T18:40:23+00:00', 'title': "Himalayan heights potentially perfect for India's 'Quantum Leap' to space: Study | Department Of Science & Technology", 'moddate': '2025-02-25T18:40:23+00:00', 'source': 'Himalayan heights potentially perfect for India_s _Quantum Leap_ to space_ Study _ Department Of Science & Technology', 'total_pages': 3, 'page': 0, 'page_label': '1', 'page_num': 0, 'start_index': 0}), ('India and the other with Indian contribution), one\nfrom Bulgaria and two from the United States.\na) Indian payloads\nT errain Mapping Camera (TMC), a CCD camera\nthat