In [1]:
# --- Import necessary libraries ---
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader
import os
from collections import defaultdict

# --- Step 1: Define your ROOT directory (IMPORTANT) ---
pdf_directory = "NcertData"

# --- Step 2: Load ALL PDFs recursively using PyMuPDFLoader ---
print("\nüìÑ [OBSERVE] Loading all PDF files recursively...\n")

pdf_loader = DirectoryLoader(
    path=pdf_directory,
    glob="**/*.pdf",              # ‚úÖ FIXED: recursive glob
    loader_cls=PyMuPDFLoader,
    show_progress=True
)

pdf_docs = pdf_loader.load()

print(f"‚úÖ Loaded {len(pdf_docs)} PDF pages from '{pdf_directory}'\n")

# --- Step 3: Normalize paths + group pages by PDF (debug only) ---
pdf_files = defaultdict(list)

for doc in pdf_docs:
    # Normalize Windows paths
    src = doc.metadata.get("source", "Unknown").replace("\\", "/")
    pdf_files[src].append(doc)

# --- Step 4: Preview loaded PDFs (sanity check) ---
print("üìò --- PDF File Previews (Debug) ---\n")

for file_path, docs in list(pdf_files.items())[:5]:  # preview first 5 PDFs
    file_name = os.path.basename(file_path)
    print(f"üìÑ File: {file_name}")
    print(f"   Pages loaded: {len(docs)}")

    preview_text = docs[0].page_content.strip().replace("\n", " ")
    preview_text = preview_text[:400] + ("..." if len(preview_text) > 400 else "")
    print(f"   üìù Preview: {preview_text}")
    print("-" * 100)


  from .autonotebook import tqdm as notebook_tqdm



üìÑ [OBSERVE] Loading all PDF files recursively...



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 44/44 [00:03<00:00, 14.08it/s]

‚úÖ Loaded 748 PDF pages from 'NcertData'

üìò --- PDF File Previews (Debug) ---

üìÑ File: hecu101.pdf
   Pages loaded: 7
   üìù Preview: Chapter 1‚Äâ‚Äî‚ÄâExploring the Investigative World of Science 1 Exploring the  Investigative  World of Science 1 Dear Young Scientists,  Welcome back! On the first page of each chapter, you will find a set of questions.  These are not meant for any exam‚Äî‚Äâthey are unique invitations to spark your  curiosity to explore the world of science! Why is one side of a puri thinner than the other? Are there more...
----------------------------------------------------------------------------------------------------
üìÑ File: hecu102.pdf
   Pages loaded: 20
   üìù Preview: 8 Curiosity ‚Äî Textbook of Science for Grade 8 2 The Invisible Living  World: Beyond Our  Naked Eye Probe and ponder 	 z Have you ever wondered what you might see if the invisible  world around you became visible? 	 z How do you think your observation of this hidden world might  ch




In [3]:
from collections import Counter

classes = []
for doc in pdf_docs:
    src = doc.metadata["source"].replace("\\", "/")
    for part in src.split("/"):
        if part.lower().startswith("class"):
            classes.append(part)

Counter(classes)



Counter({'Class 8th': 498, 'Class 9th': 250})

In [4]:
for i, doc in enumerate(pdf_docs[:50]):  # first 10 for safety
    print(f"\n--- Document {i+1} ---")
    print(doc.metadata)



--- Document 1 ---
{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-06-28T11:38:46+05:30', 'source': 'NcertData\\Class 9th\\Hindi\\hecu101.pdf', 'file_path': 'NcertData\\Class 9th\\Hindi\\hecu101.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-07-23T15:34:29+05:30', 'trapped': '', 'modDate': "D:20250723153429+05'30'", 'creationDate': "D:20250628113846+05'30'", 'page': 0}

--- Document 2 ---
{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-06-28T11:38:46+05:30', 'source': 'NcertData\\Class 9th\\Hindi\\hecu101.pdf', 'file_path': 'NcertData\\Class 9th\\Hindi\\hecu101.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2025-07-23T15:34:29+05:30', 'trapped': '', 'modDate': "D:20250723153429+05'30'", 'creationDate': "D:20250628113846+05'30'", 'page

In [8]:
metadata_keys = set()

for doc in pdf_docs:
    metadata_keys.update(doc.metadata.keys())

print("Metadata keys found:", metadata_keys)


Metadata keys found: {'author', 'creator', 'subject', 'producer', 'keywords', 'trapped', 'page', 'moddate', 'file_path', 'creationdate', 'total_pages', 'source', 'title', 'modDate', 'format', 'creationDate'}


In [9]:
from collections import Counter

classes = []

for doc in pdf_docs:
    src = doc.metadata["source"].replace("\\", "/")
    for part in src.split("/"):
        if part.lower().startswith("class"):
            classes.append(part)

Counter(classes)


Counter({'Class 8th': 498, 'Class 9th': 250})

In [12]:


import os

root = "C:/Users/Sujal/PROJECTS/MiniProject/NcertData"

for item in os.listdir(root):
    print(item)


Class 8th
Class 9th


In [14]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

pdf_directory = r"C:\Users\Sujal\PROJECTS\MiniProject\NcertData"  # üëà SAME as os.listdir

loader = DirectoryLoader(
    path=pdf_directory,
    glob="**/*.pdf",          # recursive
    loader_cls=PyMuPDFLoader,
    show_progress=True
)

pdf_docs = loader.load()

print(f"Total pages loaded: {len(pdf_docs)}")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 44/44 [00:02<00:00, 16.76it/s]

Total pages loaded: 748





In [15]:
for doc in pdf_docs[:10]:
    print(doc.metadata["source"])


C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu102.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu102.pdf
C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu102.pdf


In [16]:
import os

root = r"C:\Users\dande\BASE\NcertData"

for dirpath, dirnames, filenames in os.walk(root):
    print("\nDIR:", dirpath)
    for f in filenames:
        print("  FILE:", f)


In [17]:
import os

class8_pdfs = []

for dirpath, _, filenames in os.walk(r"C:\Users\dande\BASE\NcertData"):
    if "class8" in dirpath.lower():
        for f in filenames:
            if f.lower().endswith(".pdf"):
                class8_pdfs.append(os.path.join(dirpath, f))

print("Class 8 PDFs found:", len(class8_pdfs))
for p in class8_pdfs[:5]:
    print(p)


Class 8 PDFs found: 0


In [19]:
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

pdf_directory = r"C:\Users\Sujal\PROJECTS\MiniProject\NcertData"

loader = DirectoryLoader(
    path=pdf_directory,
    glob="**/*.*",              # load EVERYTHING
    loader_cls=PyMuPDFLoader,
    show_progress=True
)

all_docs = loader.load()

# keep only PDFs (case‚Äëinsensitive)
pdf_docs = [
    doc for doc in all_docs
    if doc.metadata["source"].lower().endswith(".pdf")
]

print(f"Total PDF pages loaded: {len(pdf_docs)}")


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 44/44 [00:02<00:00, 16.52it/s]

Total PDF pages loaded: 748





In [20]:
from collections import Counter

classes = []
for doc in pdf_docs:
    src = doc.metadata["source"].replace("\\", "/")
    for part in src.split("/"):
        if part.lower().startswith("class"):
            classes.append(part)

print(Counter(classes))


Counter({'Class 8th': 498, 'Class 9th': 250})


In [21]:
unique_sources = sorted({
    doc.metadata["source"].replace("\\", "/")
    for doc in pdf_docs
})

print(f"Total unique PDF files: {len(unique_sources)}\n")

for src in unique_sources:
    print(src)


Total unique PDF files: 44

C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu101.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu102.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu103.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu104.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu105.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu106.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu107.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu108.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu109.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu110.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu111.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Science/hecu112.pdf
C:/Users/Sujal/PROJECTS/MiniProject/NcertData/Class 8th/Scie

In [22]:
for i, doc in enumerate(pdf_docs[:]):
    print(f"\n--- Document {i+1} ---")
    for k, v in doc.metadata.items():
        print(f"{k}: {v}")




--- Document 1 ---
producer: Adobe PDF Library 17.0
creator: Adobe InDesign 20.0 (Windows)
creationdate: 2025-06-28T11:38:46+05:30
source: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
file_path: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
total_pages: 7
format: PDF 1.4
title: 
author: 
subject: 
keywords: 
moddate: 2025-07-23T15:34:29+05:30
trapped: 
modDate: D:20250723153429+05'30'
creationDate: D:20250628113846+05'30'
page: 0

--- Document 2 ---
producer: Adobe PDF Library 17.0
creator: Adobe InDesign 20.0 (Windows)
creationdate: 2025-06-28T11:38:46+05:30
source: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
file_path: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
total_pages: 7
format: PDF 1.4
title: 
author: 
subject: 
keywords: 
moddate: 2025-07-23T15:34:29+05:30
trapped: 
modDate: D:20250723153429+05'30'
creationDate: D:20250628113846+05'30'
page: 1

--- Document 3 

In [23]:
def enrich_metadata(doc):
    path = doc.metadata["source"].replace("\\", "/")
    parts = path.split("/")

    class_name = None
    subject = None

    for i, part in enumerate(parts):
        if part.lower().startswith("class"):
            class_name = part
            subject = parts[i + 1] if i + 1 < len(parts) else None
            break

    # language from filename
    filename = parts[-1].lower()
    if "_en" in filename:
        language = "en"
    elif "_hi" in filename or "_hn" in filename:
        language = "hi"
    else:
        language = "unknown"

    doc.metadata.update({
        "class": class_name,
        "subject": subject,
        "language": language,
        "board": "NCERT",
        "content_type": "textbook"
    })

    return doc


In [24]:
enriched_docs = [enrich_metadata(doc) for doc in pdf_docs]


In [25]:
from collections import Counter

Counter(doc.metadata["class"] for doc in enriched_docs)
Counter(doc.metadata["subject"] for doc in enriched_docs)
Counter(doc.metadata["language"] for doc in enriched_docs)



Counter({'unknown': 748})

In [26]:
enriched_docs = [enrich_metadata(doc) for doc in pdf_docs]


In [27]:
for i, doc in enumerate(enriched_docs[:]):
    print(f"\n--- Enriched Document {i+1} ---")
    for k, v in doc.metadata.items():
        print(f"{k}: {v}")



--- Enriched Document 1 ---
producer: Adobe PDF Library 17.0
creator: Adobe InDesign 20.0 (Windows)
creationdate: 2025-06-28T11:38:46+05:30
source: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
file_path: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
total_pages: 7
format: PDF 1.4
title: 
author: 
subject: Hindi
keywords: 
moddate: 2025-07-23T15:34:29+05:30
trapped: 
modDate: D:20250723153429+05'30'
creationDate: D:20250628113846+05'30'
page: 0
class: Class 9th
language: unknown
board: NCERT
content_type: textbook

--- Enriched Document 2 ---
producer: Adobe PDF Library 17.0
creator: Adobe InDesign 20.0 (Windows)
creationdate: 2025-06-28T11:38:46+05:30
source: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
file_path: C:\Users\Sujal\PROJECTS\MiniProject\NcertData\Class 9th\Hindi\hecu101.pdf
total_pages: 7
format: PDF 1.4
title: 
author: 
subject: Hindi
keywords: 
moddate: 2025-07-23T15:34:29+05:30
trappe

In [28]:
## Cleaning the data

import re
from langchain_core.documents import Document

def clean_text(text: str) -> str:
    # Remove excessive newlines
    text = re.sub(r"\n{2,}", "\n", text)

    # Remove multiple spaces or tabs
    text = re.sub(r"[ \t]{2,}", " ", text)

    # Remove page numbers like "Page 12"
    text = re.sub(r"Page\s+\d+", "", text, flags=re.IGNORECASE)

    # Remove stray non-text symbols
    text = re.sub(r"[‚Ä¢‚ñ†‚ñ™‚óÜ‚ñ∫]", " ", text)

    return text.strip()


In [29]:
processed_docs = []

for doc in enriched_docs:
    cleaned_text = clean_text(doc.page_content)

    processed_docs.append(
        Document(
            page_content=cleaned_text,
            metadata=doc.metadata  # metadata stays untouched
        )
    )

print(f"Processed documents: {len(processed_docs)}")


Processed documents: 748


In [30]:
print("BEFORE:")
print(enriched_docs[0].page_content[:500])

print("\nAFTER:")
print(processed_docs[0].page_content[:500])


BEFORE:
Chapter 1‚Äâ‚Äî‚ÄâExploring the Investigative World of Science
1
Exploring the 
Investigative 
World of Science
1
Dear Young Scientists, 
Welcome back! On the first page of each chapter, you will find a set of questions. 
These are not meant for any exam‚Äî‚Äâthey are unique invitations to spark your 
curiosity to explore the world of science!
Why is one side of a puri thinner than the other?
Are there more grains of sand on all the beaches and deserts of the world, or more 
stars in our galaxy?
Right fro

AFTER:
Chapter 1‚Äâ‚Äî‚ÄâExploring the Investigative World of Science
1
Exploring the 
Investigative 
World of Science
1
Dear Young Scientists, 
Welcome back! On the first page of each chapter, you will find a set of questions. 
These are not meant for any exam‚Äî‚Äâthey are unique invitations to spark your 
curiosity to explore the world of science!
Why is one side of a puri thinner than the other?
Are there more grains of sand on all the beaches and deserts of the world, or

In [31]:
## Recursive Character Text Splitter 
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=450,      # ideal for textbooks
    chunk_overlap=90,    # preserves context
    separators=["\n\n", "\n", ".", " ", ""]
)

chunked_docs = text_splitter.split_documents(processed_docs)

print(f"Total chunks created: {len(chunked_docs)}")


Total chunks created: 4238


In [32]:
print(chunked_docs[2400].page_content[:300])
print(chunked_docs[2400].metadata)


interactions among the biotic components. Both 
types of interactions‚Äâ‚Äî‚Äâamong biotic components, 
and between biotic and abiotic components‚Äâ‚Äî‚Äâare 
important for survival in any habitat.
Fig. 12.5: Biotic and abiotic interactions
Fig. 12.4: Fish have indirect effect on plants 
in and around ponds. Th
{'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-06-28T18:09:21+05:30', 'source': 'C:\\Users\\Sujal\\PROJECTS\\MiniProject\\NcertData\\Class 8th\\Science\\hecu112.pdf', 'file_path': 'C:\\Users\\Sujal\\PROJECTS\\MiniProject\\NcertData\\Class 8th\\Science\\hecu112.pdf', 'total_pages': 20, 'format': 'PDF 1.6', 'title': '', 'author': '', 'subject': 'Science', 'keywords': '', 'moddate': '2025-07-23T15:34:37+05:30', 'trapped': '', 'modDate': "D:20250723153437+05'30'", 'creationDate': "D:20250628180921+05'30'", 'page': 5, 'class': 'Class 8th', 'language': 'unknown', 'board': 'NCERT', 'content_type': 'textbook'}


In [33]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name="intfloat/multilingual-e5-large",
    encode_kwargs={
        "normalize_embeddings": True
    }
)

print("‚úÖ multilingual-e5-large embeddings loaded")


  embeddings = HuggingFaceEmbeddings(
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Error while downloading from https://huggingface.co/intfloat/multilingual-e5-large/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading from https://huggingface.co/intfloat/multilingual-e5-large/resolve/main/model.safetensors: HTTPSConnectionPool(host='cas-bridge.xethub.hf.co', port=443): Read timed out.
Trying to resume download...
Error while downloading

‚úÖ multilingual-e5-large embeddings loaded


In [34]:
chunked_docs  # output of RecursiveCharacterTextSplitter


[Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 20.0 (Windows)', 'creationdate': '2025-06-28T11:38:46+05:30', 'source': 'C:\\Users\\Sujal\\PROJECTS\\MiniProject\\NcertData\\Class 9th\\Hindi\\hecu101.pdf', 'file_path': 'C:\\Users\\Sujal\\PROJECTS\\MiniProject\\NcertData\\Class 9th\\Hindi\\hecu101.pdf', 'total_pages': 7, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': 'Hindi', 'keywords': '', 'moddate': '2025-07-23T15:34:29+05:30', 'trapped': '', 'modDate': "D:20250723153429+05'30'", 'creationDate': "D:20250628113846+05'30'", 'page': 0, 'class': 'Class 9th', 'language': 'unknown', 'board': 'NCERT', 'content_type': 'textbook'}, page_content='Chapter 1\u2009‚Äî\u2009Exploring the Investigative World of Science\n1\nExploring the \nInvestigative \nWorld of Science\n1\nDear Young Scientists, \nWelcome back! On the first page of each chapter, you will find a set of questions. \nThese are not meant for any exam‚Äî\u2009they are unique invitations 

In [35]:
from langchain_core.documents import Document


e5_docs = []

for doc in chunked_docs:
    e5_docs.append(
        Document(
            page_content="passage: " + doc.page_content,
            metadata=doc.metadata
        )
    )

print(f"Documents ready for embedding: {len(e5_docs)}")


Documents ready for embedding: 4238


In [36]:
##query = "Explain Newton's first law in simple terms"

##query_embedding = embeddings.embed_query(
  ##  "query: " + query
##)


In [37]:
from langchain_community.vectorstores import Chroma

vectorstore = Chroma.from_documents(
    documents=e5_docs,
    embedding=embeddings,
    collection_name="ncert_multilingual"
)

print("‚úÖ Chroma vector store created")


‚úÖ Chroma vector store created
