In [3]:
from pprint import pprint
from typing import Dict, List

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
# Execute the shared helper notebook inside this kernel so the names it defines
# become available to the cells below. Presumably this is where get_mongo_client,
# get_collection, insert_chunks_to_mongo and `uri` come from — TODO confirm
# against utils.ipynb.
%run ./utils.ipynb

✅ Successfully connected to MongoDB.


In [22]:
# client = get_mongo_client(uri)
# pages_collection = get_collection(client, "pdf_rag_db", "pages")

In [23]:
# # Fetch all pages for all PDFs
# documents = list(pages_collection.find({}))
# print(f"✅ Fetched {len(documents)} page-level docs from Mongo.")

In [24]:
# text_splitter = RecursiveCharacterTextSplitter(
#     chunk_size=500,
#     chunk_overlap=100,
#     separators=["\n\n", "\n", ".", " ", ""]
# )

In [25]:
# chunked_docs = []

# for doc in documents:
#     chunks = text_splitter.split_text(doc["text"])
#     for idx, chunk in enumerate(chunks):
#         chunked_docs.append({
#             "pdf_name": doc["pdf_name"],
#             "page_number": doc["page_number"],
#             "chunk_index": idx,
#             "chunk_text": chunk
#         })

# print(f"✅ Generated {len(chunked_docs)} total chunks.")

In [26]:
# pprint(chunked_docs[:100])

In [5]:
def _split_pages(documents, text_splitter):
    """Turn page documents into flat chunk records; returns (records, skipped_count)."""
    chunked_docs = []
    skipped = 0
    for doc in documents:
        # A page can legitimately have no extractable text (missing field or None);
        # skip it instead of letting a single page abort the whole run.
        text = doc.get("text")
        if not isinstance(text, str):
            skipped += 1
            continue

        # Identity fields are indexed strictly on purpose: a page lacking them is
        # malformed data and should fail loudly rather than yield anonymous chunks.
        chunked_docs.extend(
            {
                "pdf_name": doc["pdf_name"],
                "page_number": doc["page_number"],
                "chunk_index": idx,
                "chunk_text": chunk,
            }
            for idx, chunk in enumerate(text_splitter.split_text(text))
        )
    return chunked_docs, skipped


def chunking(
    db_name: str = "pdf_rag_db",
    collection_name: str = "pages",
    chunk_size: int = 500,
    chunk_overlap: int = 100
) -> List[Dict]:
    """
    Fetch all PDF pages from MongoDB and return chunked versions of their text.

    Every document in the collection is read, its "text" field is split with a
    RecursiveCharacterTextSplitter, and one record is emitted per chunk. Pages
    whose "text" field is missing or not a string are skipped (and counted in a
    warning line) rather than raising.

    Args:
        db_name: Database name forwarded to get_collection.
        collection_name: Collection name forwarded to get_collection.
        chunk_size: Maximum chunk length handed to the splitter (characters,
            under the splitter's default length function).
        chunk_overlap: Overlap between consecutive chunks handed to the splitter.

    Returns:
        List of dicts like:
        {
            "pdf_name": str,
            "page_number": int,
            "chunk_index": int,   # position of the chunk within its page
            "chunk_text": str
        }

    Raises:
        KeyError: If a page that has text lacks "pdf_name" or "page_number".

    Note:
        Relies on `uri`, get_mongo_client and get_collection already being
        present in the notebook namespace (presumably provided by utils.ipynb).
    """
    # Connect to MongoDB and get the pages collection
    client = get_mongo_client(uri)
    pages_collection = get_collection(client, db_name, collection_name)

    # Fetch all page documents
    documents = list(pages_collection.find({}))
    print(f"✅ Fetched {len(documents)} pages from MongoDB.")

    # Initialize the recursive text splitter; separators go from coarsest
    # (paragraph break) to finest (single character).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", " ", ""]
    )

    # Process and chunk each document
    chunked_docs, skipped = _split_pages(documents, text_splitter)
    if skipped:
        print(f"⚠️ Skipped {skipped} page(s) with no text.")

    print(f"✅ Created {len(chunked_docs)} total chunks.")
    return chunked_docs

In [6]:
# Build the chunk records with the default database, collection and splitter settings.
chunked_docs = chunking()

✅ Fetched 66 pages from MongoDB.
✅ Created 344 total chunks.


In [7]:
# Spot-check a single chunk record to eyeball its fields and text.
chunked_docs[1]

{'pdf_name': 'metformin2.pdf',
 'page_number': 2,
 'chunk_index': 0,
 'chunk_text': '2\nPRODUCT  MONOGRAPH \n \n \n \nTEVA-METFORMIN \n(metformin hydrochloride) \n \n \n500 mg and 850 mg Tablets \n \n \n \nTHERAPEUTIC  CLASSIFICATION \n \nOral Antihyperglycemic Agent \n \n \nACTIONS  AND  CLINICAL  PHARMACOLOGY \n \nTEVA-METFORMIN (metformin hydrochloride) is a biguanide derivative producing an \nantihyperglycemic effect which can only be observed in man or in the diabetic animal and only \nwhen there is insulin secretion. Metformin, at therapeutic doses, does not cause hypoglycemia'}

In [8]:
# Persist the chunk records via the insert_chunks_to_mongo helper (not defined in this notebook).
# NOTE(review): re-running this cell may insert the same chunks again unless the
# helper clears or de-duplicates the target collection — confirm in utils.ipynb.
insert_chunks_to_mongo(chunked_docs)

✅ Inserted 344 chunks into MongoDB collection 'chunks'.
