### RAG Pipelines: Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [7]:
def process_all_pdf(pdf_directory):
    """Load every PDF found under a directory into a flat list of documents.

    The directory is searched recursively. Files are processed in sorted path
    order so repeated runs yield the documents in the same order, and the
    ``.pdf`` extension is matched case-insensitively so that ``report.PDF`` is
    also picked up on case-sensitive file systems.

    Args:
        pdf_directory: Directory to scan (anything ``pathlib.Path`` accepts).

    Returns:
        A list with the documents returned by ``PyMuPDFLoader(...).load()`` for
        every file that loaded successfully (the progress messages report them
        as pages). Each document's metadata gains ``source_file`` (the PDF's
        file name) and ``file_type`` (``"pdf"``). Files that fail to load are
        reported and skipped; a missing directory yields an empty list.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    if pdf_dir.is_dir():
        # Sort for a reproducible ingestion order: directory iteration order is
        # file-system dependent. Compare the lowercased name so the extension
        # match does not depend on the platform's case sensitivity.
        pdf_files = sorted(
            path
            for path in pdf_dir.rglob("*")
            if path.is_file() and path.name.lower().endswith(".pdf")
        )
    else:
        # Make a wrong path visible instead of looking like an empty folder.
        print(f"Directory not found: {pdf_dir}")
        pdf_files = []

    print(f"Found {len(pdf_files)} PDF files to process.")

    for pdf_file in pdf_files:
        print(f"\n Processing file: {pdf_file.name}")
        try:
            loader = PyMuPDFLoader(str(pdf_file))
            documents = loader.load()

            # Tag each document so chunks can be traced back to their file.
            for doc in documents:
                doc.metadata["source_file"] = pdf_file.name
                doc.metadata["file_type"] = "pdf"

            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages from {pdf_file.name}")

        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error loading {pdf_file.name}: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

# Ingest every PDF under ../data/pdf (a relative path, so it depends on the kernel's working directory).
all_pdf_documents = process_all_pdf("../data/pdf")



Found 1 PDF files to process.

 Processing file: agent_ppt.pdf
Loaded 43 pages from agent_ppt.pdf

Total documents loaded: 43


In [5]:
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Break documents into overlapping chunks sized for retrieval.

    Args:
        documents: Documents to split; passed straight to the splitter's
            ``split_documents`` method.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between chunks, in the same units.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
        A summary line is printed, plus a preview of the first chunk when
        at least one chunk exists.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separators are listed from paragraph breaks down to single characters.
        separators=["\n\n", "\n", " ", ""],
    )
    chunked = RecursiveCharacterTextSplitter(**splitter_options).split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunked)} chunks.")

    # Nothing to preview when the splitter produced no chunks.
    if not chunked:
        return chunked

    first = chunked[0]
    print("Sample chunk")
    print(f"Content: {first.page_content[:200]}....")
    print(f"Metadata: {first.metadata}")
    return chunked


In [8]:
# Chunk the loaded PDF documents using split_documents' default chunk_size=1000 and chunk_overlap=200.
chunks = split_documents(all_pdf_documents)

Split 43 documents into 46 chunks.
Sample chunk
Content: LangChain Essentials
1....
Metadata: {'producer': '', 'creator': 'Google', 'creationdate': '', 'source': '..\\data\\pdf\\agent_ppt.pdf', 'file_path': '..\\data\\pdf\\agent_ppt.pdf', 'total_pages': 43, 'format': 'PDF 1.4', 'title': 'LangChain V1 Essentials', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': 'agent_ppt.pdf', 'file_type': 'pdf'}
