### RAG Pipeline- Data Ingestion to Vector DB Pipeline

In [1]:
import os
from langchain_community.document_loaders import PyPDFLoader, PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path


In [2]:
### Read all PDF inside the directory
def process_all_pdfs(pdf_directory):
    """ Process all PDF files in a directory """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    # Find all PDF files recursively
    pdf_files = list(pdf_dir.glob("**/*.pdf"))

    print(f"Found {len(pdf_files)} PDF files to Process")

    for pdf_file in pdf_files:
        print(f"\nProcessing: {pdf_file.name}")
        try:
            loader = PyPDFLoader(str(pdf_file))
            documents = loader.load()

            # Add source information to metadata
            for doc in documents:
                doc.metadata['source_file'] = pdf_file.name
                doc.metadata['file_type'] = 'pdf'
            
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages")

        except Exception as e:
            print(f"X Error: {e}")

    print(f"\nTotal documents loaded: {len(all_documents)}")
    return all_documents

all_pdf_documents = process_all_pdfs("../data/pdf")

Found 2 PDF files to Process

Processing: jobData.pdf
Loaded 1 pages

Processing: Sandra-Ubenyi.pdf
Loaded 2 pages

Total documents loaded: 3


In [4]:
### Text splitting getting to chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    ### Split documents into smaller chunks for better RAG performance ###
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size= chunk_size,
        chunk_overlap= chunk_overlap,
        length_function= len,
        separators= ["\n\n", "\n", " ", ""]
    )

    split_docs = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(split_docs)} chunks")

    if split_docs:
        print(f"\n Example chunk: ")
        print(f"Content: {split_docs[0].page_content[:200]}...")
        print(f"Metadata: {split_docs[0].metadata}")
        
    return split_docs

In [7]:
chunks= split_documents(all_pdf_documents)

Split 3 documents into 5 chunks

 Example chunk: 
Content: Greg Udogu
SQL Database Engineer
Company: Bootspace Technology
Start Date: Mon Jul 14 2025
End Date: Tue Nov 18 2025
Job Type: Gig
Work Mode: Remote
Skills & Requirements:
SQL, Database Design...
Metadata: {'producer': 'PDFKit', 'creator': 'PDFKit', 'creationdate': '2025-07-15T15:50:37+00:00', 'source': '../data/pdf/jobData.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'jobData.pdf', 'file_type': 'pdf'}


In [8]:
chunks

[Document(metadata={'producer': 'PDFKit', 'creator': 'PDFKit', 'creationdate': '2025-07-15T15:50:37+00:00', 'source': '../data/pdf/jobData.pdf', 'total_pages': 1, 'page': 0, 'page_label': '1', 'source_file': 'jobData.pdf', 'file_type': 'pdf'}, page_content='Greg Udogu\nSQL Database Engineer\nCompany: Bootspace Technology\nStart Date: Mon Jul 14 2025\nEnd Date: Tue Nov 18 2025\nJob Type: Gig\nWork Mode: Remote\nSkills & Requirements:\nSQL, Database Design'),
 Document(metadata={'producer': 'Apache FOP Version 2.3', 'creator': 'Indeed Resume', 'creationdate': '2025-11-28T07:41:18-06:00', 'title': 'Indeed Resume', 'author': 'Indeed', 'keywords': 'Indeed Resume', 'source': '../data/pdf/Sandra-Ubenyi.pdf', 'total_pages': 2, 'page': 0, 'page_label': '1', 'source_file': 'Sandra-Ubenyi.pdf', 'file_type': 'pdf'}, page_content='SANDRA UBENYI\nPort Harcourt, Rivers\nubenyisandra@gmail.com\n• Detail-oriented administrative professional with 4+ years’ experience providing scheduling, client\ncommun