### RAG PIPELINES

In [4]:
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from pathlib import Path

In [5]:
### Read all the PDFs inside the directory

def process_all_pdfs(pdf_directory):
    """Load every PDF found under ``pdf_directory`` into a flat list of documents.

    The directory is searched recursively for ``*.pdf`` files. Each file is
    opened with ``PyMuPDFLoader`` and every document returned by
    ``loader.load()`` gets two extra metadata keys:

    * ``source_file`` -- the PDF's file name (without its directory)
    * ``file_type``   -- always ``'pdf'``

    Files are processed in sorted path order so the order of the returned
    documents is reproducible; raw directory-listing order depends on the
    filesystem.

    Args:
        pdf_directory: Directory to scan (``str`` or ``Path``).

    Returns:
        list: The documents of all PDFs that loaded successfully. Empty when
        the directory does not exist or contains no PDFs. A file that fails
        to load is reported on stdout and skipped; it never aborts the run.
    """
    all_documents = []
    pdf_dir = Path(pdf_directory)

    if pdf_dir.is_dir():
        # Find all the PDF files "**/*.pdf"; sorted for a deterministic order.
        pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
    else:
        # A mistyped path would otherwise look exactly like an empty folder.
        print(f" Warning: {pdf_dir} is not an existing directory")
        pdf_files = []

    print(f"Found {len(pdf_files)} PDF Files to Process")

    for pdf_file in pdf_files:
        print(f"\n processing : {pdf_file.name}")

        try:
            pages = PyMuPDFLoader(str(pdf_file)).load()

            # Add source information to metadata
            for page in pages:
                page.metadata['source_file'] = pdf_file.name
                page.metadata['file_type'] = 'pdf'

            all_documents.extend(pages)
            print(f" Loaded {len(pages)} pages")

        except Exception as e:
            # Best effort: one unreadable PDF must not stop the remaining files.
            print(f" Error: {e}")

    print(f"\nTotal documents loaded : {len(all_documents)}")

    return all_documents


In [6]:
# Recursively load every PDF under ../data into one flat list of documents
# (one entry per PDF page, each tagged with source_file / file_type metadata).
all_pdf_documents = process_all_pdfs("../data")

Found 3 PDF Files to Process

 processing : 1Notes.pdf
 Loaded 2 pages

 processing : 2Notes.pdf
 Loaded 4 pages

 processing : 3notes.pdf
 Loaded 5 pages

Total documents loaded : 11


In [7]:
# Display the loaded documents (metadata + page_content) to sanity-check the load.
# NOTE(review): this dumps the whole list; consider slicing if the corpus grows.
all_pdf_documents

[Document(metadata={'producer': 'Skia/PDF m137 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\1Notes.pdf', 'file_path': '..\\data\\pdf\\1Notes.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': 'Untitled document', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': '1Notes.pdf', 'file_type': 'pdf'}, page_content='Lecture 1 Notes :\u200b\n\u200b\n1.\u2060 \u2060What is Low‑Level Design (LLD)? \n \nDefinition: Designing the internal structure (“skeleton”) of an application by identifying \nclasses/objects, their relationships, data flows, and how DSA solutions plug into this \nstructure. \n●\u200b DSA: Solves isolated problems (e.g. “find shortest path in an array/graph”) using \nalgorithms like binary search, quicksort, Dijkstra’s, heaps, etc. \n●\u200b LLD: Determines which objects exist in the system and how they interact, then \napplies DSA inside that structur

In [8]:
### Text splitting: break the loaded documents into chunks

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into smaller chunks for better RAG performance.

    Args:
        documents: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The list of chunks produced by the splitter. A summary line is
        printed, and, when at least one chunk exists, a preview of the first
        chunk (first 200 characters of content plus its metadata).
    """
    splitter = RecursiveCharacterTextSplitter(
        # Separators are listed coarsest first: paragraph, line, word, character.
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    chunks = splitter.split_documents(documents)

    print(f"Split {len(documents)} documents into {len(chunks)} chunks")

    # Nothing to preview when the splitter produced no chunks.
    if not chunks:
        return chunks

    # Show example of a chunk
    first_chunk = chunks[0]
    print("\nExample chunk:")
    print(f"Content: {first_chunk.page_content[:200]}...")
    print(f"Metadata: {first_chunk.metadata}")
    return chunks

In [9]:
# Chunk the loaded pages with the default settings (chunk_size=1000,
# chunk_overlap=200), then display the resulting chunks.
chunks = split_documents(all_pdf_documents)
chunks

Split 11 documents into 18 chunks

Example chunk:
Content: Lecture 1 Notes :​
​
1.⁠ ⁠What is Low‑Level Design (LLD)? 
 
Definition: Designing the internal structure (“skeleton”) of an application by identifying 
classes/objects, their relationships, data flow...
Metadata: {'producer': 'Skia/PDF m137 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\1Notes.pdf', 'file_path': '..\\data\\pdf\\1Notes.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': 'Untitled document', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': '1Notes.pdf', 'file_type': 'pdf'}


[Document(metadata={'producer': 'Skia/PDF m137 Google Docs Renderer', 'creator': '', 'creationdate': '', 'source': '..\\data\\pdf\\1Notes.pdf', 'file_path': '..\\data\\pdf\\1Notes.pdf', 'total_pages': 2, 'format': 'PDF 1.4', 'title': 'Untitled document', 'author': '', 'subject': '', 'keywords': '', 'moddate': '', 'trapped': '', 'modDate': '', 'creationDate': '', 'page': 0, 'source_file': '1Notes.pdf', 'file_type': 'pdf'}, page_content='Lecture 1 Notes :\u200b\n\u200b\n1.\u2060 \u2060What is Low‑Level Design (LLD)? \n \nDefinition: Designing the internal structure (“skeleton”) of an application by identifying \nclasses/objects, their relationships, data flows, and how DSA solutions plug into this \nstructure. \n●\u200b DSA: Solves isolated problems (e.g. “find shortest path in an array/graph”) using \nalgorithms like binary search, quicksort, Dijkstra’s, heaps, etc. \n●\u200b LLD: Determines which objects exist in the system and how they interact, then \napplies DSA inside that structur