In [42]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
)

In [43]:
# Load the sample PDF with PyPDFLoader and preview the first page.
try:
    pypdf_loader = PyPDFLoader('data/pdf/DeepLearning_AI-Playbook_v6.pdf')
    pypdf_doc = pypdf_loader.load()
    print(f'Loaded {len(pypdf_doc)} pages using PyPDFLoader')
    # Only preview when something was loaded; indexing an empty list would
    # raise IndexError and be reported below as a confusing generic error.
    if pypdf_doc:
        print(f'pypdf_doc {pypdf_doc[0].metadata}')
        print('----')
        print(f'pypdf_doc {pypdf_doc[0].page_content[:200]}...')
except Exception as e:
    # Demo cell: report the failure instead of stopping the notebook.
    print(f"Error: {e}")

Loaded 6 pages using PyPDFLoader
pypdf_doc {'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.2 (Macintosh)', 'creationdate': '2024-02-23T14:55:51-05:00', 'moddate': '2024-02-23T14:57:41-05:00', 'trapped': '/False', 'source': 'data/pdf/DeepLearning_AI-Playbook_v6.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1'}
----
pypdf_doc © DeepLearning.AI  All rights reserved. | deeplearning.ai 1
AI (Artificial Intelligence) technology is now 
poised to transform every industry, just as 
electricity did 100 years ago. Between now and ...


In [44]:
# Load the same PDF with PyMuPDFLoader so its output can be compared with
# the PyPDFLoader result.
try:
    pymupdf_loader = PyMuPDFLoader('data/pdf/DeepLearning_AI-Playbook_v6.pdf')
    pymupdf_doc = pymupdf_loader.load()
    print(f'Loaded {len(pymupdf_doc)} pages using PyMuPDFLoader')
    # Only preview when something was loaded; indexing an empty list would
    # raise IndexError and be reported below as a confusing generic error.
    if pymupdf_doc:
        print(f'pymupdf_doc {pymupdf_doc[0].metadata}')
        print('----')
        print(f'pymupdf_doc {pymupdf_doc[0].page_content[:200]}...')
except Exception as e:
    # Demo cell: report the failure instead of stopping the notebook.
    print(f'Error: {e}')

Loaded 6 pages using PyMuPDFLoader
pymupdf_doc {'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 19.2 (Macintosh)', 'creationdate': '2024-02-23T14:55:51-05:00', 'source': 'data/pdf/DeepLearning_AI-Playbook_v6.pdf', 'file_path': 'data/pdf/DeepLearning_AI-Playbook_v6.pdf', 'total_pages': 6, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-02-23T14:57:41-05:00', 'trapped': '', 'modDate': "D:20240223145741-05'00'", 'creationDate': "D:20240223145551-05'00'", 'page': 0}
----
pymupdf_doc © DeepLearning.AI  All rights reserved. | deeplearning.ai
1
AI (Artificial Intelligence) technology is now 
poised to transform every industry, just as 
electricity did 100 years ago. Between now and ...


## Handling PDF Challenges

In [45]:
# Sample paragraph with line breaks and stray spaces, as extracted text often
# looks. Splitting on any whitespace and re-joining with single spaces
# collapses it onto one line.
text = " ".join(
    """In publishing and graphic design, Lorem ipsum is a placeholder text commonly used to demonstrate the visual
 form of a document or a typeface without relying on meaningful content. 
 Lorem ipsum may be used as a placeholder before the final copy is available.""".split()
)


In [46]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [47]:
# Sample text with blank lines and irregular spacing between sentences.
# NOTE(review): the name looks like a typo for "raw_pdf_text"; it is kept
# unchanged because other cells may reference it.
rew_pdf_text = """In publishing and graphic design, Lorem ipsum is a placeholder text commonly used to demonstrate the visual


 form of a document or a typeface without relying on meaningful content.

 
 Lorem ipsum may be used as a placeholder before the final copy is available."""


def clean_text(text):
    """Return ``text`` with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is dropped as well, because
    ``str.split()`` with no arguments discards it.
    """
    words = text.split()
    return " ".join(words)


cleaned_text = clean_text(rew_pdf_text)
print(cleaned_text)

In publishing and graphic design, Lorem ipsum is a placeholder text commonly used to demonstrate the visual form of a document or a typeface without relying on meaningful content. Lorem ipsum may be used as a placeholder before the final copy is available.


In [52]:
from langchain_core.documents import Document
from typing import List

class SmartPDFProcessor:
    """Load a PDF, clean each page's text, and split it into chunks.

    Every chunk carries the page's loader metadata plus a few extra fields
    (see ``process_pdf``). No exceptions are caught here: anything raised
    while loading or splitting propagates to the caller.

    Args:
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.
        min_page_chars: Pages whose cleaned text is shorter than this many
            characters are skipped. Defaults to 50.
    """

    # Typographic ligature code points that can survive PDF text extraction,
    # mapped to the plain letters they stand for.
    _LIGATURES = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "st",
        "\ufb06": "st",
    })

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # _clean_text collapses all whitespace to single spaces, so a
            # space is the only separator left in the text being split.
            separators=[" "]
        )

    def process_pdf(self, pdf_path: str) -> "List[Document]":
        """Load ``pdf_path`` and return the chunks of all non-trivial pages.

        Args:
            pdf_path: Path of the PDF, passed to ``PyPDFLoader``.

        Returns:
            A list of chunk documents in page order. Each chunk's metadata is
            the page's own metadata overlaid with:

            * ``page``: 1-based position of the page in the loaded list
            * ``total_pages``: number of pages the loader returned
            * ``chunk_method``: the constant ``"smart_pdf_processor"``
            * ``char_count``: length of the whole cleaned page text (not of
              the individual chunk)
        """
        # Load PDF
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        processed_chunks = []

        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages. The cleaned text has no leading or
            # trailing whitespace, so its length can be compared directly.
            if len(cleaned_text) < self.min_page_chars:
                continue

            # Keys listed after **page.metadata override the loader's values.
            page_metadata = {
                **page.metadata,
                "page": page_num,
                "total_pages": total_pages,
                "chunk_method": "smart_pdf_processor",
                "char_count": len(cleaned_text),
            }

            processed_chunks.extend(
                self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[page_metadata],
                )
            )

        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace and expand ligature characters.

        Args:
            text: Raw text of one page.

        Returns:
            ``text`` with every run of whitespace replaced by a single space,
            no leading or trailing whitespace, and each ligature listed in
            ``_LIGATURES`` replaced by its plain letters.
        """
        # Remove excessive whitespace
        text = " ".join(text.split())

        # Fix common PDF extraction issues (ligature glyphs)
        return text.translate(self._LIGATURES)

    
              


In [53]:
# Build the processor with its default settings (chunk_size=1000, chunk_overlap=100).
preprocessor = SmartPDFProcessor()

In [54]:
# Bare expression: the notebook displays the object's default repr.
preprocessor

<__main__.SmartPDFProcessor at 0x242d8733800>

In [56]:
# Chunk the sample PDF and show the metadata attached to the first chunk.
try:
    smart_chunks = preprocessor.process_pdf('data/pdf/DeepLearning_AI-Playbook_v6.pdf')
    print(f'Generated {len(smart_chunks)} smart chunks from PDF')

    # process_pdf skips nearly empty pages, so the result can be empty;
    # indexing it unguarded would raise IndexError and be reported below
    # as a confusing generic error.
    if smart_chunks:
        for key, value in smart_chunks[0].metadata.items():
            print(f'{key}: {value}')
except Exception as e:
    # Demo cell: report the failure instead of stopping the notebook.
    print(f"Error: {e}")

Generated 24 smart chunks from PDF
producer: Adobe PDF Library 17.0
creator: Adobe InDesign 19.2 (Macintosh)
creationdate: 2024-02-23T14:55:51-05:00
moddate: 2024-02-23T14:57:41-05:00
trapped: /False
source: data/pdf/DeepLearning_AI-Playbook_v6.pdf
total_pages: 6
page: 1
page_label: 1
chunk_method: smart_pdf_processor
char_count: 2955
