In [None]:
# First, install our PDF processing toolkit
!pip install pypdf pdfplumber pdf2image pytesseract pillow reportlab

# Import necessary libraries
import os
import pypdf
import json
import pdfplumber
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter

print("PDF processing environment setup complete!")

In [None]:
def create_sample_pdfs(output_dir: str = "rag_sample_data"):
    """
    Create the sample PDFs used throughout the notebook.

    Two single-page, letter-sized files are written into ``output_dir``:

    * ``digital_pdf.pdf``    - a title, two text lines and three bullet points
    * ``structured_pdf.pdf`` - a title and a 4-column sales table laid out
      with plain text at fixed column offsets

    Args:
        output_dir: Directory that receives the PDFs (created if missing).
    """
    os.makedirs(output_dir, exist_ok=True)

    left_margin = 72       # one inch, in PDF points
    line_height = 20
    column_width = 100
    # ReportLab's origin is the bottom-left corner and `letter` is 792 pt
    # tall, so a baseline at y=800 would sit above the top edge of the page.
    # Anchor the first line one inch below the top edge instead.
    top = letter[1] - 72

    # Create a simple digital PDF
    c = canvas.Canvas(os.path.join(output_dir, "digital_pdf.pdf"), pagesize=letter)
    c.drawString(left_margin, top, "Digital PDF Example")
    c.drawString(left_margin, top - line_height,
                 "This is a digital-native PDF with text content.")
    c.drawString(left_margin, top - 2 * line_height, "Created: February 9, 2025")

    # Add some structured content
    y_position = top - 100
    for i in range(1, 4):
        # "\u2022" is the bullet character; the previous literal was a
        # mis-decoded (UTF-8 read as cp1252) copy of it.
        c.drawString(left_margin, y_position,
                     f"\u2022 Point {i}: Sample content for demonstration")
        y_position -= line_height

    c.save()

    # Create a PDF with tables and structured data
    c = canvas.Canvas(os.path.join(output_dir, "structured_pdf.pdf"), pagesize=letter)
    c.drawString(left_margin, top, "Structured PDF Example")

    # Header row first, then the data rows, one line apart
    table_rows = [
        ["Product", "Q1 Sales", "Q2 Sales", "Growth"],
        ["Product A", "$10,000", "$12,500", "+25%"],
        ["Product B", "$8,000", "$9,600", "+20%"],
        ["Product C", "$15,000", "$16,500", "+10%"],
    ]

    y_position = top - 50
    for row in table_rows:
        for idx, cell in enumerate(row):
            c.drawString(left_margin + idx * column_width, y_position, cell)
        y_position -= line_height

    c.save()

# Create our sample PDFs
create_sample_pdfs()

# Verify creation
print("Created PDFs:")
!ls -l rag_sample_data/*.pdf

In [4]:
class EnhancedPDFLoader:
    """
    A comprehensive PDF loader that can handle various types of PDF documents.
    It combines multiple approaches to ensure reliable text extraction:
    pypdf for plain text and metadata, pdfplumber for positioned words and
    tables, and pdf2image + pytesseract as an OCR fallback.
    """

    # Tesseract settings used by perform_ocr(). Override them on a subclass
    # or an instance to change the languages / segmentation mode.
    OCR_LANGUAGES = 'eng+fra+deu+spa+ita'  # Support multiple languages
    OCR_CONFIG = '--psm 1'  # Automatic page segmentation with OSD

    def __init__(self, file_path: str, use_ocr: bool = False):
        """
        Initialize the PDF loader with configuration options.
        Args:
            file_path: Path to the PDF file
            use_ocr: Whether to use OCR for text extraction (helpful for scanned docs)
        """
        self.file_path = file_path
        self.use_ocr = use_ocr
        self.metadata = {}

    def extract_metadata(self) -> dict:
        """
        Extract useful information about the PDF document.
        This helps us understand the document's properties and origin.
        Returns:
            Dictionary with the keys 'title', 'author', 'creation_date',
            'page_count', 'file_size' (bytes) and 'is_encrypted'. The three
            text fields are empty strings when the document has no info
            dictionary or lacks the entry.
        """
        with open(self.file_path, 'rb') as file:
            reader = pypdf.PdfReader(file)
            # reader.metadata may be None; an empty dict gives the same
            # '' defaults without repeating the None check per field.
            info = reader.metadata or {}

            # Collect comprehensive metadata
            return {
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'creation_date': info.get('/CreationDate', ''),
                'page_count': len(reader.pages),
                'file_size': os.path.getsize(self.file_path),
                'is_encrypted': reader.is_encrypted
            }

    def extract_text_with_layout(self, page: int) -> dict:
        """
        Extract text while preserving its position and formatting on the page.
        Args:
            page: Page number to process (0-based index)
        Returns:
            Dictionary containing text elements with their positions
            ('words'), any detected 'tables', and the page dimensions
            ('page_height', 'page_width')
        """
        with pdfplumber.open(self.file_path) as pdf:
            pdf_page = pdf.pages[page]

            # Extract words with their positions
            words_with_positions = pdf_page.extract_words(
                keep_blank_chars=True,
                x_tolerance=3,  # How far apart words can be horizontally
                y_tolerance=3   # How far apart words can be vertically
            )

            # Extract tables if present
            tables = pdf_page.extract_tables()

            return {
                'words': words_with_positions,
                'tables': tables,
                'page_height': pdf_page.height,
                'page_width': pdf_page.width
            }

    def perform_ocr(self, page_number: int) -> str:
        """
        Use Optical Character Recognition to extract text from scanned pages.
        This is our fallback method when regular text extraction fails.
        Args:
            page_number: The page to process (0-based index)
        Returns:
            Extracted text from the page image ('' if no image was rendered)
        Raises:
            ImportError: If pdf2image or pytesseract is not installed
        """
        # The OCR stack is only needed on this code path, so import it here:
        # the rest of the loader keeps working without it, and a missing
        # package surfaces as a clear ImportError instead of a NameError.
        import pdf2image
        import pytesseract

        # Convert PDF page to image (pdf2image page numbers are 1-based)
        images = pdf2image.convert_from_path(
            self.file_path,
            first_page=page_number + 1,
            last_page=page_number + 1
        )
        if not images:
            return ''

        # Extract text from the image using OCR
        return pytesseract.image_to_string(
            images[0],
            lang=self.OCR_LANGUAGES,
            config=self.OCR_CONFIG
        )

    def load(self) -> tuple[list, dict]:
        """
        Process the PDF document, combining all our extraction methods.
        This is our main method that orchestrates the entire extraction process.
        Returns:
            Tuple containing:
            - List of dictionaries with page content ('page_number' is
              1-based; also 'text', 'layout' and 'has_images')
            - Document metadata
        """
        self.metadata = self.extract_metadata()
        pages_content = []

        with open(self.file_path, 'rb') as file:
            reader = pypdf.PdfReader(file)

            for page_num, pdf_page in enumerate(reader.pages):
                # Try normal text extraction first; normalise a missing
                # result to '' so the emptiness check below cannot fail.
                text = pdf_page.extract_text() or ''

                # If text extraction fails or OCR is requested, use OCR
                if self.use_ocr or not text.strip():
                    text = self.perform_ocr(page_num)

                # Combine all information for this page
                pages_content.append({
                    'page_number': page_num + 1,
                    'text': text,
                    'layout': self.extract_text_with_layout(page_num),
                    'has_images': bool(pdf_page.images)
                })

        return pages_content, self.metadata

In [None]:
def test_pdf_processing():
    """Run the loader end-to-end on the digital sample and print a summary."""
    # Load every page plus the document metadata in one call
    pages, doc_metadata = EnhancedPDFLoader("rag_sample_data/digital_pdf.pdf").load()

    print("Processing digital PDF:")
    print("\nMetadata:")
    print(json.dumps(doc_metadata, indent=2))

    print("\nFirst page content preview:")
    if not pages:
        # Nothing to preview for a document without pages
        return

    first_page = pages[0]
    print(f"Text extract (first 200 chars): {first_page['text'][:200]}")
    print(f"Page has images: {first_page['has_images']}")
    positioned_words = first_page['layout']['words']
    print(f"Layout elements: {len(positioned_words)} words found")

# Run the test
test_pdf_processing()

**Best Practices for PDF Processing**

In [None]:
def demonstrate_metadata_first():
    """Inspect a document's metadata before deciding how to process it."""
    large_file_threshold = 10_000_000  # 10MB

    doc_info = EnhancedPDFLoader("rag_sample_data/digital_pdf.pdf").extract_metadata()

    # Report the properties that drive the processing strategy
    print("Document Properties:")
    print(f"- Number of pages: {doc_info['page_count']}")
    print(f"- File size: {doc_info['file_size']} bytes")
    print(f"- Is encrypted: {doc_info['is_encrypted']}")

    # Encrypted input needs a password; very large input is better batched
    needs_password = doc_info['is_encrypted']
    if needs_password:
        print("Document is encrypted - need password handling")

    is_large = doc_info['file_size'] > large_file_threshold
    if is_large:
        print("Large document - consider batch processing")

# Test metadata-first approach
demonstrate_metadata_first()

In [None]:
def test_extraction_methods():
    """Compare plain pypdf text extraction with layout-aware extraction."""
    pdf_loader = EnhancedPDFLoader("rag_sample_data/digital_pdf.pdf")

    # Plain extraction: page 0 text straight from pypdf
    with open(pdf_loader.file_path, 'rb') as handle:
        first_page = pypdf.PdfReader(handle).pages[0]
        plain_text = first_page.extract_text()

    # Layout-aware extraction: positioned words and tables via the loader
    layout = pdf_loader.extract_text_with_layout(0)

    print("Extraction Results Comparison:")
    print("\nBasic Extraction:")
    print(plain_text[:200])
    print("\nLayout-Aware Extraction:")
    print(f"Found {len(layout['words'])} positioned words")

    detected_tables = layout['tables']
    if detected_tables:
        print(f"Found {len(detected_tables)} tables")

# Test different extraction methods
test_extraction_methods()

In [None]:
def demonstrate_structure_preservation():
    """Print page dimensions and the positions of the first few words."""
    preview_count = 5  # how many positioned words to show

    pages, _ = EnhancedPDFLoader("rag_sample_data/structured_pdf.pdf").load()
    if not pages:
        return

    first_layout = pages[0]['layout']

    print("Document Structure Analysis:")
    print(f"Page dimensions: {first_layout['page_width']}x{first_layout['page_height']}")
    print("\nText Positions:")

    # Each entry carries the word text plus its left edge (x0) and top offset
    leading_words = first_layout['words'][:preview_count]
    for entry in leading_words:
        print(f"Word: {entry['text']}, Position: ({entry['x0']}, {entry['top']})")

# Test structure preservation
demonstrate_structure_preservation()

In [None]:
def demonstrate_error_handling():
    """
    Show robust error handling practices.

    Runs the loader against three inputs - a valid PDF, a path that does not
    exist, and a zero-byte file - and reports the outcome of each without
    letting one failure stop the others.
    """
    empty_path = "rag_sample_data/empty.pdf"

    # The empty-file case only demonstrates something if the file exists;
    # otherwise it is just a second "file not found". Append mode creates the
    # file when missing and never touches the content of an existing one.
    os.makedirs(os.path.dirname(empty_path), exist_ok=True)
    with open(empty_path, 'ab'):
        pass

    test_files = [
        "rag_sample_data/digital_pdf.pdf",  # Should work
        "non_existent.pdf",                 # Missing file
        empty_path                          # Empty file
    ]

    for file_path in test_files:
        try:
            loader = EnhancedPDFLoader(file_path)
            content, metadata = loader.load()
        except FileNotFoundError:
            print(f"\nFile not found: {file_path}")
        except Exception as e:
            # Top-level boundary of the demo: report any parser/IO failure
            # and carry on with the next file.
            print(f"\nError processing {file_path}: {str(e)}")
        else:
            print(f"\nSuccessfully processed: {file_path}")
            print(f"Pages: {metadata['page_count']}")

# Test error handling
demonstrate_error_handling()

In [None]:
def monitor_performance():
    """
    Demonstrate performance monitoring.

    Times a full ``load()`` of the digital sample PDF and prints the total
    duration, the page count and the average time per page.
    """
    import time

    loader = EnhancedPDFLoader("rag_sample_data/digital_pdf.pdf")

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()
    content, metadata = loader.load()
    processing_time = time.perf_counter() - start_time

    page_count = metadata['page_count']

    print("Processing Performance:")
    print(f"Total time: {processing_time:.2f} seconds")
    print(f"Pages processed: {page_count}")
    if page_count:
        print(f"Average time per page: {processing_time/page_count:.2f} seconds")
    else:
        # A document without pages has no meaningful per-page average
        print("Average time per page: n/a (document has no pages)")

# Monitor processing performance
monitor_performance()

**Practical Exercises**

In [None]:
def exercise_1_advanced_pdf_creation():
    """Build a PDF whose paragraphs use several different text styles."""

    from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
    from reportlab.platypus import SimpleDocTemplate, Paragraph
    from reportlab.lib import colors

    document = SimpleDocTemplate("rag_sample_data/styled_pdf.pdf", pagesize=letter)
    sheet = getSampleStyleSheet()

    # Derived style: same as 'Normal' but blue and slightly larger
    blue_large = ParagraphStyle(
        'CustomStyle',
        parent=sheet['Normal'],
        textColor=colors.blue,
        fontSize=14
    )

    # Flowables in reading order: title, body text, heading, custom-styled text
    flowables = [
        Paragraph("Document with Different Styles", sheet['Title']),
        Paragraph("This is normal text in the default font.", sheet['Normal']),
        Paragraph("This is a Heading", sheet['Heading1']),
        Paragraph("This text is blue and larger.", blue_large),
    ]

    # Render everything into the PDF file
    document.build(flowables)
    print("Created styled PDF document")

# Run the exercise
exercise_1_advanced_pdf_creation()

In [None]:
def exercise_2_table_extraction():
    """Extract the tables on the structured sample's first page and print them."""

    class TableExtractor(EnhancedPDFLoader):
        """Loader extension that returns whitespace-trimmed table cells."""

        def extract_tables_from_page(self, page_num: int) -> list:
            """Return the tables found on page ``page_num`` (0-based), cleaned up."""
            with pdfplumber.open(self.file_path) as pdf:
                raw_tables = pdf.pages[page_num].extract_tables()

                cleaned_tables = []
                for raw_table in raw_tables:
                    cleaned_rows = []
                    for raw_row in raw_table:
                        # Empty/None cells become '', everything else is trimmed
                        cleaned_rows.append(
                            [cell.strip() if cell else '' for cell in raw_row]
                        )
                    cleaned_tables.append(cleaned_rows)

                return cleaned_tables

    # Try it on the structured sample document
    table_loader = TableExtractor("rag_sample_data/structured_pdf.pdf")
    found_tables = table_loader.extract_tables_from_page(0)

    print("Extracted Tables:")
    for number, found_table in enumerate(found_tables, start=1):
        print(f"\nTable {number}:")
        for table_row in found_table:
            print(table_row)

# Run the exercise
exercise_2_table_extraction()

In [None]:
def exercise_3_password_handling():
    """
    Handle encrypted PDFs.

    Defines ``EncryptedPDFLoader``, an ``EnhancedPDFLoader`` that accepts a
    password, and returns the class so it can be used outside this function.
    """
    import tempfile

    class EncryptedPDFLoader(EnhancedPDFLoader):
        """EnhancedPDFLoader variant that can open password-protected files."""

        def __init__(self, file_path: str, password: str = None):
            super().__init__(file_path)
            self.password = password

        @staticmethod
        def _write_decrypted_copy(reader) -> str:
            """Write the (already decrypted) reader's pages to a temp PDF; return its path."""
            writer = pypdf.PdfWriter()
            for page in reader.pages:
                writer.add_page(page)

            info = reader.metadata
            if info:
                # Carry title/author/dates over so the extracted metadata
                # still describes the original document.
                writer.add_metadata({key: str(info[key]) for key in info})

            with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
                writer.write(tmp)
            return tmp.name

        def load(self) -> tuple[list, dict]:
            """
            Load the document, decrypting it first when necessary.
            Returns:
                (pages, metadata) as for EnhancedPDFLoader.load(), or
                (None, None) after printing the error if loading fails
                (missing/incorrect password, unreadable file, ...).
            """
            try:
                with open(self.file_path, 'rb') as file:
                    reader = pypdf.PdfReader(file)

                    if not reader.is_encrypted:
                        # Continue with normal loading process
                        return super().load()

                    if not self.password:
                        raise ValueError("PDF is encrypted and no password provided")
                    if not reader.decrypt(self.password):
                        raise ValueError("Incorrect password")

                    # The base loader opens its own readers, which know
                    # nothing about the decryption done on `reader` above.
                    # Hand it a decrypted temporary copy instead.
                    decrypted_path = self._write_decrypted_copy(reader)

                try:
                    pages, metadata = EnhancedPDFLoader(
                        decrypted_path, use_ocr=self.use_ocr
                    ).load()
                finally:
                    os.remove(decrypted_path)

                # Report the properties of the original file, not the copy
                metadata['is_encrypted'] = True
                metadata['file_size'] = os.path.getsize(self.file_path)
                self.metadata = metadata
                return pages, metadata

            except Exception as e:
                print(f"Error loading encrypted PDF: {str(e)}")
                return None, None

    return EncryptedPDFLoader

# Create and test an encrypted PDF
def test_encrypted_pdf():
    """Placeholder for the encrypted-PDF test: only prints a status line."""
    status_message = "Encrypted PDF handling implemented"
    print(status_message)

# Run the exercise
test_encrypted_pdf()

In [None]:
def exercise_4_image_extraction():
    """Extract and process images from PDFs"""

    class ImageExtractor(EnhancedPDFLoader):
        """Loader extension that lists the images embedded in a page."""

        def extract_images_from_page(self, page_num: int) -> list:
            """
            Extract images from a specific page.
            Args:
                page_num: Page to inspect (0-based index)
            Returns:
                One dictionary per image with its 'name', 'type' and
                'size' (length of the image data in bytes)
            """
            images = []
            with open(self.file_path, 'rb') as file:
                reader = pypdf.PdfReader(file)
                page = reader.pages[page_num]

                for image_file_object in page.images:
                    name = image_file_object.name
                    # NOTE(review): pypdf image objects do not appear to
                    # expose a `type` attribute - confirm against the
                    # installed version. Use it when present, otherwise
                    # fall back to the extension of the image name.
                    image_type = getattr(image_file_object, 'type', None)
                    if not image_type:
                        image_type = os.path.splitext(name)[1].lstrip('.')

                    images.append({
                        'name': name,
                        'type': image_type,
                        'size': len(image_file_object.data)
                    })

            return images

    # Test image extraction
    extractor = ImageExtractor("rag_sample_data/digital_pdf.pdf")
    images = extractor.extract_images_from_page(0)

    print("Images found in PDF:")
    for idx, image in enumerate(images, start=1):
        print(f"\nImage {idx}:")
        print(f"Name: {image['name']}")
        print(f"Type: {image['type']}")
        print(f"Size: {image['size']} bytes")

# Run the exercise
exercise_4_image_extraction()