In [65]:
import os
import fitz  # PyMuPDF
import pdfplumber
from pdfminer.high_level import extract_text
from langchain.schema import Document
from langchain.document_loaders.base import BaseLoader
from langchain.chat_models import AzureChatOpenAI
from langchain.schema.messages import HumanMessage
import base64
from typing import List, Dict, Any, Optional

class HybridPDFLoader(BaseLoader):
    """Load a PDF into one LangChain ``Document`` per page.

    Three libraries are combined, each for the part it is used for here:

    * pdfminer (``extract_text``) for the page's plain text,
    * pdfplumber for word bounding boxes and tables,
    * PyMuPDF (``fitz``) for embedded images.

    Embedded images are written to an ``extracted_images`` folder next to the
    PDF and, when the Azure OpenAI client could be created, described by the
    model; the description is appended to the page content.
    """

    def __init__(self, file_path: str):
        """Remember the PDF path, create the image folder and set up the LLM.

        Args:
            file_path: Path of the PDF to load.

        A failure to construct the Azure OpenAI client is reported with
        ``print`` and leaves ``self.azure_llm`` as ``None``; image
        descriptions then fall back to a fixed placeholder string.
        """
        self.file_path = file_path
        self.azure_llm = None

        # Create extracted images folder
        self.images_folder = os.path.join(os.path.dirname(file_path), "extracted_images")
        os.makedirs(self.images_folder, exist_ok=True)

        try:
            self.azure_llm = AzureChatOpenAI(
                deployment_name=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
                api_key=os.getenv("AZURE_OPENAI_API_KEY"),
                api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
                azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT")
            )
            print("Azure OpenAI initialized successfully")
        except Exception as e:
            # Best effort: the loader still works without image descriptions.
            print(f"Azure OpenAI setup failed: {e}")

    def _get_image_description(self, image_bytes: bytes, image_ext: str, page_number: int, img_index: int) -> str:
        """Generate AI description for image using Azure OpenAI.

        Args:
            image_bytes: Raw bytes of the extracted image.
            image_ext: Image extension, used as the subtype of the data URL.
            page_number: 1-based page number, used in the prompt and logs.
            img_index: 1-based image index on the page, used likewise.

        Returns:
            The stripped model response, or ``"Image extracted successfully."``
            when no LLM is configured, the image is empty, or the call fails.
        """
        if not self.azure_llm or not image_bytes:
            return "Image extracted successfully."

        try:
            # Convert image to base64
            image_b64 = base64.b64encode(image_bytes).decode()

            # Few-shot examples for guiding the model
            examples = """
Example 1: page1_image1 (architecture of rag with sequential (parallel(input->chunking->embedding->vector store, user input->embedding), retrieval->LLM->output))

Example 2: page2_image2 (profit per company bar chart
company | profit
company1 | 100 units
company2 | 200 units

red color represents company1
blue color represents company2

x-label = company name
y-label = profit)
"""

            message = HumanMessage(content=[
                {"type": "text", "text": f"You are given an image extracted from a PDF. "
                                        f"Write a short structured summary. "
                                        f"If the image is a graph or chart, include axis labels, values, and colors. "
                                        f"Follow the examples below:\n\n{examples}\n\n"
                                        f"Now describe this image as page{page_number}_image{img_index} (...)."},
                {"type": "image_url", "image_url": {"url": f"data:image/{image_ext};base64,{image_b64}"}}
            ])

            response = self.azure_llm.invoke([message])
            description = response.content.strip()
            print(f"Generated description for page{page_number}_image{img_index}: {description[:100]}...")
            return description

        except Exception as llm_e:
            # A failed description must not abort extraction of the image itself.
            print(f"Azure OpenAI failed for image {img_index} on page {page_number}: {llm_e}")
            return "Image extracted successfully."

    def _extract_text(self, plumber_doc: Any, page_num: int, page_number: int,
                      content_parts: List[str], metadata: Dict[str, Any]) -> None:
        """Append the page's text to *content_parts* and its words to *metadata*.

        Errors are printed and swallowed so the other extractors still run.
        """
        try:
            # Get clean text from pdfminer
            text = extract_text(self.file_path, page_numbers=[page_num])
            if text and text.strip():
                content_parts.append(text.strip())

            # Get word coordinates from pdfplumber. ``fontname`` and ``size``
            # are only present on word dicts when requested via extra_attrs;
            # without it the lookups below always hit their defaults.
            page = plumber_doc.pages[page_num]
            words = page.extract_words(extra_attrs=["fontname", "size"])

            # Add text metadata with indexing
            metadata["elements"]["text"].extend(
                {
                    "index": i,
                    "text": word['text'],
                    "bbox": (word['x0'], word['top'], word['x1'], word['bottom']),
                    "font": word.get('fontname', 'Unknown'),
                    "size": word.get('size', 0)
                }
                for i, word in enumerate(words)
            )

        except Exception as e:
            print(f"Text extraction error page {page_number}: {e}")

    def _extract_tables(self, plumber_doc: Any, page_num: int, page_number: int,
                        content_parts: List[str], metadata: Dict[str, Any]) -> None:
        """Append the page's tables to *content_parts* and *metadata*.

        Errors are printed and swallowed so the other extractors still run.
        """
        try:
            page = plumber_doc.pages[page_num]

            # Detect tables once; each Table object yields both its cell data
            # and its bbox, so the two cannot get out of step.
            for i, found in enumerate(page.find_tables()):
                table = found.extract()
                if not table:
                    continue

                # Clean table data
                cleaned_table = [[str(cell) if cell is not None else "" for cell in row] for row in table]
                table_str = '\n'.join(' | '.join(row) for row in cleaned_table)

                # Add to content and metadata
                content_parts.append(f"\n[Table {i+1}]\n{table_str}")
                metadata["elements"]["tables"].append({
                    "index": i,
                    "content": cleaned_table,
                    "bbox": found.bbox,
                    "rows": len(cleaned_table),
                    "cols": len(cleaned_table[0]) if cleaned_table else 0
                })

        except Exception as e:
            print(f"Table extraction error page {page_number}: {e}")

    def _extract_images(self, doc: Any, page_num: int, page_number: int,
                        content_parts: List[str], metadata: Dict[str, Any]) -> None:
        """Save the page's images, describe them, and record both.

        A failure on one image is printed and does not stop the remaining
        images; a failure listing the images is printed and swallowed.
        """
        try:
            page_obj = doc[page_num]
            images = page_obj.get_images(full=True)
            print(f"Page {page_number}: Found {len(images)} images")

            for img_index, img in enumerate(images, start=1):
                try:
                    xref = img[0]  # image reference
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    image_ext = base_image["ext"]

                    # Get image coordinates (first placement on the page, if any)
                    img_rects = page_obj.get_image_rects(img)
                    bbox = tuple(img_rects[0]) if img_rects else None

                    # Build filename
                    image_filename = f"page{page_number}_img{img_index}.{image_ext}"
                    image_path = os.path.join(self.images_folder, image_filename)

                    # Save image
                    with open(image_path, "wb") as f:
                        f.write(image_bytes)

                    print(f"Saved: {image_filename}")

                    # Generate AI image description using Azure OpenAI
                    image_name = f"Image_{img_index}_Page_{page_number}"
                    image_description = self._get_image_description(image_bytes, image_ext, page_number, img_index)

                    # Add to content
                    content_parts.append(f"\n\n[Image: {image_name}]\nDescription: {image_description}")

                    # Add to metadata
                    metadata["elements"]["images"].append({
                        "index": img_index - 1,  # 0-based index for consistency
                        "name": image_name,
                        "description": image_description,
                        "bbox": bbox,
                        "format": image_ext,
                        "size_bytes": len(image_bytes),
                        "saved_path": image_path,
                        "filename": image_filename,
                        "xref": xref
                    })

                    print(f"Added image content for {image_name}")

                except Exception as img_e:
                    print(f"Image processing error {img_index} on page {page_number}: {img_e}")

        except Exception as e:
            print(f"Image extraction error page {page_number}: {e}")

    def load(self) -> List[Document]:
        """Extract text, tables and images from every page of the PDF.

        Returns:
            One ``Document`` per page that produced any content. Each carries
            the combined page content plus metadata with per-element details
            (``elements``) and summary counts. An empty list is returned if
            the PDF cannot be processed at all; the error is printed.
        """
        documents = []

        try:
            # Context managers guarantee both handles are closed even when
            # processing raises part-way through.
            with fitz.open(self.file_path) as doc, pdfplumber.open(self.file_path) as plumber_doc:
                num_pages = len(doc)
                print(f"Processing PDF with {num_pages} pages")

                for page_num in range(num_pages):
                    page_number = page_num + 1
                    content_parts: List[str] = []

                    # Initialize comprehensive metadata
                    metadata: Dict[str, Any] = {
                        "source": self.file_path,
                        "page_number": page_number,
                        "page_index": page_num,
                        "total_pages": num_pages,
                        "document_index": page_num,
                        "elements": {
                            "text": [],
                            "tables": [],
                            "images": []
                        }
                    }

                    # Order matters: it fixes the order of the page content.
                    self._extract_text(plumber_doc, page_num, page_number, content_parts, metadata)
                    self._extract_tables(plumber_doc, page_num, page_number, content_parts, metadata)
                    self._extract_images(doc, page_num, page_number, content_parts, metadata)

                    # Create final document with all content
                    combined_content = "\n".join(content_parts).strip()
                    if not combined_content:
                        # Pages with nothing extractable produce no Document.
                        continue

                    # Add summary counts to metadata
                    elements = metadata["elements"]
                    text_count = len(elements["text"])
                    table_count = len(elements["tables"])
                    image_count = len(elements["images"])
                    metadata.update({
                        "text_count": text_count,
                        "table_count": table_count,
                        "image_count": image_count,
                        "total_elements": text_count + table_count + image_count
                    })

                    documents.append(Document(
                        page_content=combined_content,
                        metadata=metadata
                    ))

                    print(f"Page {page_number} processed: {metadata['text_count']} text elements, "
                          f"{metadata['table_count']} tables, {metadata['image_count']} images")

            print(f"Total documents created: {len(documents)}")
            return documents

        except Exception as e:
            print(f"PDF processing error: {e}")
            return []



# Build the loader for the sample PDF, then run the full extraction.
# `docs` holds whatever list `load()` returns for this file.
loader = HybridPDFLoader(
    file_path=r"C:\Users\bluea\OneDrive\Desktop\MMRAG\PDF\Test.pdf",
)
docs = loader.load()


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


Azure OpenAI initialized successfully
Processing PDF with 3 pages
Page 1: Found 1 images
Saved: page1_img1.png


Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


Generated description for page1_image1: page1_image1 (RAG architecture diagram  
workflow:  
User Query -> Vector DB (Vectors) -> Context Re...
Added image content for Image_1_Page_1
Page 1 processed: 171 text elements, 0 tables, 1 images
Page 2: Found 1 images
Saved: page2_img1.png
Generated description for page2_image1: page2_image1 (line chart: Popularity of RAG Over Time  
year | popularity score  
2020 | 10  
2021 |...
Added image content for Image_1_Page_2
Page 2 processed: 24 text elements, 0 tables, 1 images
Page 3: Found 0 images
Page 3 processed: 79 text elements, 1 tables, 0 images
Total documents created: 3
