# An agent from langchain that can analyze images and text

Inspired by the following LangChain how-to guide on using multimodal models with PDFs:

https://python.langchain.com/docs/how_to/document_loader_pdf/#use-of-multimodal-models

In [None]:
# Install the necessary libraries
# uncomment the following lines to install the necessary libraries

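# pip install pymupdf pillow langchain-openai
# (`import fitz` is provided by the PyMuPDF package; the PyPI package named `fitz` is unrelated.)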

In [None]:
import base64
import io
import fitz
from PIL import Image
from IPython.display import Image as IPImage, display
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage

def pdf_page_to_base64(pdf_path: str, page_number: int) -> str:
    """Render one page of a PDF to a PNG image and return it base64-encoded.

    Args:
        pdf_path: Path of the PDF file to open.
        page_number: One-indexed number of the page to render.

    Returns:
        The PNG bytes of the rendered page, base64-encoded and decoded to a
        UTF-8 string.

    Raises:
        ValueError: If ``page_number`` is smaller than 1.
    """
    # Reject non-positive page numbers up front: ``page_number - 1`` would
    # become a negative index, which PyMuPDF counts from the end of the
    # document, silently rendering the wrong page.
    if page_number < 1:
        raise ValueError(
            f"page_number is one-indexed and must be >= 1, got {page_number}"
        )

    # Open the document in a context manager so it is closed even when
    # rendering fails; this function is called once per page.
    with fitz.open(pdf_path) as pdf_document:
        page = pdf_document.load_page(page_number - 1)  # input is one-indexed
        pix = page.get_pixmap()
        # Build the PIL image while the document is still open.
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    buffer = io.BytesIO()
    img.save(buffer, format="PNG")

    return base64.b64encode(buffer.getvalue()).decode("utf-8")

def get_pdf_content(pdf_path: str) -> list:
    """Extract the text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one entry per page, in page order, each holding the
        result of ``page.get_text()`` for that page. (The original annotation
        said ``dict``, but a list has always been returned.)
    """
    # The context manager closes the document once all text is extracted.
    with fitz.open(pdf_path) as pdf_document:
        return [page.get_text() for page in pdf_document]

# Path to the PDF file (defined once and reused for both text extraction
# and page rendering)
pdf_path = "../../data/test_with_images.pdf"

# Get the text content of the PDF file, one entry per page
pdf_text = get_pdf_content(pdf_path)

# Initialize the multimodal model
llm = ChatOpenAI(model="gpt-4o-mini")

# Iterate over each page and analyze content
for page_num, page_text in enumerate(pdf_text, start=1):
    # Convert the page to a base64-encoded PNG image and show it in the notebook
    base64_image = pdf_page_to_base64(pdf_path, page_num)
    display(IPImage(data=base64.b64decode(base64_image)))

    # Define the query
    query = f"Analyze the content of page {page_num}"

    # Create the message with the query, the extracted page text and the image
    message = HumanMessage(
        content=[
            {"type": "text", "text": query},
            {"type": "text", "text": page_text},
            {
                "type": "image_url",
                # The helper encodes the page as PNG, so the data URL must
                # declare image/png (it previously claimed image/jpeg).
                "image_url": {"url": f"data:image/png;base64,{base64_image}"},
            },
        ],
    )
    # Query the model
    response = llm.invoke([message])
    print(f"Page {page_num} analysis:\n{response.content}\n")