In [1]:
import paperqa
from paperqa import Docs, Settings
from aviary.core import Message
from PIL import Image
from IPython.display import display
import numpy as np

In [42]:
# Source documents for the collection; valid extensions include .pdf, .txt, and .html
doc_paths = ["./tests/stub_data/paper.pdf"]

# Create the Docs object. NOTE: the ingestion loop below is commented out,
# so in this cell `docs` stays empty and `doc_paths` is never read.
docs = Docs()
# for doc_path in doc_paths:
#     await docs.aadd(doc_path)

# Set up how we want to query the Docs object
settings = Settings()
settings.llm = "claude-3-5-sonnet-20240620" # alternative tried: "gpt-4o-mini"
settings.answer.answer_max_sources = 3
# NOTE(review): presumably lets a query gather evidence on its own when no
# contexts exist yet — confirm against the paperqa settings documentation.
settings.answer.get_evidence_if_no_contexts = True
# settings.embedding = "cohere/embed-english-v3.0"

In [43]:
import os

import litellm
from llmclient import LiteLLMModel, MultipleCompletionLLMModel  

# os.environ['LITELLM_LOG'] = 'DEBUG'

question = Message(role="user", content="Look at the document available and tell me what it is about?")

img = Image.open("./003.jpg")
img.thumbnail((512, 512))
img_array = np.array(img)

question = Message.create_message(
    role="user",
    text="At what timepoint does the protein begin to disintegrate noticeably?",
    image=img_array,
    )

# llm = LiteLLMModel(name="gpt-4o-mini")
# # llm = MultipleCompletionLLMModel(name="claude-3-5-sonnet-20240620")
# assert litellm.supports_vision(model=llm.name)
# completion = await llm.achat([question])

print(len(question.content))
session = await docs.aquery(
    query=question,  # "What is a counterfactual?",
    settings=settings,
)

166013


In [44]:
# Show the answer text held by the session returned from `docs.aquery`.
print(session.answer)

Based on the image provided, I cannot definitively answer at what specific timepoint the protein begins to disintegrate noticeably. The image shows a series of protein structures at different timepoints, but without clear labels or a timescale, it is not possible to pinpoint an exact moment when disintegration becomes apparent. The structures appear to maintain their overall shape through most of the frames, with some subtle changes visible in later frames. However, without more context or quantitative data, I cannot make a precise determination about when noticeable disintegration occurs. The image alone does not provide sufficient information to answer the question with confidence.


In [3]:
import pymupdf

In [None]:
# Gather embedded-image bytes and drawing records for every page, keyed by
# "<page number>:<item number>" with both numbers starting at 1.
images = {}
drawings = {}
with pymupdf.open("./tests/stub_data/paper.pdf") as pdf_file:
    for page_idx in range(pdf_file.page_count):
        pdf_page = pdf_file.load_page(page_idx)
        page_images = pdf_page.get_images()
        page_drawings = pdf_page.get_cdrawings()
        for img_idx, img_entry in enumerate(page_images):
            # Field 0 of an image entry is the xref passed to extract_image.
            extracted = pdf_file.extract_image(img_entry[0])
            images[f"{page_idx + 1}:{img_idx + 1}"] = extracted["image"]
        drawings.update(
            (f"{page_idx + 1}:{draw_idx + 1}", drawing)
            for draw_idx, drawing in enumerate(page_drawings)
        )

In [None]:
import fitz  # PyMuPDF
import matplotlib.pyplot as plt
from matplotlib.patches import PathPatch
from matplotlib.path import Path

def visualize_pdf_drawings(pdf_path, min_drawings=50):
    """
    Extracts and visualizes drawings from each page of a PDF file using PyMuPDF.

    One matplotlib figure is built for every page that has at least
    ``min_drawings`` drawing objects; pages with fewer (or none) are skipped.
    Only line ('l') and cubic Bézier ('c') items are rendered; every other
    item type is ignored. All figures are shown once the whole file has been
    processed.

    Args:
        pdf_path (str): The path to the PDF file.
        min_drawings (int): Minimum number of drawing objects a page needs in
            order to be plotted. Defaults to 50.
    """
    # Local alias on purpose: another cell in this notebook rebinds the global
    # name `Path` to pathlib.Path, which would break a later call of this
    # function if it relied on the global.
    from matplotlib.path import Path as MplPath

    curve_codes = [MplPath.MOVETO, MplPath.CURVE4, MplPath.CURVE4, MplPath.CURVE4]

    # The context manager closes the document even if plotting raises.
    with fitz.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            drawings = page.get_drawings()
            if not drawings or len(drawings) < min_drawings:
                continue

            fig, ax = plt.subplots(figsize=(10, 10))
            ax.set_title(f"Page {page_num} Drawings")
            ax.set_aspect('equal')
            ax.axis('off')

            for drawing in drawings:
                for item in drawing.get('items', ()):
                    command = item[0]
                    points = item[1:]

                    if command == 'c':
                        # Cubic Bézier: start point followed by three control/end points.
                        verts = [(pt.x, pt.y) for pt in points[:4]]
                        patch = PathPatch(
                            MplPath(verts, curve_codes),
                            facecolor='none', edgecolor='blue', lw=1,
                        )
                        ax.add_patch(patch)
                    elif command == 'l':
                        start_point, end_point = points[0], points[1]
                        ax.plot(
                            [start_point.x, end_point.x],
                            [start_point.y, end_point.y],
                            color='black', lw=1
                        )

            # add_patch() does not trigger autoscaling, so a page made only of
            # curves would otherwise be drawn outside the default 0..1 view.
            ax.autoscale_view()
            # PDF page coordinates grow downward; flip the y axis so the page
            # is not rendered upside-down.
            ax.invert_yaxis()

    plt.show()

visualize_pdf_drawings("./tests/stub_data/paper.pdf")

In [38]:
from IPython.display import display

# Render every drawing cluster on one page to its own PNG file.
pdf = pymupdf.open("tests/stub_data/paper.pdf")
page = pdf[17]

# cluster_drawings() supplies the bounding boxes used as clip rectangles below;
# background: https://github.com/pymupdf/PyMuPDF/discussions/3508
for cluster_no, cluster_bbox in enumerate(page.cluster_drawings()):
    pix = page.get_pixmap(dpi=600, clip=cluster_bbox)
    pix_bytes = pix.tobytes()
    # NOTE(review): `Image` is bound to PIL's Image module in this notebook, so
    # the line below presumably needs IPython.display.Image — confirm.
    # display(Image(data=pix_bytes))
    pix.save(f"image_{cluster_no}.png")

In [None]:
# List the image entries on page index 18 of the `pdf` opened in an earlier cell.
ref = pdf[18].get_images()
print(ref)
# NOTE(review): other cells in this notebook take the xref as element 0 of an
# image entry, so the call below presumably needs ref[0][0] — confirm.
# pdf.extract_image(ref[0])

In [None]:
import os
from pathlib import Path


def parse_pdf_images(pdf_path, output_dir):
    """
    Extracts all images from a PDF file and saves them to the specified output directory.

    Each image is written as ``page_<page>_img_<index>.<ext>``, where page and
    index are 1-based and ``<ext>`` is the extension reported by the PDF
    library for that image.

    Parameters:
        pdf_path (str): Path to the PDF file.
        output_dir (str): Directory where extracted images will be saved.
            Created if it does not exist.

    Returns:
        list: A list of file paths to the extracted images.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    image_paths = []

    # The context manager closes the document even if extraction or a write fails.
    with pymupdf.open(pdf_path) as pdf_document:
        for page_num, page in enumerate(pdf_document, start=1):
            for img_index, img in enumerate(page.get_images(), start=1):
                xref = img[0]  # first field of an image entry is its xref
                base_image = pdf_document.extract_image(xref)
                image_ext = base_image["ext"]  # e.g. png, jpg

                image_filename = f"page_{page_num}_img_{img_index}.{image_ext}"
                image_path = os.path.join(output_dir, image_filename)

                # Plain open() rather than the global `Path`: that name is also
                # bound to matplotlib's Path by another cell in this notebook.
                with open(image_path, "wb") as image_file:
                    image_file.write(base_image["image"])

                image_paths.append(image_path)

    return image_paths


# Extract every image from the stub paper and list the files that were written.
pdf_path = "./tests/stub_data/paper.pdf"
output_dir = "extracted_images"
extracted_images = parse_pdf_images(pdf_path=pdf_path, output_dir=output_dir)
print(f"Extracted {len(extracted_images)} images:")
print(*extracted_images, sep="\n")


In [None]:
from IPython.display import display
from PIL import Image

# Preview only the first extracted image (nothing is shown if the list is empty).
for preview_path in extracted_images[:1]:
    preview = Image.open(preview_path)
    # Shrink in place so the notebook renders a small picture
    preview.thumbnail((250, 250))
    display(preview)

In [None]:
import numpy as np
from aviary.core import encode_image_to_base64

# Load the first extracted image as a pixel array. The file is opened once,
# inside a context manager, so the handle is closed as soon as the pixels
# have been copied into the array.
with Image.open(extracted_images[0]) as first_image:
    image = np.array(first_image)
encode_image_to_base64(image)

In [None]:
from aviary.core import encode_image_to_base64

pdf = pymupdf.open("tests/stub_data/paper.pdf")
page = pdf[17]
pix = page.get_pixmap(dpi=600)
# Encoded image-file bytes (PNG by default) — these are NOT raw pixels.
pix_bytes = pix.tobytes()

# Convert the pixmap to a numpy array of pixels. `pix.samples` is the raw
# pixel buffer (height * width * n bytes), so it can be viewed as an
# (height, width, components) uint8 array; the encoded bytes from tobytes()
# would only give a flat array of file bytes.
pix_array = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
    pix.height, pix.width, pix.n
)
a = encode_image_to_base64(pix_array)

# 

In [None]:
import paperqa

# Rebind `docs` to a fresh Docs object and add the stub paper to it.
docs = paperqa.Docs()
await docs.aadd("tests/stub_data/paper.pdf")



In [None]:
# Plain-text query; no explicit settings object is passed here.
answer = await docs.aquery("What is XAI?")

In [None]:
# print(answer)

# Print only the `answer` attribute rather than the whole returned object.
print(answer.answer)