In [254]:
#!pip install python-dotenv
#!pip install openai
#!pip install imageio
#!pip install azure-ai-documentintelligence
#!pip install azure-core
#!pip install pymupdf

Collecting pymupdf


[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip



  Downloading PyMuPDF-1.24.7-cp311-none-win_amd64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.24.6 (from pymupdf)
  Downloading PyMuPDFb-1.24.6-py3-none-win_amd64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.24.7-cp311-none-win_amd64.whl (3.2 MB)
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/3.2 MB 330.3 kB/s eta 0:00:10
   - -------------------------------------- 0.1/3.2 MB 1.0 MB/s eta 0:00:03
   ------ --------------------------------- 0.5/3.2 MB 3.3 MB/s eta 0:00:01
   ----------- ---------------------------- 0.9/3.2 MB 4.6 MB/s eta 0:00:01
   ----------------- ---------------------- 1.4/3.2 MB 5.5 MB/s eta 0:00:01
   ---------------------- ----------------- 1.8/3.2 MB 6.5 MB/s eta 0:00:01
   ------------------------------ --------- 2.4/3.2 MB 7.4 MB/s eta 0:00:01
   ------------------------------------ --- 2.9/3.2 MB 7.8 MB/s eta 0:00:01
   -

In [264]:
from openai import AzureOpenAI
import dotenv
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest
import json
import re
import base64

dotenv.load_dotenv(override=True)

True

In [235]:
client = AzureOpenAI(
    azure_endpoint=os.getenv('AzureOpenAiEndpoint'),
    api_key=os.getenv('AzureOpenAiKey'),
    azure_deployment='gpt-4o',
    api_version='2024-02-15-preview'
)

In [236]:
documentIntelligenceClient = DocumentIntelligenceClient(
        credential=AzureKeyCredential(key=os.getenv('AzureDiKey')),
        endpoint=os.getenv('AzureDiEndpoint')
        )

In [237]:
def analyze_document(file):
    poller = documentIntelligenceClient.begin_analyze_document(
            "prebuilt-layout", AnalyzeDocumentRequest(
                bytes_source=file,
            ),output_content_format="markdown"
    )
    return poller.result()

In [238]:
def split_document_to_pages(document):
    totalPages = document['pages']
    documentContent = []
    for page in totalPages:
        pageContent = {
            "pageNumber": page['pageNumber'],
            "pageContent": ' '.join([pageLineContent.content for pageLineContent in page.lines if pageLineContent is not None]),
            "identifier": None
        }
        documentContent.append(pageContent.copy())
    return documentContent



In [239]:
def document_ask(prompt,analyzedDocumentContent):
    response =client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                'role': 'user',
                'content': prompt
            },
            {
                'role': 'system',
                'content': f'You know about this file: {analyzedDocumentContent}'
            }
        ]
    )
    return response.choices[0].message.content

In [249]:
import re

def get_objects_in_document(analyzedDocumentContent_split_by_page, tables) -> list:
    totalObjects = []
    
    # Get objects(figures and images) in text
    regex_pattern = r"<!--(.*?)-->"
    for page in analyzedDocumentContent_split_by_page:
        matches = re.findall(regex_pattern, page['pageContent'], re.DOTALL)
        if matches:
            for match in matches:
                result = {
                    "pageNumber": page['pageNumber'],
                    "identifier": str(abs(hash(match)) % (10 ** 8)),
                    "object": match,
                    "type": "figure"
                }
                totalObjects.append(result.copy())
    
    # Get objects in tables
    for table in tables:
        if table.cells:
            first_cell = table.cells[0]
            if 'boundingRegions' in first_cell and first_cell['boundingRegions']:
                first_bounding_region = first_cell['boundingRegions'][0]
                if 'pageNumber' in first_bounding_region:
                    result = {
                        "pageNumber": first_bounding_region['pageNumber'],
                        "identifier": str(abs(hash(str(first_bounding_region['polygon']))) % (10 ** 8)),
                        "type": "table",
                    }
                    totalObjects.append(result.copy())
    
    return totalObjects


In [241]:
def join_content_with_identifiers(analyzedDocumentContent_split_by_page, objectsInDocument):
    for page in analyzedDocumentContent_split_by_page:
        identifiers = [obj['identifier'] for obj in objectsInDocument if obj['pageNumber'] == page['pageNumber']]
        if identifiers:
            page['identifier'] = ', '.join(identifiers)
        else:
            page['identifier'] = None
    return analyzedDocumentContent_split_by_page


In [256]:
import fitz  # PyMuPDF

def pdf_page_to_png(pdf_path, page_number, output_path, zoom_x=2.0, zoom_y=2.0):
    """
    Convert a single page from a PDF file to a high-resolution PNG file.
    
    :param pdf_path: Path to the input PDF file
    :param page_number: The page number to convert (1-based index)
    :param output_path: Path to save the output PNG file
    :param zoom_x: The zoom factor for the x-axis (default is 2.0 for high resolution)
    :param zoom_y: The zoom factor for the y-axis (default is 2.0 for high resolution)
    """
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Ensure the page number is within the valid range
    if page_number < 1 or page_number > len(pdf_document):
        raise ValueError(f"Page number {page_number} is out of range. The document has {len(pdf_document)} pages.")
    
    # Get the specified page
    page = pdf_document.load_page(page_number - 1)  # page_number is 1-based, load_page uses 0-based index
    
    # Define a transformation matrix for the zoom level
    mat = fitz.Matrix(zoom_x, zoom_y)
    
    # Render the page to a pixmap (image)
    pix = page.get_pixmap(matrix=mat)
    
    # Save the pixmap as a PNG file
    pix.save(output_path)

In [271]:
def describe_figure(encoded_image):
    encoded_image = base64.b64encode(encoded_image.read()).decode('utf-8')
    res = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": 'Describe only the pcitures or graphs you see inside the screenshot. Describe them as best as you can and include as much as information from those pictures as possible.' },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{encoded_image}"}
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    return res.choices[0].message.content

def describe_table(encoded_image):
    encoded_image = base64.b64encode(encoded_image.read()).decode('utf-8')
    res = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract all content from this table and output it as json. Also create a description about the table" },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{encoded_image}"}
                    },
                ],
            }
        ],
        max_tokens=300,
    )
    return res.choices[0].message.content

In [257]:
filename = "aspirin.pdf"
with open(filename, "rb") as f:
    documentcontent = f.read()

In [243]:
documentcontent

b'%PDF-1.7\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(de) /StructTreeRoot 47 0 R/MarkInfo<</Marked true>>/Metadata 472 0 R/ViewerPreferences 473 0 R>>\r\nendobj\r\n2 0 obj\r\n<</Type/Pages/Count 5/Kids[ 4 0 R 24 0 R 40 0 R 42 0 R 44 0 R] >>\r\nendobj\r\n3 0 obj\r\n<</MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Enabled(true) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Method(Standard) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_SiteId(72f988bf-86f1-41af-91ab-2d7cd011db47) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_ContentBits(0) /Author(Christian Wunderlich) /Creator(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005) /CreationDate(D:20240712161205+02\'00\') /ModDate(D:20240712161205+02\'00\') /Producer(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\

In [244]:
analyzedDocumentContent = analyze_document(documentcontent)
print(analyzedDocumentContent)



In [245]:
analyzedDocumentContent_split_by_page = split_document_to_pages(analyzedDocumentContent)

In [250]:
analyzedDocumentContent_split_by_page

[{'pageNumber': 1,
  'identifier': '68411571, 37013270'},
 {'pageNumber': 2,
  'pageContent': 'Since then, aspirin has become one of the most widely researched and prescribed drugs in the world, with billions of tablets consumed every year. ### Uses and Dosage of Aspirin Aspirin has multiple uses and benefits, depending on the dose and the condition being treated. Some of the common uses of aspirin are: . Pain relief: Aspirin can reduce mild to moderate pain caused by headaches, toothaches, menstrual cramps, arthritis, sprains, and other conditions. The usual dose for pain relief is 325 to 650 mg every four to six hours, as needed. . Fever reduction: Aspirin can lower the body temperature in cases of fever caused by infections or inflammation. The usual dose for fever reduction is 325 to 650 mg every four to six hours, as needed. . Inflammation reduction: Aspirin can decrease the swelling and redness caused by inflammation, such as in rheumatoid arthritis, osteoarthritis, or bursitis. 

In [267]:

objectsInDocument = get_objects_in_document(analyzedDocumentContent_split_by_page,analyzedDocumentContent['tables'])
objectsInDocument

[{'pageNumber': 1,
  'identifier': '68411571',
  'object': ' FigureContent="ASPIRIN® BAYER dreamstime" ',
  'type': 'figure'},
 {'pageNumber': 1,
  'identifier': '37013270',
  'type': 'figure'},
 {'pageNumber': 5,
  'identifier': '11995295',
  'object': ' FigureContent="30,000,000 20,000,000 10,000,000 0 2013 2014 2015 2016 2017 2018 2019 2020 2021 Total Prescriptions Total Patients" ',
  'type': 'figure'},
 {'pageNumber': 2, 'identifier': '23382025', 'type': 'table'},
 {'pageNumber': 2, 'identifier': '42628869', 'type': 'table'},
 {'pageNumber': 3, 'identifier': '89879105', 'type': 'table'},
 {'pageNumber': 4, 'identifier': '94803773', 'type': 'table'}]

In [272]:
analyzedObjects = []
for object in objectsInDocument:
    with open(object['identifier']+'.png', "rb") as image_file:
        if object['type'] == 'figure':
            analyzedObjects.append({
                "object": object,
                "description": describe_figure(image_file)
            })
        elif object['type'] == 'table':
            analyzedObjects.append({
                "object": object,
                "description": describe_table(image_file)
            })
    analyzedObjects
        

In [273]:
analyzedObjects

[{'object': {'pageNumber': 1,
   'identifier': '68411571',
   'object': ' FigureContent="ASPIRIN® BAYER dreamstime" ',
   'type': 'figure'},
  'description': 'The screenshot contains two images related to aspirin.\n\n1. The first image (located near the top) depicts a white and green box labeled "Aspirin." The box is partially open, revealing four round white tablets inside. There are additional markings and details on the packaging, but they are not discernible in this image. At the bottom of the image, there is a blue band with text and some small icons, likely related to the image\'s source or copyright information.\n\n2. The second image (located near the middle) shows a transparent bottle labeled "Aspirin" that has been tipped over, spilling numerous white, round tablets onto the surface. The label on the bottle has text and a logo, but specific details are not readable at this level of description. The background of the image is plain and gray, highlighting the bottle and the tab

In [260]:
for object in objectsInDocument:
    pdf_path = filename
    page_number = object['pageNumber']  # Page number to convert (1-based index)
    output_path = object['identifier'] + ".png"  # Output PNG file name
    zoom_x = 3.0  # Increase zoom factor for higher resolution
    zoom_y = 3.0
    pdf_page_to_png(pdf_path, page_number, output_path, zoom_x, zoom_y)

In [252]:
join_content_with_identifiers(analyzedDocumentContent_split_by_page,objectsInDocument)

[{'pageNumber': 1,
  'identifier': '68411571, 37013270'},
 {'pageNumber': 2,
  'pageContent': 'Since then, aspirin has become one of the most widely researched and prescribed drugs in the world, with billions of tablets consumed every year. ### Uses and Dosage of Aspirin Aspirin has multiple uses and benefits, depending on the dose and the condition being treated. Some of the common uses of aspirin are: . Pain relief: Aspirin can reduce mild to moderate pain caused by headaches, toothaches, menstrual cramps, arthritis, sprains, and other conditions. The usual dose for pain relief is 325 to 650 mg every four to six hours, as needed. . Fever reduction: Aspirin can lower the body temperature in cases of fever caused by infections or inflammation. The usual dose for fever reduction is 325 to 650 mg every four to six hours, as needed. . Inflammation reduction: Aspirin can decrease the swelling and redness caused by inflammation, such as in rheumatoid arthritis, osteoarthritis, or bursitis. 

In [141]:
document_ask("How many participants are covered?",analyzedDocumentContent_split_by_page)

"The document includes information on the distribution of aspirin prescriptions for several countries. Here's the relevant data:\n\n### Distribution of Prescriptions for Countries\n- **Russia**: Population 144.4 million, 40.0 million aspirin users (27.7%)\n- **Germany**: Population 83.2 million, 35.8 million aspirin users (43.0%)\n- **France**: Population 65.3 million, 24.5 million aspirin users (37.5%)\n- **United Kingdom**: Population 66.7 million, 22.6 million aspirin users (33.9%)\n- **Italy**: Population 60.3 million, 18.7 million aspirin users (31.0%)\n- **Spain**: Population 46.9 million, 14.1 million aspirin users (30.1%)\n\nFrom this data, we can count the total number of aspirin users mentioned:\n\n40.0 (Russia) + 35.8 (Germany) + 24.5 (France) + 22.6 (UK) + 18.7 (Italy) + 14.1 (Spain) = **155.7 million aspirin users** across these six countries.\n\nIf you were referring to a specific aspect or different set of participants from the document, please provide more context."