In [61]:
#!pip install python-dotenv
#!pip install openai
#!pip install imageio
#!pip install azure-ai-documentintelligence
#!pip install azure-core
#!pip install pymupdf

In [62]:
from openai import AzureOpenAI
import dotenv
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
import base64
import fitz
import io
from PIL import Image
dotenv.load_dotenv(override=True)

True

In [63]:
# Azure OpenAI client used for every chat-completion call in this notebook.
# Endpoint and key are read from environment variables; os.getenv yields None
# when a variable is not set.
client = AzureOpenAI(
    azure_endpoint=os.getenv('AzureOpenAiEndpoint'),
    api_key=os.getenv('AzureOpenAiKey'),
    azure_deployment='gpt-4o',
    api_version='2024-02-15-preview'
)

In [64]:
# Azure AI Document Intelligence client; key and endpoint are read from
# environment variables (os.getenv yields None when a variable is not set).
documentIntelligenceClient = DocumentIntelligenceClient(
        credential=AzureKeyCredential(key=os.getenv('AzureDiKey')),
        endpoint=os.getenv('AzureDiEndpoint')
        )

In [65]:
def analyze_document(file):
    """Analyze a document with the "prebuilt-layout" model and return the result.

    `file` is handed to the service as the request's `bytes_source`, and the
    output content format is requested as markdown. The value returned is
    whatever the poller's `result()` yields.
    """
    request = AnalyzeDocumentRequest(bytes_source=file)
    analysis_poller = documentIntelligenceClient.begin_analyze_document(
        "prebuilt-layout",
        request,
        output_content_format="markdown",
    )
    analysis_result = analysis_poller.result()
    return analysis_result

In [66]:
def split_document_to_pages(document):
    """Flatten an analyzed document into one record per page.

    Args:
        document: analysis result supporting ``document['pages']``. Each page
            must support ``page['pageNumber']`` and expose a ``lines``
            attribute whose items carry a ``content`` string.

    Returns:
        A list of dicts with the keys ``pageNumber``, ``pageContent`` (the
        contents of the page's lines joined by single spaces) and
        ``identifier`` (always ``None`` at this stage).
    """
    documentContent = []
    for page in document['pages']:
        # A page without any recognised text can come back with `lines` unset
        # (None); treat it as an empty page instead of failing on iteration.
        page_lines = page.lines or []
        documentContent.append({
            "pageNumber": page['pageNumber'],
            "pageContent": ' '.join(
                line.content for line in page_lines if line is not None
            ),
            "identifier": None,
        })
    return documentContent



In [67]:
def document_ask(prompt, analyzedDocumentContent, model='gpt-4o'):
    """Ask the chat model a question grounded in previously extracted content.

    Args:
        prompt: the user's question.
        analyzedDocumentContent: grounding context; it is interpolated into
            the system message through its string representation.
        model: model / deployment name passed to the chat-completions call
            (defaults to 'gpt-4o', the value that used to be hard-coded).

    Returns:
        The text content of the first completion choice.
    """
    messages = [
        # The system (grounding) message is sent first and the user turn
        # second, which is the conventional chat-completions ordering.
        {
            'role': 'system',
            'content': f'You know about this {analyzedDocumentContent}'
        },
        {
            'role': 'user',
            'content': f"Answer the question based only on the following context, which can include text, tables, and the below image. The question: {prompt}"
        },
    ]
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

In [68]:
def get_tables_in_document(tables) -> list:
    """Wrap each detected table in an index record with a stable identifier.

    Args:
        tables: iterable of table objects exposing a ``cells`` attribute, or
            ``None``. Only the first cell's first bounding region is looked
            at: it supplies the page number and the polygon the identifier
            is derived from.

    Returns:
        A list of dicts with the keys ``pageNumber``, ``identifier`` (a decimal
        string of at most 8 digits), ``type`` (always ``"table"``) and
        ``objectContent`` (the table object itself). Tables without cells,
        without bounding regions or without a page number are skipped.
    """
    import hashlib

    tableTotalObjects = []
    for table in tables or []:  # tolerate a document without tables
        if not table.cells:
            continue
        first_cell = table.cells[0]
        if 'boundingRegions' not in first_cell or not first_cell['boundingRegions']:
            continue
        first_bounding_region = first_cell['boundingRegions'][0]
        if 'pageNumber' not in first_bounding_region:
            continue
        # The built-in hash() of a str is randomized per interpreter process,
        # so it would produce different identifiers after every kernel
        # restart. A sha256 digest keeps the identifier reproducible.
        polygon_digest = hashlib.sha256(
            str(first_bounding_region['polygon']).encode('utf-8')
        ).hexdigest()
        tableTotalObjects.append({
            "pageNumber": first_bounding_region['pageNumber'],
            "identifier": str(int(polygon_digest, 16) % (10 ** 8)),
            "type": "table",
            "objectContent": table,
        })

    return tableTotalObjects

In [69]:
def get_images_in_document(document):
    """Extract every embedded image of a PDF and save each one as a PNG file.

    Args:
        document: an opened document exposing ``pages()`` and
            ``extract_image(xref)``; each page must expose
            ``get_images(full=True)`` whose items carry the image xref at
            index 0.

    Returns:
        A list of dicts with the keys ``pageNumber`` (1-based), ``identifier``
        (a decimal string of at most 8 digits), ``type`` (always ``"figure"``)
        and ``objectContent`` (the raw image entry from ``get_images``).

    Side effects:
        Writes ``<identifier>.png`` (relative path) for every image found.
    """
    import hashlib

    imagesTotalObjects = []
    for page_num, page in enumerate(document.pages(), start=1):
        for img in page.get_images(full=True):
            # The built-in hash() of a str is randomized per interpreter
            # process; a sha256 digest keeps identifiers (and therefore the
            # PNG file names) the same across kernel restarts.
            digest = hashlib.sha256(str(img).encode('utf-8')).hexdigest()
            result = {
                "pageNumber": page_num,
                "identifier": str(int(digest, 16) % (10 ** 8)),
                "type": "figure",
                "objectContent": img
            }
            xref = img[0]
            base_image = document.extract_image(xref)
            image = Image.open(io.BytesIO(base_image["image"]))
            # Pillow cannot write CMYK data as PNG, so convert such images
            # to RGB before saving.
            if image.mode == "CMYK":
                image = image.convert("RGB")
            image.save(f"{result['identifier']}.png")
            imagesTotalObjects.append(result)
    return imagesTotalObjects

In [70]:
def join_content_with_identifiers(analyzedDocumentContent_split_by_page, objectsInDocument):
    """Attach to every page the identifiers of the objects located on it.

    Each page dict is updated in place: its 'identifier' entry becomes a
    ', '-joined string of the identifiers of all objects with the same
    'pageNumber' (in the order they occur in `objectsInDocument`), or None
    when no object matches. The list that was passed in is returned.
    """
    for page_entry in analyzedDocumentContent_split_by_page:
        matching_ids = []
        for item in objectsInDocument:
            if item['pageNumber'] == page_entry['pageNumber']:
                matching_ids.append(item['identifier'])
        page_entry['identifier'] = ', '.join(matching_ids) if matching_ids else None
    return analyzedDocumentContent_split_by_page


In [71]:
def describe_figure(image_as_bytes):
    """Ask the chat model for a textual description of an image.

    `image_as_bytes` is a binary file-like object. Whatever `read()` returns
    is base64-encoded and sent as a PNG data URL together with a fixed
    instruction. Returns the text content of the first completion choice.
    """
    raw_bytes = image_as_bytes.read()
    encoded_image = base64.b64encode(raw_bytes).decode('utf-8')

    instruction_part = {
        "type": "text",
        "text": "Describe all images or graphs which you can see inside the provided image as best as you can and include as much as information as you can see. If and image contains a coordinate system, also extract data from all axes and get axis values for each bar or line. ",
    }
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    user_message = {"role": "user", "content": [instruction_part, image_part]}

    res = client.chat.completions.create(model="gpt-4o", messages=[user_message])
    first_choice = res.choices[0]
    return first_choice.message.content

In [72]:
def append_objects_to_pagelist(analyzedDocumentContent_split_by_page, objectsInDocument):
    """Build one merged record per (page, object) pair sharing a page number.

    Args:
        analyzedDocumentContent_split_by_page: list of page dicts, each with a
            'pageNumber' key.
        objectsInDocument: list of object dicts, each with a 'pageNumber' key.

    Returns:
        A new list of new dicts. Each dict holds the page's entries overlaid
        with the matching object's entries (object values win on key clashes).
        The input page dicts are left untouched, and keys of one object never
        leak into the record of another object on the same page.
    """
    result = []
    for page in analyzedDocumentContent_split_by_page:
        for obj in objectsInDocument:
            if obj['pageNumber'] == page['pageNumber']:
                # Merge into a fresh dict rather than page.update(obj), which
                # would mutate the caller's page and carry keys over between
                # objects.
                result.append({**page, **obj})
    return result

In [73]:
# Sample inputs; exactly one `filename` assignment is active at a time.
#filename = "Osterbaan.Amy_.ScholarshipDayPoster3.pdf"
#filename = "aspirin.pdf"
#filename = "nov_real_1.pdf"
filename = "wildfire.pdf"
# Raw bytes of the PDF (later passed to analyze_document).
with open(filename, "rb") as f:
    documentcontent = f.read()

# Second handle on the same file via PyMuPDF (later passed to get_images_in_document).
# NOTE(review): `doc` is not closed anywhere in the visible cells.
doc = fitz.open(filename)

In [74]:
# NOTE(review): this displays the entire raw PDF byte string; consider showing only a short slice.
documentcontent

b'%PDF-1.7\n%\xe2\xe3\xcf\xd3\n626 0 obj\n<</ByteRange [0 142 195908 239918 ]                                                          /ContactInfo()/Contents <3083016abd06092a864886f70d010702a083016aad3083016aa8020101310f300d06096086480165030402010500300b06092a864886f70d010701a0820f193082042a30820312a00302010202043863def8300d06092a864886f70d01010505003081b431143012060355040a130b456e74727573742e6e65743140303e060355040b14377777772e656e74727573742e6e65742f4350535f3230343820696e636f72702e206279207265662e20286c696d697473206c6961622e2931253023060355040b131c286329203139393920456e74727573742e6e6574204c696d69746564313330310603550403132a456e74727573742e6e65742043657274696669636174696f6e20417574686f7269747920283230343829301e170d3939313232343137353035315a170d3239303732343134313531325a3081b431143012060355040a130b456e74727573742e6e65743140303e060355040b14377777772e656e74727573742e6e65742f4350535f3230343820696e636f72702e206279207265662e20286c696d697473206c6961622e2931253023060355040b131c286329203139

In [75]:
# Run the layout analysis on the raw PDF bytes and keep the full result.
analyzedDocumentContent = analyze_document(documentcontent)
# NOTE(review): prints the complete analysis result, which is very large.
print(analyzedDocumentContent)

{'apiVersion': '2024-02-29-preview', 'modelId': 'prebuilt-layout', 'stringIndexType': 'textElements', 'content': '<figure>\n\n![](figures/0)\n\n<!-- FigureContent="Congressional Research Service Informing the legislative debate since 1914" -->\n\n</figure>\n\n\nIN FOCUS\n\nUpdated June 1, 2023\n\n\n# Wildfire Statistics\n\nWildfires are unplanned fires, including lightning-caused fires, unauthorized human-caused fires, and escaped fires from prescribed burn projects. States are responsible for responding to wildfires that begin on nonfederal (state, local, and private) lands, except for lands protected by federal agencies under cooperative agreements. The federal government is responsible for responding to wildfires that begin on federal lands. The Forest Service (FS)-within the U.S. Department of Agriculture-carries out wildfire management and response across the 193 million acres of the National Forest System (NFS). The Department of the Interior (DOI) manages wildfire response for m

In [76]:
# Reduce the analysis result to one {pageNumber, pageContent, identifier} record per page.
analyzedDocumentContent_split_by_page = split_document_to_pages(analyzedDocumentContent)

In [77]:
# Inspect the per-page records.
analyzedDocumentContent_split_by_page

[{'pageNumber': 1,
  'pageContent': '<!-- FigureContent="Congressional Research Service Informing the legislative debate since 1914" --> IN FOCUS Updated June 1, 2023 # Wildfire Statistics Wildfires are unplanned fires, including lightning-caused fires, unauthorized human-caused fires, and escaped fires from prescribed burn projects. States are responsible for responding to wildfires that begin on nonfederal (state, local, and private) lands, except for lands protected by federal agencies under cooperative agreements. The federal government is responsible for responding to wildfires that begin on federal lands. The Forest Service (FS)-within the U.S. Department of Agriculture-carries out wildfire management and response across the 193 million acres of the National Forest System (NFS). The Department of the Interior (DOI) manages wildfire response for more than 400 million acres of national parks, wildlife refuges and preserves, other public lands, and Indian reservations. Wildfire stat

In [78]:
# Collect table records from the analysis result and image records from the PyMuPDF handle.
tablesInDocument = get_tables_in_document(analyzedDocumentContent['tables'])
imagesInDocument = get_images_in_document(doc)

# Replace each image record's raw 'objectContent' with a model-generated description
# of the PNG that get_images_in_document saved under the record's identifier.
# The records are modified in place, so `describedImagesInDocument` holds the same
# dict objects as `imagesInDocument`.
describedImagesInDocument = []
for image in imagesInDocument:
    with open(f"{image['identifier']}.png", "rb") as f:
        image['objectContent'] = describe_figure(f)
        describedImagesInDocument.append(image)

# Combined "object index": tables first, then described figures.
totalObjectsInDocument = tablesInDocument + describedImagesInDocument

In [79]:
# Link each page to the identifiers of the objects found on it.
# join_content_with_identifiers updates the page dicts in place and returns the same list,
# so `content_index` and `analyzedDocumentContent_split_by_page` refer to the same object.
content_index = join_content_with_identifiers(analyzedDocumentContent_split_by_page, totalObjectsInDocument)
content_index

[{'pageNumber': 1,
  'pageContent': '<!-- FigureContent="Congressional Research Service Informing the legislative debate since 1914" --> IN FOCUS Updated June 1, 2023 # Wildfire Statistics Wildfires are unplanned fires, including lightning-caused fires, unauthorized human-caused fires, and escaped fires from prescribed burn projects. States are responsible for responding to wildfires that begin on nonfederal (state, local, and private) lands, except for lands protected by federal agencies under cooperative agreements. The federal government is responsible for responding to wildfires that begin on federal lands. The Forest Service (FS)-within the U.S. Department of Agriculture-carries out wildfire management and response across the 193 million acres of the National Forest System (NFS). The Department of the Interior (DOI) manages wildfire response for more than 400 million acres of national parks, wildlife refuges and preserves, other public lands, and Indian reservations. Wildfire stat

Simulate a RAG Pattern

### assuming this is in the content index

In [80]:
# Page-level records that stand in for the content index.
content_index

[{'pageNumber': 1,
  'pageContent': '<!-- FigureContent="Congressional Research Service Informing the legislative debate since 1914" --> IN FOCUS Updated June 1, 2023 # Wildfire Statistics Wildfires are unplanned fires, including lightning-caused fires, unauthorized human-caused fires, and escaped fires from prescribed burn projects. States are responsible for responding to wildfires that begin on nonfederal (state, local, and private) lands, except for lands protected by federal agencies under cooperative agreements. The federal government is responsible for responding to wildfires that begin on federal lands. The Forest Service (FS)-within the U.S. Department of Agriculture-carries out wildfire management and response across the 193 million acres of the National Forest System (NFS). The Department of the Interior (DOI) manages wildfire response for more than 400 million acres of national parks, wildlife refuges and preserves, other public lands, and Indian reservations. Wildfire stat

### assuming this is in the object index

In [81]:
# Table and figure records that stand in for the object index.
totalObjectsInDocument

[{'pageNumber': 1,
  'identifier': '36268147',
  'type': 'table',
  'objectContent': {'rowCount': 15, 'columnCount': 6, 'cells': [{'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 0, 'content': '', 'boundingRegions': [{'pageNumber': 1, 'polygon': [0.7784, 6.3451, 1.6522, 6.3451, 1.6522, 6.6506, 0.7784, 6.6506]}], 'spans': []}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 1, 'content': '2018', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.6522, 6.3451, 2.211, 6.3451, 2.211, 6.6506, 1.6522, 6.6506]}], 'spans': [{'offset': 1692, 'length': 4}], 'elements': ['/paragraphs/7']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 2, 'content': '2019', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.211, 6.3451, 2.7124, 6.3451, 2.7124, 6.6506, 2.211, 6.6506]}], 'spans': [{'offset': 1699, 'length': 4}], 'elements': ['/paragraphs/8']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 3, 'content': '2020', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.7124,

### Assuming we do the orchestration here. Could be Semantic Kernel, could be Prompt flow ...

In [82]:
# Example user question used for the simulated retrieval and the final model call.
prompt = "What is the change in wild fires from 1993 to 2022?"

In [83]:
# Retrieve the chunks to answer the question.
# Retrieval is simulated here: every record of page 1 is taken as the hit list.
result_content_index = [page for page in content_index if page['pageNumber'] == 1]
result_content_index


[{'pageNumber': 1,
  'pageContent': '<!-- FigureContent="Congressional Research Service Informing the legislative debate since 1914" --> IN FOCUS Updated June 1, 2023 # Wildfire Statistics Wildfires are unplanned fires, including lightning-caused fires, unauthorized human-caused fires, and escaped fires from prescribed burn projects. States are responsible for responding to wildfires that begin on nonfederal (state, local, and private) lands, except for lands protected by federal agencies under cooperative agreements. The federal government is responsible for responding to wildfires that begin on federal lands. The Forest Service (FS)-within the U.S. Department of Agriculture-carries out wildfire management and response across the 193 million acres of the National Forest System (NFS). The Department of the Interior (DOI) manages wildfire response for more than 400 million acres of national parks, wildlife refuges and preserves, other public lands, and Indian reservations. Wildfire stat

In [84]:
# Look up the objects (tables / figures) referenced by the retrieved pages.
result_object_index = []
for result in result_content_index:
    # 'identifier' is either None or a ', '-joined string of object identifiers.
    objectReferences = result['identifier']
    if objectReferences:
        objectReferences = objectReferences.split(', ')
        # The loop variable is `obj`, not `object`, so the builtin `object`
        # is not shadowed for the rest of the kernel session.
        for obj in totalObjectsInDocument:
            if obj['identifier'] in objectReferences:
                result_object_index.append(obj)
result_object_index

[{'pageNumber': 1,
  'identifier': '36268147',
  'type': 'table',
  'objectContent': {'rowCount': 15, 'columnCount': 6, 'cells': [{'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 0, 'content': '', 'boundingRegions': [{'pageNumber': 1, 'polygon': [0.7784, 6.3451, 1.6522, 6.3451, 1.6522, 6.6506, 0.7784, 6.6506]}], 'spans': []}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 1, 'content': '2018', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.6522, 6.3451, 2.211, 6.3451, 2.211, 6.6506, 1.6522, 6.6506]}], 'spans': [{'offset': 1692, 'length': 4}], 'elements': ['/paragraphs/7']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 2, 'content': '2019', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.211, 6.3451, 2.7124, 6.3451, 2.7124, 6.6506, 2.211, 6.6506]}], 'spans': [{'offset': 1699, 'length': 4}], 'elements': ['/paragraphs/8']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 3, 'content': '2020', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.7124,

In [85]:
### generate system message
# Start the grounding context with the content of every referenced object.
system_message = [entry['objectContent'] for entry in result_object_index]

In [86]:
# Grounding context = referenced object contents followed by the retrieved page records.
# It is rebuilt from its sources rather than appended to the previous value of
# `system_message`, so re-running this cell does not duplicate the page records.
system_message = [entry['objectContent'] for entry in result_object_index] + result_content_index
system_message

[{'rowCount': 15, 'columnCount': 6, 'cells': [{'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 0, 'content': '', 'boundingRegions': [{'pageNumber': 1, 'polygon': [0.7784, 6.3451, 1.6522, 6.3451, 1.6522, 6.6506, 0.7784, 6.6506]}], 'spans': []}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 1, 'content': '2018', 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.6522, 6.3451, 2.211, 6.3451, 2.211, 6.6506, 1.6522, 6.6506]}], 'spans': [{'offset': 1692, 'length': 4}], 'elements': ['/paragraphs/7']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 2, 'content': '2019', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.211, 6.3451, 2.7124, 6.3451, 2.7124, 6.6506, 2.211, 6.6506]}], 'spans': [{'offset': 1699, 'length': 4}], 'elements': ['/paragraphs/8']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 3, 'content': '2020', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.7124, 6.3451, 3.209, 6.3451, 3.209, 6.6506, 2.7124, 6.6506]}], 'spans': [{'offset': 1706,

In [87]:
# Ask the model the question, grounded in the assembled context (objects + page records).
document_ask(prompt,system_message)

'Based on the provided context and data, the change in the number of wildfires from 1993 to 2022 can be summarized as follows:\n\n- **In 1993:** The number of fires was approximately 75,000.\n- **In 2022:** The number of fires was approximately 65,000.\n\nSo, there is a decrease of about 10,000 wildfires from 1993 to 2022. This indicates that while the number of annual wildfires fluctuates, there has been a slight overall decrease over this period.'