In [29]:
#!pip install python-dotenv
#!pip install openai
#!pip install imageio
#!pip install azure-ai-documentintelligence
#!pip install azure-core

In [30]:
from openai import AzureOpenAI
import dotenv
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest
import json

dotenv.load_dotenv(override=True)

True

In [31]:
client = AzureOpenAI(
    azure_endpoint=os.getenv('AzureOpenAiEndpoint'),
    api_key=os.getenv('AzureOpenAiKey'),
    azure_deployment='gpt-4o',
    api_version='2024-02-15-preview'
)

In [32]:
documentIntelligenceClient = DocumentIntelligenceClient(
        credential=AzureKeyCredential(key=os.getenv('AzureDiKey')),
        endpoint=os.getenv('AzureDiEndpoint')
        )

In [60]:
def analyze_document(file):
    poller = documentIntelligenceClient.begin_analyze_document(
            "prebuilt-layout", AnalyzeDocumentRequest(
                bytes_source=file,
            ),output_content_format="markdown"
    )
    return poller.result()

In [37]:
def split_document_to_pages(document):
    totalPages = document['pages']
    documentContent = []
    for page in totalPages:
        pageContent = {
            "pageNumber": page['pageNumber'],
            "pageContent": ' '.join([pageLineContent.content for pageLineContent in page.lines if pageLineContent is not None])
        }
        documentContent.append(pageContent.copy())
    return documentContent



In [40]:
def document_ask(prompt,analyzedDocumentContent):
    response =client.chat.completions.create(
        model='gpt-4o',
        messages=[
            {
                'role': 'user',
                'content': prompt
            },
            {
                'role': 'system',
                'content': f'You know about this file: {analyzedDocumentContent}'
            }
        ]
    )
    return response.choices[0].message.content

### Aspirin File

In [33]:
with open("aspirin.pdf", "rb") as f:
    aspirin_file = f.read()

In [34]:
aspirin_file

b'%PDF-1.7\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(de) /StructTreeRoot 46 0 R/MarkInfo<</Marked true>>/Metadata 466 0 R/ViewerPreferences 467 0 R>>\r\nendobj\r\n2 0 obj\r\n<</Type/Pages/Count 5/Kids[ 4 0 R 23 0 R 39 0 R 41 0 R 43 0 R] >>\r\nendobj\r\n3 0 obj\r\n<</MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Enabled(true) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Method(Standard) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_SiteId(72f988bf-86f1-41af-91ab-2d7cd011db47) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_ContentBits(0) /Author(Christian Wunderlich) /Creator(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005) /CreationDate(D:20240711163939+02\'00\') /ModDate(D:20240711163939+02\'00\') /Producer(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\

In [48]:
aspirin_file_analyzed = analyze_document(aspirin_file)
print(aspirin_file_analyzed)

KeyboardInterrupt: 

In [38]:
aspirin_file_per_page = split_document_to_pages(aspirin_file_analyzed)

In [39]:
aspirin_file_per_page

[{'pageNumber': 1,
 {'pageNumber': 2,
  'pageContent': '## Uses and Dosage of Aspirin Aspirin has multiple uses and benefits, depending on the dose and the condition being treated. Some of the common uses of aspirin are: . Pain relief: Aspirin can reduce mild to moderate pain caused by headaches, toothaches, menstrual cramps, arthritis, sprains, and other conditions. The usual dose for pain relief is 325 to 650 mg every four to six hours, as needed. . Fever reduction: Aspirin can lower the body temperature in cases of fever caused by infections or inflammation. The usual dose for fever reduction is 325 to 650 mg every four to six hours, as needed. . Inflammation reduction: Aspirin can decrease the swelling and redness caused by inflammation, such as in rheumatoid arthritis, osteoarthritis, or bursitis. The usual dose for inflammation reduction is 650 to 1000 mg every four to six hours, as needed. . Blood clot prevention: Aspirin can prevent the formation of blood clots that can cause h

In [43]:
document_ask("What is the average percentage of aspirin prescriptions for all countires?",aspirin_file_per_page)

'Based on the data provided in the document, the percentages of aspirin prescriptions for various countries are as follows:\n\n- Russia: 27.7%\n- Germany: 43.0%\n- France: 37.5%\n- United Kingdom: 33.9%\n- Italy: 31.0%\n- Spain: 30.1%\n\nTo find the average percentage of aspirin prescriptions for all these countries, we calculate the mean of these values:\n\n\\[ \\text{Average} = \\frac{27.7 + 43.0 + 37.5 + 33.9 + 31.0 + 30.1}{6} \\]\n\n\\[ \\text{Average} = \\frac{203.2}{6} \\]\n\n\\[ \\text{Average} \\approx 33.87\\% \\]\n\nTherefore, the average percentage of aspirin prescriptions across these six countries is approximately 33.87%.'

In [44]:
documentContent = aspirin_file_per_page + aspirin_file_analyzed.tables

In [45]:
documentContent

[{'pageNumber': 1,
 {'pageNumber': 2,
  'pageContent': '## Uses and Dosage of Aspirin Aspirin has multiple uses and benefits, depending on the dose and the condition being treated. Some of the common uses of aspirin are: . Pain relief: Aspirin can reduce mild to moderate pain caused by headaches, toothaches, menstrual cramps, arthritis, sprains, and other conditions. The usual dose for pain relief is 325 to 650 mg every four to six hours, as needed. . Fever reduction: Aspirin can lower the body temperature in cases of fever caused by infections or inflammation. The usual dose for fever reduction is 325 to 650 mg every four to six hours, as needed. . Inflammation reduction: Aspirin can decrease the swelling and redness caused by inflammation, such as in rheumatoid arthritis, osteoarthritis, or bursitis. The usual dose for inflammation reduction is 650 to 1000 mg every four to six hours, as needed. . Blood clot prevention: Aspirin can prevent the formation of blood clots that can cause h

In [46]:
document_ask("What is the average percentage of aspirin prescriptions for all countires?",documentContent)

'To calculate the average percentage of aspirin prescriptions for all the countries listed, you need to first add up the percentages and then divide by the number of countries.\n\nHere are the percentages of aspirin prescriptions for each country:\n\n- Russia: 27.7%\n- Germany: 43.0%\n- France: 37.5%\n- United Kingdom: 33.9%\n- Italy: 31.0%\n- Spain: 30.1%\n\n### Step-by-Step Calculation:\n1. Add the percentages together:\n   \\[\n   27.7 + 43.0 + 37.5 + 33.9 + 31.0 + 30.1 = 203.2\n   \\]\n\n2. Divide by the number of countries (which is 6):\n   \\[\n   \\frac{203.2}{6} \\approx 33.87\n   \\]\n\nSo, the average percentage of aspirin prescriptions for all the countries listed is approximately **33.87%**.'

### MSFT Report

In [49]:
with open("msft.pdf", "rb") as f:
    msft_file = f.read()

In [52]:
msft_file

b'%PDF-1.7\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(de) /StructTreeRoot 31 0 R/MarkInfo<</Marked true>>/Metadata 2999 0 R/ViewerPreferences 3000 0 R>>\r\nendobj\r\n2 0 obj\r\n<</Type/Pages/Count 1/Kids[ 4 0 R] >>\r\nendobj\r\n3 0 obj\r\n<</MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Enabled(true) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Method(Standard) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_SiteId(72f988bf-86f1-41af-91ab-2d7cd011db47) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_ContentBits(0) /Producer(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005) /Creator(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005) /CreationDate(D:20240712123751+02\'00\') /ModDate(D:20240712123751+02\'0

In [61]:
msft_file_analyzed = analyze_document(msft_file)
print(msft_file_analyzed)

{'apiVersion': '2024-02-29-preview', 'modelId': 'prebuilt-layout', 'stringIndexType': 'textElements', 'content': 'NOTE 4 - INVESTMENTS\n===\n\n\n# Investment Components\n\nThe components of investments were as follows:\n\n| (In millions) | Fair Value Level || Adjusted Unrealized Cost Basis  Gains | Unrealized Losses | Recorded Basis | Cash and Cash|| Equivalents Investments Investments  Short-term  Equity |\n| - | - | - | - | - | - | - | - | - |\n| June 30, 2023 | | | | | | | | |\n| Changes in Fair Value Recorded in Other Comprehensive Income || | | | | | | |\n| Commercial paper | Level 2 | $ 16,589 | $ 0 | $ 0 | $ 16,589 | $ 12,231 | $ 4,358 | $ 0 |\n| Certificates of deposit | Level 2 | 2,701 | 0 | 0 | 2,701 | 2,657 | 44 | 0 |\n| U.S. government securities | Level 1 | 65,237 | 2 | (3,870) | 61,369 | 2,991 | 58,378 | 0 |\n| U.S. agency securities :unselected: | Level 2 | 2,703 | 0 | 0 | 2,703 | 894 | 1,809 | 0 |\n| Foreign government bonds | Level 2 | 498 | 1 | (24) | 475 | 0 | 475 | 

In [62]:
msft_file_per_page = split_document_to_pages(msft_file_analyzed)

In [63]:
msft_file_per_page

[{'pageNumber': 1,
  'pageContent': 'NOTE 4 - INVESTMENTS\n=== # Investment Components The components of investments were as follows: (In millions) Fair Value Level Adjusted Unrealized Cost Basis Gains Unrealized Losses Recorded Basis Cash and Cash Equivalents Investments Investments Short-term Equity June 30, 2023 Changes in Fair Value Recorded in Other Comprehensive Income Commercial paper Level 2 $ 16,589 $ 0 $ 0 $ 16,589 $ 12,231 $ 4,358 $ 0 Certificates of deposit Level 2 2,701 0 0 2,701 2,657 44 0 U.S. government securities Level 1 65,237 2 (3,870) 61,369 2,991 58,378 0 U.S. agency securities Level 2 2,703 0 0 2,703 894 1,809 0 Foreign government bonds Level 2 498 1 (24) 475 0 475 0 Mortgage- and asset- backed securities Level 2 824 1 (39) 786 0 786 0 Corporate notes and bonds Level 2 10,809 8 (583) 10,234 0 10,234 0 Corporate notes and bonds Level 3 120 0 0 120 0 120 0 Municipal securities Level 2 285 1 (18) 268 7 261 0 Municipal securities Level 3 103 0 (16) 87 0 87 0 Total deb

In [64]:
document_ask("What is the Unrealized Gains for Municipal securities??",msft_file_per_page)

'To determine the Unrealized Gains for Municipal securities, you can refer to the provided investment data:\n\n### Municipal Securities (June 30, 2023):\n- **Level 2 Municipal Securities:**\n  - Fair Value: $285 million\n  - Unrealized Gains: $1 million\n  - Unrealized Losses: $18 million\n  - Recorded Basis: $268 million\n\n- **Level 3 Municipal Securities:**\n  - Fair Value: $103 million\n  - Unrealized Gains: $0 million\n  - Unrealized Losses: $16 million\n  - Recorded Basis: $87 million\n\n### Summary:\n- **Total Unrealized Gains for Municipal Securities:** $1 million (contributed by Level 2 Municipal Securities as there are no gains in Level 3).\n\nSo, the Unrealized Gains for Municipal securities as of June 30, 2023, are **$1 million**.'

As you can see, the outcome is wrong.

![](result_table_1_wrong.png)

In [66]:
document_ask("What is the Equity investments of Level 1 for Recorded Basis of 2023?",msft_file_per_page)

'The Equity investments of Level 1 for Recorded Basis of 2023 are $2,692 million.\n\nThis information can be found in the line item for Level 1 Equity investments under "Changes in Fair Value Recorded in Net Income."'