In [15]:
#!pip install python-dotenv
#!pip install openai
#!pip install imageio
#!pip install azure-ai-documentintelligence
#!pip install azure-core

In [16]:
from openai import AzureOpenAI
import dotenv
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult, AnalyzeDocumentRequest
import json

# Load settings from a local .env file into the process environment; later cells
# read the Azure endpoints and keys from environment variables.
# override=True is passed so that values from .env should replace variables that
# are already set (see the python-dotenv documentation).
dotenv.load_dotenv(override=True)

True

In [17]:
# Azure OpenAI client used by document_ask. Endpoint and key are looked up in the
# environment; a missing variable yields None rather than raising here.
client = AzureOpenAI(
    api_version="2024-02-15-preview",
    azure_deployment="gpt-4o",
    api_key=os.environ.get("AzureOpenAiKey"),
    azure_endpoint=os.environ.get("AzureOpenAiEndpoint"),
)

In [18]:
# Document Intelligence client used by analyze_document, authenticated with a key
# credential. Both the endpoint and the key are read from the environment.
documentIntelligenceClient = DocumentIntelligenceClient(
    endpoint=os.environ.get("AzureDiEndpoint"),
    credential=AzureKeyCredential(key=os.environ.get("AzureDiKey")),
)

In [19]:
def analyze_document(file):
    """Run the "prebuilt-layout" model over a document and return the finished result.

    Args:
        file: Raw document content, forwarded as ``bytes_source`` of the
            analyze request (the notebook passes the bytes of a PDF).

    Returns:
        The value returned by the poller's ``result()`` for the analysis; the
        request asks for the content in markdown format.
    """
    analyze_request = AnalyzeDocumentRequest(bytes_source=file)
    # begin_analyze_document hands back a poller; result() yields the final analysis.
    return documentIntelligenceClient.begin_analyze_document(
        "prebuilt-layout",
        analyze_request,
        output_content_format="markdown",
    ).result()

In [20]:
def split_document_to_pages(document):
    """Flatten an analyzed document into one text entry per page.

    Args:
        document: Analysis result exposing ``document['pages']``. Each page must
            support ``page['pageNumber']`` and ``page.lines``, and each line
            must expose ``.content``.

    Returns:
        A list of ``{"pageNumber": ..., "pageContent": ...}`` dicts in page
        order. ``pageContent`` is the page's line texts joined by single
        spaces, or an empty string for a page that has no lines.
    """
    documentContent = []
    for page in document['pages']:
        # `lines` is optional on a page (e.g. a page with no recognised text);
        # treat a missing value as "no lines" instead of failing on iterating None.
        lines = page.lines or []
        documentContent.append({
            "pageNumber": page['pageNumber'],
            "pageContent": ' '.join(line.content for line in lines if line is not None),
        })
    return documentContent



In [21]:
def document_ask(prompt, analyzedDocumentContent, model='gpt-4o'):
    """Ask the chat model a question about previously analyzed document content.

    Args:
        prompt: The user's question.
        analyzedDocumentContent: Document content to ground the answer on. It is
            interpolated into the system message through an f-string, so any
            object is sent in its string form.
        model: Model name passed to the chat completions call. Defaults to
            'gpt-4o', the value that used to be hard-coded.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [
        # The system context goes first so the document precedes the question,
        # following the usual system -> user ordering of chat conversations.
        {
            'role': 'system',
            'content': f'You know about this file: {analyzedDocumentContent}'
        },
        {
            'role': 'user',
            'content': prompt
        },
    ]
    response = client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

### MSFT Report

In [22]:
# Read the whole PDF into memory as bytes (binary mode); these bytes are what
# analyze_document later sends as the request's bytes_source.
# The path is relative, so it is resolved against the kernel's working directory.
with open("msft.pdf", "rb") as f:
    msft_file = f.read()

In [23]:
# Sanity check: echo the raw bytes; the output should start with the %PDF header.
msft_file

b'%PDF-1.7\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(de) /StructTreeRoot 31 0 R/MarkInfo<</Marked true>>/Metadata 2999 0 R/ViewerPreferences 3000 0 R>>\r\nendobj\r\n2 0 obj\r\n<</Type/Pages/Count 1/Kids[ 4 0 R] >>\r\nendobj\r\n3 0 obj\r\n<</MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Enabled(true) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_Method(Standard) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_SiteId(72f988bf-86f1-41af-91ab-2d7cd011db47) /MSIP_Label_f42aa342-8706-4288-bd11-ebb85995028c_ContentBits(0) /Producer(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005) /Creator(\xfe\xff\x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00\xae\x00 \x00W\x00o\x00r\x00d\x00 \x00f\x00o\x00r\x00 \x00M\x00i\x00c\x00r\x00o\x00s\x00o\x00f\x00t\x00 \x003\x006\x005) /CreationDate(D:20240712123751+02\'00\') /ModDate(D:20240712123751+02\'0

In [24]:
# Send the PDF bytes through the layout model, then print the full result for inspection.
msft_file_analyzed = analyze_document(file=msft_file)
print(msft_file_analyzed)

{'apiVersion': '2024-02-29-preview', 'modelId': 'prebuilt-layout', 'stringIndexType': 'textElements', 'content': 'NOTE 4 - INVESTMENTS\n===\n\n\n# Investment Components\n\nThe components of investments were as follows:\n\n| (In millions) | Fair Value Level || Adjusted Unrealized Cost Basis  Gains | Unrealized Losses | Recorded Basis | Cash and Cash|| Equivalents Investments Investments  Short-term  Equity |\n| - | - | - | - | - | - | - | - | - |\n| June 30, 2023 | | | | | | | | |\n| Changes in Fair Value Recorded in Other Comprehensive Income || | | | | | | |\n| Commercial paper | Level 2 | $ 16,589 | $ 0 | $ 0 | $ 16,589 | $ 12,231 | $ 4,358 | $ 0 |\n| Certificates of deposit | Level 2 | 2,701 | 0 | 0 | 2,701 | 2,657 | 44 | 0 |\n| U.S. government securities | Level 1 | 65,237 | 2 | (3,870) | 61,369 | 2,991 | 58,378 | 0 |\n| U.S. agency securities :unselected: | Level 2 | 2,703 | 0 | 0 | 2,703 | 894 | 1,809 | 0 |\n| Foreign government bonds | Level 2 | 498 | 1 | (24) | 475 | 0 | 475 | 

In [25]:
# Reduce the analysis result to one {"pageNumber", "pageContent"} entry per page.
msft_file_per_page = split_document_to_pages(document=msft_file_analyzed)

In [26]:
# Inspect the flattened per-page text that is passed to document_ask below.
msft_file_per_page

[{'pageNumber': 1,
  'pageContent': 'NOTE 4 - INVESTMENTS\n=== # Investment Components The components of investments were as follows: (In millions) Fair Value Level Adjusted Unrealized Cost Basis Gains Unrealized Losses Recorded Basis Cash and Cash Equivalents Investments Investments Short-term Equity June 30, 2023 Changes in Fair Value Recorded in Other Comprehensive Income Commercial paper Level 2 $ 16,589 $ 0 $ 0 $ 16,589 $ 12,231 $ 4,358 $ 0 Certificates of deposit Level 2 2,701 0 0 2,701 2,657 44 0 U.S. government securities Level 1 65,237 2 (3,870) 61,369 2,991 58,378 0 U.S. agency securities Level 2 2,703 0 0 2,703 894 1,809 0 Foreign government bonds Level 2 498 1 (24) 475 0 475 0 Mortgage- and asset- backed securities Level 2 824 1 (39) 786 0 786 0 Corporate notes and bonds Level 2 10,809 8 (583) 10,234 0 10,234 0 Corporate notes and bonds Level 3 120 0 0 120 0 120 0 Municipal securities Level 2 285 1 (18) 268 7 261 0 Municipal securities Level 3 103 0 (16) 87 0 87 0 Total deb

In [35]:
# Question answered from the flattened per-page text.
document_ask(
    prompt="Provide the exact number of recorded basis for equity investments of level 1 in 2023.",
    analyzedDocumentContent=msft_file_per_page,
)

'The recorded basis for Level 1 equity investments in 2023 is $2,692 million.'

In [28]:
# Question answered from the flattened per-page text.
document_ask(
    prompt="Provide the exact number of recorded basis for equity investments of level 1 in 2022.",
    analyzedDocumentContent=msft_file_per_page,
)

'For the year ending June 30, 2022, the recorded basis for Level 1 equity investments was $456 million.'

In [29]:
# Question answered from the flattened per-page text.
document_ask(
    prompt="Provide the recorded basis for Corporate notes and bonds of Level 3 in 2023.",
    analyzedDocumentContent=msft_file_per_page,
)

'The recorded basis for Corporate notes and bonds of Level 3 in 2023 is $120 million.'

In [30]:
# Question answered from the flattened per-page text.
document_ask(
    prompt="Provide the recorded basis for Corporate notes and bonds of Level 3 in 2022.",
    analyzedDocumentContent=msft_file_per_page,
)

"As of June 30, 2022, the recorded basis for Corporate notes and bonds classified as Level 3 investments was $67 million. There were no adjusted unrealized gains or losses associated with these investments.\n\nHere's a summary for quick reference:\n\n- **Corporate notes and bonds (Level 3) in 2022:**\n  - Fair Value Level: Level 3\n  - Adjusted Unrealized Gains: $0\n  - Adjusted Unrealized Losses: $0\n  - Recorded Basis: $67 million"

In [33]:
# Question answered from the flattened per-page text.
document_ask(
    prompt="Provide the total number for unrealized gains of total debt investments for 2023 and 2022",
    analyzedDocumentContent=msft_file_per_page,
)

'Based on the provided content, the unrealized gains for total debt investments can be summarized for 2023 and 2022 as follows:\n\nFor the year ending June 30, 2023:\n- Total unrealized gains for debt investments: \\( \\$13 \\) million.\n\nFor the year ending June 30, 2022:\n- Total unrealized gains for debt investments: \\( \\$53 \\) million.\n\nTherefore, the total unrealized gains for total debt investments are \\( \\$13 \\) million for 2023 and \\( \\$53 \\) million for 2022.'

**Extracurricular: the same questions, asked against the extracted tables**

In [36]:
# Keep only the table structures from the analysis result.
# Note: despite the singular name, this holds a list of tables (see the output below).
table = msft_file_analyzed.tables

In [37]:
# Inspect the extracted table structures (cells with row/column indices and content).
table

[{'rowCount': 22, 'columnCount': 9, 'cells': [{'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 0, 'content': '(In millions)', 'boundingRegions': [{'pageNumber': 1, 'polygon': [0.4137, 1.294, 2.0871, 1.294, 2.0871, 1.7066, 0.4137, 1.7066]}], 'spans': [{'offset': 102, 'length': 13}], 'elements': ['/paragraphs/3']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 1, 'content': 'Fair Value Level', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.0871, 1.294, 2.7876, 1.294, 2.7876, 1.7066, 2.0871, 1.7066]}], 'spans': [{'offset': 118, 'length': 16}], 'elements': ['/paragraphs/4']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 2, 'content': 'Adjusted\nCost Basis', 'boundingRegions': [{'pageNumber': 1, 'polygon': [2.7876, 1.294, 3.5114, 1.294, 3.5114, 1.7066, 2.7876, 1.7066]}], 'spans': [{'offset': 138, 'length': 8}, {'offset': 158, 'length': 10}], 'elements': ['/paragraphs/5']}, {'kind': 'columnHeader', 'rowIndex': 0, 'columnIndex': 3, 'content': 'Unrealized\nGains',

In [44]:
# Question answered from the extracted table structures instead of the page text.
document_ask(
    prompt="Provide the exact number of recorded basis for equity investments of level 1 in 2023.",
    analyzedDocumentContent=table,
)

'The exact number of recorded basis for equity investments of Level 1 in 2023 is \\$10,138 million.'

In [45]:
# Question answered from the extracted table structures instead of the page text.
document_ask(
    prompt="Provide the exact number of recorded basis for equity investments of level 1 in 2022.",
    analyzedDocumentContent=table,
)

'The exact number of recorded bases for equity investments classified as Level 1 in 2022 is 1,590 (expressed in millions of dollars).\n\nThis information can be found in the provided table, specifically in the row labeled "Equity investments" under the "Changes in Fair Value Recorded in Net Income" section and the "Level 1" column for June 30, 2022.'

In [46]:
# Question answered from the extracted table structures instead of the page text.
document_ask(
    prompt="Provide the recorded basis for Corporate notes and bonds of Level 3 in 2023.",
    analyzedDocumentContent=table,
)

'In 2023, the recorded basis for "Corporate notes and bonds" classified under Level 3 was $120 million.'

In [47]:
# Question answered from the extracted table structures instead of the page text.
document_ask(
    prompt="Provide the recorded basis for Corporate notes and bonds of Level 3 in 2022.",
    analyzedDocumentContent=table,
)

'In 2022, the recorded basis for Corporate notes and bonds of Level 3 was $120 million.'

In [48]:
# Question answered from the extracted table structures instead of the page text.
document_ask(
    prompt="Provide the total number for unrealized gains of total debt investments for 2023 and 2022",
    analyzedDocumentContent=table,
)

'The specific unrealized gains for total debt investments for the years 2023 and 2022 are as follows:\n\n- **2023:**\n  - Unrealized Gains: \\$13 million\n\n- **2022:**\n  - Unrealized Gains: \\$53 million\n\nThis data can be found in the "Unrealized Gains" column for "Total debt investments" in the respective years of the provided table.'