In [1]:
# import libraries
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeResult
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest

from dotenv import load_dotenv


load_dotenv()

# set `<your-endpoint>` and `<your-key>` variables with the values from the Azure portal
endpoint = os.getenv("AZURE_ENDPOINT")
key = os.getenv("AZURE_KEY")

# helper functions

def get_words(page, line):
    result = []
    for word in page.words:
        if _in_span(word, line.spans):
            result.append(word)
    return result


def _in_span(word, spans):
    for span in spans:
        if word.span.offset >= span.offset and (
            word.span.offset + word.span.length
        ) <= (span.offset + span.length):
            return True
    return False


def analyze_layout():
    # sample document
    formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", AnalyzeDocumentRequest(url_source=formUrl
    ))

    result: AnalyzeResult = poller.result()

    if result.styles and any([style.is_handwritten for style in result.styles]):
        print("Document contains handwritten content")
    else:
        print("Document does not contain handwritten content")

    for page in result.pages:
        print(f"----Analyzing layout from page #{page.page_number}----")
        print(
            f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}"
        )

        if page.lines:
            for line_idx, line in enumerate(page.lines):
                words = get_words(page, line)
                print(
                    f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
                    f"within bounding polygon '{line.polygon}'"
                )

                for word in words:
                    print(
                        f"......Word '{word.content}' has a confidence of {word.confidence}"
                    )

        if page.selection_marks:
            for selection_mark in page.selection_marks:
                print(
                    f"Selection mark is '{selection_mark.state}' within bounding polygon "
                    f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
                )

    if result.tables:
        for table_idx, table in enumerate(result.tables):
            print(
                f"Table # {table_idx} has {table.row_count} rows and "
                f"{table.column_count} columns"
            )
            if table.bounding_regions:
                for region in table.bounding_regions:
                    print(
                        f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}"
                    )
            for cell in table.cells:
                print(
                    f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'"
                )
                if cell.bounding_regions:
                    for region in cell.bounding_regions:
                        print(
                            f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'"
                        )

    print("----------------------------------------")



In [12]:
import base64

def analyze_layout_local(local_image_path):
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    with open(local_image_path, "rb") as f:
        image_data = f.read()
        base64_image = base64.b64encode(image_data).decode("utf-8")

    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        body={"base64Source": base64_image}
    )

    result: AnalyzeResult = poller.result()
    
    if result.styles and any([style.is_handwritten for style in result.styles]):
        print("Document contains handwritten content")
    else:
        print("Document does not contain handwritten content")

    for page in result.pages:
        print(f"----Analyzing layout from page #{page.page_number}----")
        print(
            f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}"
        )

        if page.lines:
            for line_idx, line in enumerate(page.lines):
                words = get_words(page, line)
                print(
                    f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
                    f"within bounding polygon '{line.polygon}'"
                )

                for word in words:
                    print(
                        f"......Word '{word.content}' has a confidence of {word.confidence}"
                    )

        if page.selection_marks:
            for selection_mark in page.selection_marks:
                print(
                    f"Selection mark is '{selection_mark.state}' within bounding polygon "
                    f"'{selection_mark.polygon}' and has a confidence of {selection_mark.confidence}"
                )

    if result.tables:
        for table_idx, table in enumerate(result.tables):
            print(
                f"Table # {table_idx} has {table.row_count} rows and "
                f"{table.column_count} columns"
            )
            if table.bounding_regions:
                for region in table.bounding_regions:
                    print(
                        f"Table # {table_idx} location on page: {region.page_number} is {region.polygon}"
                    )
            for cell in table.cells:
                print(
                    f"...Cell[{cell.row_index}][{cell.column_index}] has text '{cell.content}'"
                )
                if cell.bounding_regions:
                    for region in cell.bounding_regions:
                        print(
                            f"...content on page {region.page_number} is within bounding polygon '{region.polygon}'"
                        )
                        
    print("----------------------------------------")
    return result


In [2]:

analyze_layout()

Document does not contain handwritten content
----Analyzing layout from page #1----
Page has width: 8.5 and height: 11.0, measured with unit: LengthUnit.INCH
...Line # 0 has word count 2 and text 'UNITED STATES' within bounding polygon '[3.4695, 0.6555, 5.0216, 0.6576, 5.0214, 0.847, 3.4693, 0.845]'
......Word 'UNITED' has a confidence of 0.997
......Word 'STATES' has a confidence of 0.998
...Line # 1 has word count 4 and text 'SECURITIES AND EXCHANGE COMMISSION' within bounding polygon '[2.1754, 0.8727, 6.3155, 0.8723, 6.3155, 1.0737, 2.1754, 1.0742]'
......Word 'SECURITIES' has a confidence of 0.993
......Word 'AND' has a confidence of 0.998
......Word 'EXCHANGE' has a confidence of 0.992
......Word 'COMMISSION' has a confidence of 0.992
...Line # 2 has word count 3 and text 'Washington, D.C. 20549' within bounding polygon '[3.4515, 1.0922, 5.0382, 1.0888, 5.0386, 1.2532, 3.4518, 1.2565]'
......Word 'Washington,' has a confidence of 0.995
......Word 'D.C.' has a confidence of 0.993
.

In [13]:
result = analyze_layout_local(r'D:\D_Documents\Dataal Africa\Ops Code\InsureFlow\Backend\images\form1.jpg')

Document contains handwritten content
----Analyzing layout from page #1----
Page has width: 744.0 and height: 992.0, measured with unit: LengthUnit.PIXEL
...Line # 0 has word count 2 and text 'SOUTHERN ORACLE' within bounding polygon '[270.0, 25.0, 524.0, 16.0, 525.0, 37.0, 270.0, 46.0]'
......Word 'SOUTHERN' has a confidence of 0.992
......Word 'ORACLE' has a confidence of 0.994
...Line # 1 has word count 2 and text 'MICROINSURANCE COMPANY' within bounding polygon '[311.0, 49.0, 483.0, 43.0, 484.0, 58.0, 312.0, 65.0]'
......Word 'MICROINSURANCE' has a confidence of 0.914
......Word 'COMPANY' has a confidence of 0.989
...Line # 2 has word count 1 and text '!' within bounding polygon '[544.0, 60.0, 560.0, 60.0, 560.0, 84.0, 544.0, 84.0]'
......Word '!' has a confidence of 0.17
...Line # 3 has word count 7 and text 'TEMPORARY LOSS OF INCOME PROTECTION PROPOSAL FORM' within bounding polygon '[51.0, 112.0, 693.0, 98.0, 694.0, 120.0, 51.0, 136.0]'
......Word 'TEMPORARY' has a confidence of 

In [25]:
    
from pydantic import BaseModel
from typing import Optional

class ExtractedWord(BaseModel):
    word: str
    confidence: float
    
    
class ExtractedField(BaseModel):
    value: str
    confidence: float


class ExtractedText(BaseModel):
    text: str
    words: list[ExtractedWord]
    


class ExtractedText(BaseModel):
    text: str
    confidence: float


class AgentDetails(BaseModel):
    full_name_and_surname: ExtractedText
    agent_number: ExtractedText


class NextOfKinDetails(BaseModel):
    full_name_and_surname: ExtractedText
    contact_number: ExtractedText


class InsuredDetails(BaseModel):
    full_name_and_surname: ExtractedText
    title: ExtractedText
    id_number: ExtractedText
    contact_number: ExtractedText
    residential_address: ExtractedText
    gender: ExtractedText
    date_of_birth: ExtractedText
    email_address: Optional[ExtractedText]
    next_of_kin_details: NextOfKinDetails


class BankDetails(BaseModel):
    bank_name: ExtractedText
    account_number: ExtractedText
    branch_code: ExtractedText
    date_of_birth: ExtractedText


class InsuranceDocument(BaseModel):
    agent_details: AgentDetails
    insured_details: InsuredDetails
    bank_details: BankDetails


for page in result.pages:
        print(f"----Analyzing layout from page #{page.page_number}----")
        # print(
        #     f"Page has width: {page.width} and height: {page.height}, measured with unit: {page.unit}"
        # )

        if page.lines:
            for line_idx, line in enumerate(page.lines):
                words = get_words(page, line)
                print(
                    # f"...Line # {line_idx} has word count {len(words)} and text '{line.content}' "
                    # f"within bounding polygon '{line.polygon}'"
                    line.content
                )

                # for word in words:
                #     print(
                #         f"......Word '{word.content}' has a confidence of {word.confidence}"
                #     )

----Analyzing layout from page #1----
SOUTHERN ORACLE
MICROINSURANCE COMPANY
!
TEMPORARY LOSS OF INCOME PROTECTION PROPOSAL FORM
AGENT DETAILS
FULL NAME AND SURNAME
John Doe
AGENT NUMBER:
75 3379
INSURED DETAILS
FULL NAME AND SURNAME:
Tonderal Brandon Mutombua
TITLE:
Mr
GENDER:
Male
ID NUMBER:
64-204479
DATE OF BIRTH:
12-10-2020
CONTACT NUMBER:
0776 464 136
EMAIL ADDRESS:
brandon@gmail.com
RESIDENTIAL ADDRESS:
22 Alps Street, Chitungwiza
NEXT OF KIN DETAILS:
FULL NAME AND SURNAME:
Luke Mutombwa
CONTACT NUMBER:
0773 798 261
BANK DETAILS
BANK NAME
CBZ
DATE OF BIRTH:
12-10-25
ACCOUNT NUMBER:
25 3799
BRANCH CODE
1$5
TEMPORARY LOSS OF INCOME PROTECTION PROPOSAL FORM |REV 0 |DATE:2023/02/10 |PAGE 1 OF 9


In [29]:
def extract_fields(result):
    fields = {
        "full_name_and_surname": ["name", "surname", "full name"],
        "agent_number": ["agent number", "agent id"],
        "date_of_birth": ["date of birth", "dob"],
        # Add more fields as needed
    }

    extracted_fields = {}

    for page in result.pages:
        if page.lines:
            for line_idx, line in enumerate(page.lines):
                line_text = line.content.lower()
                words = get_words(page, line)
                for field, keywords in fields.items():
                    for keyword in keywords:
                        if keyword in line_text:
                            # Try to extract the value from the current or next line
                            if line_idx < len(page.lines) - 1:
                                next_line = page.lines[line_idx + 1]
                                next_line_text = next_line.content.strip()
                                extracted_fields[field] = ExtractedField(value=next_line_text, confidence=words[0].confidence)
                            else:
                                # If it's the last line, try to extract the value from the current line
                                value = line_text.replace(keyword, "").strip()
                                if ":" in value:
                                    value = value.split(":")[1].strip()
                                extracted_fields[field] = ExtractedField(value=value, confidence=words[0].confidence)

    return extracted_fields

In [32]:
extracted_fields = extract_fields(result)
extracted_fields

{'full_name_and_surname': ExtractedField(value='CBZ', confidence=0.963),
 'agent_number': ExtractedField(value='75 3379', confidence=0.992),
 'date_of_birth': ExtractedField(value='12-10-25', confidence=0.991)}

{'full_name_and_surname': ExtractedField(value='CBZ', confidence=0.963),
 'agent_number': ExtractedField(value='75 3379', confidence=0.992),
 'date_of_birth': ExtractedField(value='12-10-25', confidence=0.991)}