In [34]:
from datetime import date
from google import genai
from google.genai import types

from pydantic import BaseModel, Field

from dotenv import load_dotenv
load_dotenv()

client = genai.Client()


# One extracted value together with a confidence number for it.
# NOTE(review): a second class with this same name is defined later in this
# cell (with `confidence_score` instead of `confidence`) and rebinds the name;
# only models declared before that point use this version.
class FieldWithConfidence(BaseModel):
    value: str | date  # extracted content, either free text or a date
    confidence: float  # presumably a 0-1 score — TODO confirm the expected range

# Schema for the fields of a Zimbabwe national registration card. Every field
# is a FieldWithConfidence, i.e. a value plus its confidence number.
# (Kept as `#` comments rather than a docstring so the generated JSON schema
# is not given an extra description.)
class ZimbabweNationalRegistrationCard(BaseModel):
    id_number: FieldWithConfidence
    surname: FieldWithConfidence
    first_name: FieldWithConfidence
    date_of_birth: FieldWithConfidence
    # Nullable field. Spelled `X | None` (same union syntax as `str | date`
    # above) because `typing.Optional` is only imported further down in this
    # cell, so `Optional[...]` here raised NameError on a fresh kernel.
    village_of_origin: FieldWithConfidence | None
    place_of_birth: FieldWithConfidence
    date_of_issue: FieldWithConfidence

    class Config:
        # Render date values as DD/MM/YYYY when the model is dumped to JSON;
        # anything that is not a date is passed through unchanged.
        json_encoders = {
            date: lambda v: v.strftime('%d/%m/%Y') if isinstance(v, date) else v
        }


from pydantic import BaseModel
from datetime import date
from typing import Optional
from pydantic import BaseModel
from datetime import date
from typing import Optional

# One extracted value together with a confidence number for it.
# NOTE(review): this rebinds the `FieldWithConfidence` name defined earlier in
# the cell; the earlier version calls the second field `confidence`. Models
# declared below this point use this version.
class FieldWithConfidence(BaseModel):
    value: str | date  # extracted content, either free text or a date
    confidence_score: float  # presumably a 0-1 score — TODO confirm the expected range

# Agent section of the insurance form: the agent's name and agent number,
# each stored as a value with a confidence score.
class AgentDetails(BaseModel):
    full_name_and_surname: FieldWithConfidence
    agent_number: FieldWithConfidence

# Next-of-kin section: name and contact number, each stored as a value with a
# confidence score. Nested inside InsuredDetails.
class NextOfKinDetails(BaseModel):
    full_name_and_surname: FieldWithConfidence
    contact_number: FieldWithConfidence

# Insured-person section of the form. Every leaf field is a value with a
# confidence score; the next-of-kin data is nested as its own model.
class InsuredDetails(BaseModel):
    full_name_and_surname: FieldWithConfidence
    title: FieldWithConfidence
    id_number: FieldWithConfidence
    contact_number: FieldWithConfidence
    residential_address: FieldWithConfidence
    gender: FieldWithConfidence
    date_of_birth: FieldWithConfidence
    # Nullable, but no default is given — under pydantic v2 the key itself is
    # presumably still required; confirm that is the intent.
    email_address: Optional[FieldWithConfidence]
    next_of_kin_details: NextOfKinDetails

# Bank section of the form: bank name, account number, branch code and a date
# of birth, each stored as a value with a confidence score.
class BankDetails(BaseModel):
    bank_name: FieldWithConfidence
    account_number: FieldWithConfidence
    branch_code: FieldWithConfidence
    account_holder_date_of_birth: FieldWithConfidence  # renamed for clarity

# Top-level schema for one insurance form: agent, insured and bank sections.
# This is the model handed to Gemini as `response_schema` later in this cell.
class InsuranceDocument(BaseModel):
    agent_details: AgentDetails
    insured_details: InsuredDetails
    bank_details: BankDetails

    class Config:
        # Render date values as DD-MM-YYYY when dumping to JSON; non-date
        # values pass through unchanged. Note the separator differs from the
        # DD/MM/YYYY format used by ZimbabweNationalRegistrationCard.
        json_encoders = {
            date: lambda v: v.strftime('%d-%m-%Y') if isinstance(v, date) else v
        }


# Load the scanned form as raw bytes so it can be attached to the request.
with open('images/form1.jpg', 'rb') as f:
    image_bytes = f.read()

# Send the image plus a text prompt to Gemini and ask for JSON shaped like
# InsuranceDocument.
# NOTE(review): the prompt text below describes project goals and requirements
# rather than telling the model to read the fields off the attached image;
# consider replacing it with a direct extraction instruction.
response = client.models.generate_content(
    model='gemini-2.5-flash',
    contents=[
        types.Part.from_bytes(
            data=image_bytes,
            mime_type='image/jpeg',
        ),
        """You want to develop a system that can extract relevant information from insurance documents, such as:
Agent details
Insured details
Bank details
Next of kin details
The system should be able to handle various document formats and extract the information with confidence scores.
Goals
Develop a robust information extraction system
Handle different document formats (e.g., PDFs, images)
Extract relevant information with high accuracy
Provide confidence scores for extracted data
Requirements
Use a suitable OCR (Optical Character Recognition) engine
Implement a data extraction algorithm
Handle various document structures and formats
Provide confidence scores for extracted data"""
    ],
    config={
        "response_mime_type": "application/json",
        "response_schema": InsuranceDocument,
    },
)

# `parsed` is None when the reply could not be turned into the requested
# schema; fail with the raw text instead of an AttributeError on None.
card_info = response.parsed
if card_info is None:
    raise ValueError(
        "Gemini response could not be parsed into InsuranceDocument; "
        f"raw response text: {response.text!r}"
    )
print(card_info.model_dump_json(indent=4))


{
    "agent_details": {
        "full_name_and_surname": {
            "value": "John Doe",
            "confidence_score": 0.95
        },
        "agent_number": {
            "value": "7533-11-19",
            "confidence_score": 0.92
        }
    },
    "insured_details": {
        "full_name_and_surname": {
            "value": "Tonderai Brandon Mutombwa",
            "confidence_score": 0.96
        },
        "title": {
            "value": "Mr",
            "confidence_score": 0.95
        },
        "id_number": {
            "value": "6420-11-19",
            "confidence_score": 0.93
        },
        "contact_number": {
            "value": "0776-11-19",
            "confidence_score": 0.94
        },
        "residential_address": {
            "value": "2245-11-19",
            "confidence_score": 0.9
        },
        "gender": {
            "value": "Male",
            "confidence_score": 0.95
        },
        "date_of_birth": {
            "value": "1210-11-19",
 

In [None]:
# Authentication to Google API
import os
import math
from collections import Counter
from google.cloud import vision
import re

# Credentials file the Google client libraries should pick up for this kernel.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "ocr_key.json"
# Pre-compiled pattern matching runs of word characters.
WORD = re.compile(r"\w+")




def detect_document(path):
    """Run Cloud Vision document text detection on a local image file.

    The returned response exposes ``full_text_annotation``, which is organised
    as pages -> blocks -> paragraphs -> words -> symbols; blocks, paragraphs,
    words and symbols each carry a ``confidence`` attribute.

    Args:
        path: Filesystem path of the image to read and send to the API.

    Returns:
        The response object returned by ``document_text_detection``.

    Raises:
        Exception: If the response carries a non-empty ``error.message``.
    """
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()

    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    response = client.document_text_detection(image=image)

    # Surface API-level failures before handing the response back. This check
    # used to sit after the `return`, so it never ran and error responses were
    # returned to the caller as if they were valid results.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    return response


In [18]:
# OCR every sample scan once: five ID cards and one insurance form.
id1, id2, id3, id4, id5, form1 = map(
    detect_document,
    (
        'images/id1.jpg',
        'images/id2.jpg',
        'images/id3.jpg',
        'images/id4.jpg',
        'images/id5.jpg',
        'images/form1.jpg',
    ),
)


In [31]:
import math
from collections import defaultdict

def reconstruct_lines_from_words(full_text_annotation, line_threshold=2):
    """
    Rebuild visual text lines from the word bounding boxes of an OCR result.

    Words are ordered top-to-bottom, then left-to-right, using the first
    vertex of each word's bounding box. A word joins the current line while
    its y-coordinate is within ``line_threshold`` of the y-coordinate of the
    word that opened that line; otherwise it opens a new line.

    Args:
        full_text_annotation (google.cloud.vision_v1.types.TextAnnotation):
            The full_text_annotation object from the Vision API response.
        line_threshold (int | float): Maximum vertical distance, in the same
            units as the bounding-box coordinates, between a word and the
            first word of a line for both to count as the same line.
            Defaults to 2, the value that was previously hard-coded.

    Returns:
        list[str]: One string per reconstructed line, top to bottom, with the
        words of each line joined by single spaces in left-to-right order.
        Empty if the annotation is missing or has no pages.

    Raises:
        ValueError: If ``line_threshold`` is negative.
    """
    if line_threshold < 0:
        raise ValueError("line_threshold must be non-negative")

    if not full_text_annotation or not full_text_annotation.pages:
        return []

    # Flatten the page/block/paragraph hierarchy into a list of positioned words.
    positioned_words = []
    for page in full_text_annotation.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    # The first bounding-box vertex is treated as the word's
                    # top-left corner.
                    anchor = word.bounding_box.vertices[0]
                    positioned_words.append({
                        'text': "".join(symbol.text for symbol in word.symbols),
                        'x': anchor.x,
                        'y': anchor.y,
                    })

    # Top-to-bottom first so line grouping is a single pass; x breaks ties.
    positioned_words.sort(key=lambda w: (w['y'], w['x']))

    # Because the words are y-sorted, each new line starts strictly below the
    # previous one, so an ordered list of lines is enough.
    grouped_lines = []
    line_start_y = None
    for entry in positioned_words:
        if line_start_y is None or abs(entry['y'] - line_start_y) > line_threshold:
            line_start_y = entry['y']
            grouped_lines.append([])
        grouped_lines[-1].append(entry)

    # Within a line the y-values may differ slightly, so re-order by x only.
    return [
        " ".join(entry['text'] for entry in sorted(group, key=lambda w: w['x']))
        for group in grouped_lines
    ]


# Rebuild the text lines of the second ID scan and print them under a header.
reconstructed_lines = reconstruct_lines_from_words(id2.full_text_annotation)

print("\n".join(["Reconstructed Lines:", *reconstructed_lines]))

Reconstructed Lines:
NATIONAL REGISTRATION
42-212079 T 42 CITF
MAKONI
WINNIE C
02/04/1987
CHUNYA
PMAKON
20/08/2014


In [32]:
form1

text_annotations {
  locale: "en"
  description: "SOUTHERN ORACLE\nMICROINSURANCE COMPANY\nTEMPORARY LOSS OF INCOME PROTECTION PROPOSAL FORM\nAGENT DETAILS\nFULL NAME AND SURNAME: John\nAGENT NUMBER:\nDoe\n75 3379\nINSURED DETAILS\nFULL NAME AND SURNAME: Tonderai Brandon Mutombwa\nTITLE\nID NUMBER:\nCONTACT NUMBER\nRESIDENTIAL ADDRESS:\nNEXT OF KIN DETAILS:\nMe\nGENDER:\nMale\n64-2044 79\nDATE OF BIRTH:\n0776 464 136\n12-10-2020\nEMAIL ADDRESS: brandon@gmail.com\n22 Alps Street, Chitungwiza\nFULL NAME AND SURNAME: Luke\nCONTACT NUMBER\nMutombwa\n0773798 261\nBANK DETAILS\nBANK NAME\nCB2\nDATE OF BIRTH:\n12-10-25\nACCOUNT NUMBER\n253799\nBRANCH CODE\n155\nTEMPORARY LOSS OF INCOME PROTECTION PROPOSAL FORM REV 0 DATE 2023/02/10 PAGE 1 OF 9"
  bounding_poly {
    vertices {
      x: 20
      y: 18
    }
    vertices {
      x: 704
      y: 18
    }
    vertices {
      x: 704
      y: 962
    }
    vertices {
      x: 20
      y: 962
    }
  }
}
text_annotations {
  description: "SOUTHERN"