This script does the following:

Converts each PDF page to an image.
Sends each image to the Azure Document Intelligence API for text extraction.
Collects and saves the extracted text to a .txt file.

In [8]:
import requests
from pdf2image import convert_from_path
import time

# Azure Document Intelligence endpoint and API key
endpoint = "https://cursive-handwritings.cognitiveservices.azure.com"  # Replace with your endpoint
api_key = ""  # Replace with your API key

# Headers for the API request
headers = {
    "Ocp-Apim-Subscription-Key": api_key,
    "Content-Type": "application/octet-stream"
}

# Function to analyze handwritten text in an image
def analyze_handwriting(image_bytes):
    url = f"{endpoint}/vision/v3.2/read/analyze"  # Endpoint for the 'read' API
    response = requests.post(url, headers=headers, data=image_bytes)
    response.raise_for_status()
    
    # Retrieve the operation location for polling
    operation_location = response.headers["Operation-Location"]
    
    # Poll the operation location to check if processing is complete
    while True:
        result_response = requests.get(operation_location, headers={"Ocp-Apim-Subscription-Key": api_key})
        result = result_response.json()
        
        if result.get("status") == "succeeded":
            return result
        elif result.get("status") == "failed":
            raise Exception("Handwriting analysis failed.")
        
        time.sleep(1)  # Wait a second before polling again

# Convert PDF to images and perform OCR on each page
pdf_path = "Example 1.pdf"  # Replace with your PDF file path
output_file = "handwritten_text_results.txt"

with open(output_file, "w", encoding="utf-8") as file:
    pages = convert_from_path(pdf_path, dpi=300)
    
    for page_num, page in enumerate(pages, start=1):
        # Convert page image to bytes
        image_bytes = page.tobytes()
        
        # Analyze the handwriting in the image
        result = analyze_handwriting(image_bytes)
        
        # Extract text from the JSON response
        text = ""
        for read_result in result["analyzeResult"]["readResults"]:
            for line in read_result["lines"]:
                text += line["text"] + "\n"
        
        # Write the text to the output file
        file.write(f"Text from page {page_num}:\n{text}\n{'-'*80}\n")
        
        print(f"Text from page {page_num} saved.")

print(f"OCR results saved to {output_file}")

HTTPError: 401 Client Error: Access Denied for url: https://cursive-handwritings.cognitiveservices.azure.com/vision/v3.2/read/analyze

In [None]:
"""
This code sample shows Prebuilt Read operations with the Azure Form Recognizer client library. 
The async versions of the samples require Python 3.6 or later.

To learn more, please visit the documentation - Quickstart: Document Intelligence (formerly Form Recognizer) SDKs
https://learn.microsoft.com/azure/ai-services/document-intelligence/quickstarts/get-started-sdks-rest-api?pivots=programming-language-python
"""

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient

"""
Remember to remove the key from your code when you're done, and never post it publicly. For production, use
secure methods to store and access your credentials. For more information, see 
https://docs.microsoft.com/en-us/azure/cognitive-services/cognitive-services-security?tabs=command-line%2Ccsharp#environment-variables-and-application-configuration
"""
endpoint = "YOUR_FORM_RECOGNIZER_ENDPOINT"
key = "YOUR_FORM_RECOGNIZER_KEY"

def format_bounding_box(bounding_box):
    if not bounding_box:
        return "N/A"
    return ", ".join(["[{}, {}]".format(p.x, p.y) for p in bounding_box])

def analyze_read():
    # sample document
    formUrl = "https://raw.githubusercontent.com/Azure-Samples/cognitive-services-REST-api-samples/master/curl/form-recognizer/sample-layout.pdf"

    document_analysis_client = DocumentAnalysisClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )
    
    poller = document_analysis_client.begin_analyze_document_from_url(
            "prebuilt-read", formUrl)
    result = poller.result()

    print ("Document contains content: ", result.content)
    
    for idx, style in enumerate(result.styles):
        print(
            "Document contains {} content".format(
                "handwritten" if style.is_handwritten else "no handwritten"
            )
        )

    for page in result.pages:
        print("----Analyzing Read from page #{}----".format(page.page_number))
        print(
            "Page has width: {} and height: {}, measured with unit: {}".format(
                page.width, page.height, page.unit
            )
        )

        for line_idx, line in enumerate(page.lines):
            print(
                "...Line # {} has text content '{}' within bounding box '{}'".format(
                    line_idx,
                    line.content,
                    format_bounding_box(line.polygon),
                )
            )

        for word in page.words:
            print(
                "...Word '{}' has a confidence of {}".format(
                    word.content, word.confidence
                )
            )

    print("----------------------------------------")


if __name__ == "__main__":
    analyze_read()


In [27]:
from fuzzywuzzy import fuzz, process

# Sample dictionary of words you're interested in
# target_words = ["Hampshire", "Sheriff", "Alexander", "Court", "Goods", "Common Pleas", "address"]

target_words = ["address"]

# Function to read OCR text from a file
def load_text_from_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

# Function to search for target words with strict fuzzy matching
def search_words_in_text(ocr_text, target_words, threshold):
    found_words = {}
    
    # Split the OCR text into paragraphs or sections based on your separator
    paragraphs = ocr_text.split("==========")
    
    # Process each paragraph
    for i, paragraph in enumerate(paragraphs, start=1):
        paragraph = paragraph.strip()
        found_words[i] = []
        
        # Check each word in the dictionary
        for word in target_words:
            # Find close matches in the paragraph with a strict threshold
            matches = process.extractBests(word, paragraph.split(), score_cutoff=threshold, limit=5)
            
            # Save matches that meet the strict threshold
            for match in matches:
                found_words[i].append((match[0], match[1]))
    
    return found_words

# Load the OCR text from a .txt file
ocr_text = load_text_from_file("document_intelligence_example1.txt")  # Replace with your actual file path

# Perform the search with a stricter threshold
results = search_words_in_text(ocr_text, target_words, threshold=90)

# Display results
for paragraph_num, matches in results.items():
    print(f"Paragraph {paragraph_num}:")
    for match in matches:
        print(f"  Found '{match[0]}' (similarity: {match[1]})")
    print("\n" + "-"*30 + "\n")


Paragraph 1:
  Found '/s:' (similarity: 90)

------------------------------

Paragraph 2:

------------------------------

Paragraph 3:

------------------------------

Paragraph 4:

------------------------------

Paragraph 5:
  Found 'a' (similarity: 90)

------------------------------

Paragraph 6:

------------------------------

Paragraph 7:

------------------------------

Paragraph 8:
  Found 'A' (similarity: 90)
  Found 'a' (similarity: 90)

------------------------------

Paragraph 9:
  Found 'A' (similarity: 90)

------------------------------

Paragraph 10:

------------------------------

Paragraph 11:

------------------------------

Paragraph 12:

------------------------------

Paragraph 13:

------------------------------

Paragraph 14:
  Found 'a.' (similarity: 90)

------------------------------

Paragraph 15:
  Found 'a' (similarity: 90)

------------------------------

Paragraph 16:

------------------------------

Paragraph 17:

------------------------------

Para