In [32]:
import os
import xml.etree.ElementTree as ET

import cv2
import numpy as np
import pytesseract


In [None]:
# Locate the Tesseract executable without pinning the notebook to one machine.
# Precedence:
#   1. the TESSERACT_CMD environment variable, if set;
#   2. the conventional Windows install location, if that file exists;
#   3. otherwise pytesseract's own default is left untouched (it invokes
#      `tesseract`, i.e. whatever is found on PATH).
# Assigning a non-existent path unconditionally would break OCR on every
# machine that does not have Tesseract installed in exactly this folder.
_DEFAULT_WINDOWS_TESSERACT = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
_tesseract_cmd = os.environ.get('TESSERACT_CMD')
if not _tesseract_cmd and os.path.isfile(_DEFAULT_WINDOWS_TESSERACT):
    _tesseract_cmd = _DEFAULT_WINDOWS_TESSERACT
if _tesseract_cmd:
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd


In [34]:


def parse_xml_and_extract_textline_polygons(xml_path, namespace='http://www.loc.gov/standards/alto/ns-v4#'):
    """
    Parse an ALTO XML file and collect the polygon of every TextLine.

    Args:
        xml_path: Path (or file object) accepted by ``ET.parse``.
        namespace: XML namespace URI of the TextLine/Polygon elements.
            Defaults to the ALTO v4 namespace.

    Returns:
        A list with one entry per usable ``Polygon`` element found below a
        ``TextLine``, in document order. Each entry is a list of ``(x, y)``
        integer tuples. Only coordinates are returned; element IDs are not.

    Notes:
        The POINTS attribute may separate values with whitespace
        (``"x1 y1 x2 y2"``) and/or commas (``"x1,y1 x2,y2"``).
        Polygons whose POINTS attribute is missing or blank are skipped
        silently; polygons with an odd number of values or non-integer
        values are skipped with a printed warning.
    """
    root = ET.parse(xml_path).getroot()
    text_line_query = f'.//{{{namespace}}}TextLine'
    polygon_query = f'.//{{{namespace}}}Polygon'
    textline_polygons = []

    for text_line in root.findall(text_line_query):
        for shape in text_line.findall(polygon_query):
            points = shape.get('POINTS')

            # Skip if POINTS attribute is missing
            if not points:
                continue

            # Treat commas as separators too, so "x,y x,y" parses like "x y x y".
            point_values = points.replace(',', ' ').split()

            # A blank attribute carries no vertices; an empty polygon would
            # only fail later when it is rasterised, so drop it here.
            if not point_values:
                continue

            # Ensure there's an even number of values
            if len(point_values) % 2 != 0:
                print(f"Warning: Skipping a polygon due to odd number of coordinates: {points}")
                continue

            try:
                coordinates = [int(value) for value in point_values]
            except ValueError:
                print(f"Warning: Skipping a polygon due to invalid POINTS format: {points}")
                continue

            # Pair even-indexed values (x) with odd-indexed values (y).
            textline_polygons.append(list(zip(coordinates[0::2], coordinates[1::2])))

    return textline_polygons


def extract_text_from_polygons(image_path, polygons):
    """
    Run Tesseract OCR on each polygonal region of an image.

    Args:
        image_path: Path of the image, read with ``cv2.imread``.
        polygons: Iterable of polygons, each a sequence of ``(x, y)``
            integer pixel coordinates.

    Returns:
        A list of strings, one per polygon and in the same order. A polygon
        that has no vertices, or whose bounding box does not intersect the
        image, contributes an empty string so indices stay aligned.

    Raises:
        OSError: If ``cv2.imread`` cannot load ``image_path`` (it returns
            ``None`` instead of raising for missing or unreadable files).

    Side effects:
        Prints each recognized text (and a warning for skipped polygons).
    """
    image = cv2.imread(image_path)
    if image is None:
        raise OSError(f"Could not read image: {image_path}")

    image_height, image_width = image.shape[:2]
    recognized_texts = []

    for i, polygon in enumerate(polygons):
        # OpenCV geometry functions need int32 (or float32) points; NumPy's
        # default integer type is platform dependent, so fix it explicitly.
        vertices = np.array(polygon, dtype=np.int32)
        if vertices.size == 0:
            print(f"Warning: TextLine {i+1} has no polygon points; skipping OCR")
            recognized_texts.append('')
            continue

        # Bounding rectangle of the polygon, clamped to the image so that
        # negative origins cannot wrap around in NumPy slicing.
        x, y, w, h = cv2.boundingRect(vertices)
        left, top = max(x, 0), max(y, 0)
        right, bottom = min(x + w, image_width), min(y + h, image_height)
        if right <= left or bottom <= top:
            print(f"Warning: TextLine {i+1} lies outside the image; skipping OCR")
            recognized_texts.append('')
            continue

        # Crop first, then mask inside the crop: same pixels as masking the
        # whole image and cropping afterwards, without a full-size mask and
        # full-image bitwise_and per polygon.
        region = image[top:bottom, left:right]
        mask = np.zeros(region.shape[:2], dtype=np.uint8)
        shifted_vertices = vertices - np.array([left, top], dtype=np.int32)
        cv2.fillPoly(mask, [shifted_vertices], 255)
        cropped_region = cv2.bitwise_and(region, region, mask=mask)

        # Perform OCR on the cropped polygon region
        # OCR with Hebrew using a specific page segmentation mode (e.g., --psm 7 for single lines)
        text = pytesseract.image_to_string(cropped_region, lang='heb', config='--psm 7')
        recognized_texts.append(text)
        print(f"Text for TextLine {i+1}: {text}")

    return recognized_texts

# Example usage: OCR every TextLine polygon of a single page.
# Both paths are relative, so they resolve against the kernel's working directory.
# NOTE(review): presumably the .tif is the page image the XML describes — confirm the pair matches.
xml_path = 'XML Exports/IDGNAZIM0001008.xml'  # ALTO XML holding the TextLine polygons
image_path = 'Image Exports/IDGNAZIM0001008.tif'  # Replace with actual image path
# One list of (x, y) tuples per usable Polygon element found under a TextLine.
polygons = parse_xml_and_extract_textline_polygons(xml_path)
# One recognized string per polygon, in the same order; each is also printed as it is produced.
recognized_texts = extract_text_from_polygons(image_path, polygons)


Text for TextLine 1: -גורי , חיים .

Text for TextLine 2: מסביב.לאותה ‏ נקודה | התום: הגור) ,

Text for TextLine 3: גלמרחב, .משא, 26.2.60, ע .2



In [3]:
# Display the OCR strings collected by the example-usage cell (one entry per polygon).
# NOTE(review): the saved output of this cell is an empty list and its execution
# count (3) is lower than the cells above (32-34), so the output looks stale —
# re-run with Restart Kernel & Run All to confirm.
recognized_texts

[]