In [None]:
# %%
# --- PATHS ---
# Base directory for your processed editions
# Assumes structure: .../images/Volume1/*.png and .../labels/Volume1/*.txt
IMAGES_ROOT = 'Editions/Padova_1618_Cesare_Ripa'
LABELS_ROOT = 'finetuned_complete/structured_inference_output/Padova_1618_Cesare_Ripa/txt'
OUTPUT_DIR = 'runs/pipeline_output'
BINARY_CACHE_DIR = 'runs/binary_cache'
# Configuration and Models
DATA_YAML_PATH = 'RIPA-ft/data.yaml'
MODEL_WEIGHTS = 'my_finetune_project/run_ladas_1280_l_v14/weights/best.pt' 
KRAKEN_SEG_MODEL = 'models/blla.mlmodel' 
OCR_MODEL_PATH = '10.5281/zenodo.2577963'

In [6]:
# %% [markdown]
# # End-to-End Document Layout Analysis & OCR Pipeline
# 
# **Goal:** Convert raw images and YOLO coordinates into standard archival formats (ALTO & TEI).
# 
# **Workflow:**
# 1.  **Setup:** Load class mappings from `data.yaml` and initialize Kraken models.
# 2.  **Inference (Optional):** Run YOLO to generate coordinate files (if missing).
# 3.  **Processing:** #     * Read YOLO coordinates.
#     * Crop regions.
#     * Run Kraken Line Segmentation & OCR on text regions.
#     * Generate ALTO XML (Physical Layout).
#     * Convert to TEI XML (Logical Structure).

# %%
# --- DIAGNOSTIC CHECK ---
import torch
import sys
print(f"Python Version: {sys.version}")
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️ WARNING: Running on CPU. This will be slow.")
    print("To fix, install CUDA torch: pip install torch torchvision --index-url https://download.pytorch.org/whl/cu118")

# %%
import os
import glob
import time
import yaml
import torch
import numpy as np
from pathlib import Path
from PIL import Image
from lxml import etree as ET
from ultralytics import YOLO
from tqdm.auto import tqdm 

# Kraken imports
from kraken.lib import vgsl
from kraken.blla import segment as blla_segment
from kraken.rpred import rpred
from kraken.binarization import nlbin 

# %% [markdown]
# ## 1. Configuration & Paths
# Define where your data lives and where output should go.



# --- DYNAMIC CLASS MAPPING ---
def load_class_mapping(yaml_path):
    if not os.path.exists(yaml_path):
        print(f"Warning: {yaml_path} not found. Using minimal fallback.")
        return {0: 'MainZone'}
        
    with open(yaml_path, 'r') as f:
        data = yaml.safe_load(f)
    return {i: name for i, name in enumerate(data['names'])}

YOLO_TO_SEGMONTO = load_class_mapping(DATA_YAML_PATH)
print(f"Loaded {len(YOLO_TO_SEGMONTO)} classes.")

# %% [markdown]
# ## 2. TEI Conversion Logic
# Converts the physical ALTO layout into a logical TEI structure (`<sourceDoc>` + `<body>`).

# %%
def alto_to_tei(alto_tree, image_filename):
    NS_ALTO = {'a': 'http://www.loc.gov/standards/alto/ns-v4#'}
    NS_TEI = 'http://www.tei-c.org/ns/1.0'
    XML_NS = "{http://www.w3.org/XML/1998/namespace}"
    
    root = ET.Element(f'{{{NS_TEI}}}TEI', nsmap={None: NS_TEI})
    
    # --- 1. teiHeader ---
    header = ET.SubElement(root, 'teiHeader')
    file_desc = ET.SubElement(header, 'fileDesc')
    title_stmt = ET.SubElement(file_desc, 'titleStmt')
    ET.SubElement(title_stmt, 'title').text = f"Transcription of {image_filename}"
    
    # --- 2. sourceDoc (Facsimile/Physical Layer) ---
    source_doc = ET.SubElement(root, 'sourceDoc')
    surface = ET.SubElement(source_doc, 'surface')
    
    page_el = alto_tree.find('.//a:Page', NS_ALTO)
    if page_el is not None:
        surface.set('lry', page_el.get('HEIGHT', '0'))
        surface.set('lrx', page_el.get('WIDTH', '0'))
    
    surface.set(f'{XML_NS}id', f"page_{image_filename}")
    
    for block in alto_tree.findall('.//a:TextBlock', NS_ALTO):
        zone = ET.SubElement(surface, 'zone')
        zone.set(f'{XML_NS}id', block.get('ID'))
        zone.set('type', block.get('TAGREFS', 'MainZone'))
        
        for attr in ['HPOS', 'VPOS', 'WIDTH', 'HEIGHT']:
            if block.get(attr): zone.set(attr.lower(), block.get(attr))
        
        for line in block.findall('.//a:TextLine', NS_ALTO):
            ET.SubElement(zone, 'line').set(f'{XML_NS}id', line.get('ID'))
    
    # --- 3. Text Body (Logical Layer) ---
    text = ET.SubElement(root, 'text')
    body = ET.SubElement(text, 'body')
    div = ET.SubElement(body, 'div', type="page")
    
    for block in alto_tree.findall('.//a:TextBlock', NS_ALTO):
        zone_type = block.get('TAGREFS', 'MainZone')
        
        # Mapping Logic based on your schema
        if 'RunningTitle' in zone_type:
            el_name, attrs = 'fw', {'type': 'header', 'place': 'top'}
        elif 'Numbering' in zone_type:
            el_name, attrs = 'fw', {'type': 'page-number', 'place': 'top'}
        elif 'QuireMarks' in zone_type:
            el_name, attrs = 'fw', {'type': 'signature', 'place': 'bottom'}
        elif 'MarginTextZone' in zone_type:
            subtype = zone_type.split('-')[-1] if '-' in zone_type else 'margin'
            el_name, attrs = 'note', {'place': 'margin', 'type': subtype}
        elif 'MainZone-Head' in zone_type:
            el_name, attrs = 'head', {}
        elif 'TitlePage' in zone_type:
            el_name, attrs = 'div', {'type': 'titlepage'}
        elif 'Graphic' in zone_type or 'Figure' in zone_type:
            el_name, attrs = 'figure', {'type': zone_type}
        elif 'Table' in zone_type:
            el_name, attrs = 'table', {}
        elif 'MainZone' in zone_type:
            subtype = zone_type.split('-')[-1] if '-' in zone_type else 'main'
            el_name, attrs = 'ab', {'type': subtype}
        else:
            el_name, attrs = 'ab', {'type': zone_type}
            
        struct_block = ET.SubElement(div, el_name, **attrs)
        struct_block.set('facs', f"#{block.get('ID')}")
        
        for line in block.findall('.//a:TextLine', NS_ALTO):
            lb = ET.SubElement(struct_block, 'lb')
            lb.set('facs', f"#{line.get('ID')}")
            
            string_el = line.find('.//a:String', NS_ALTO)
            if string_el is not None and string_el.get('CONTENT'):
                lb.tail = string_el.get('CONTENT')
            
    return ET.ElementTree(root)

# %% [markdown]
# ## 3. Page Processing Logic
# **STRATEGY:** Per-Region Processing with Strict Input Sanitization.
# 
# 1.  **Segmentation:** Run on RGB Crop.
# 2.  **Sanitize:** Remove bad lines (0-area polygons) that crash OCR.
# 3.  **OCR:** Run on **NLBIN (Adaptive Binary)** crop to ensure text visibility.

# %%
def clamp(val, max_val):
    return max(0, min(val, max_val))

def process_single_page(image_path, txt_path, output_xml, output_tei, seg_model, ocr_model, device):
    try: im = Image.open(image_path)
    except Exception as e: 
        print(f"Error opening image {image_path}: {e}")
        return
    
    # Convert to RGB for consistent indexing
    im_rgb = im.convert('RGB')
    w_img, h_img = im_rgb.size
    
    if not os.path.exists(txt_path): return

    # Init ALTO
    ALTO_NS = "http://www.loc.gov/standards/alto/ns-v4#"
    ANS = f"{{{ALTO_NS}}}" 
    root = ET.Element(f'{ANS}alto', nsmap={None: ALTO_NS})
    page = ET.SubElement(ET.SubElement(root, f'{ANS}Layout'), f'{ANS}Page', 
                         WIDTH=str(w_img), HEIGHT=str(h_img), ID="p1")
    print_space = ET.SubElement(page, f'{ANS}PrintSpace', HPOS="0", VPOS="0", 
                                WIDTH=str(w_img), HEIGHT=str(h_img))

    with open(txt_path, 'r') as f: lines = f.readlines()
    
    # DEBUG STATS
    stats = {'regions': 0, 'lines_found': 0, 'lines_valid': 0, 'text_generated': 0}
    
    for idx, line in enumerate(lines):
        parts = line.strip().split()
        if len(parts) < 5: continue
        
        cls_id = int(parts[0])
        cx, cy, w, h = map(float, parts[1:5])
        
        abs_w, abs_h = int(w * w_img), int(h * h_img)
        abs_x, abs_y = int((cx * w_img) - (abs_w/2)), int((cy * h_img) - (abs_h/2))
        abs_x, abs_y = max(0, abs_x), max(0, abs_y)
        abs_w = min(abs_w, w_img - abs_x)
        abs_h = min(abs_h, h_img - abs_y)
        
        if abs_w < 15 or abs_h < 15: continue 

        label = YOLO_TO_SEGMONTO.get(cls_id, 'MainZone')
        stats['regions'] += 1
        
        block = ET.SubElement(print_space, f'{ANS}TextBlock', ID=f"region_{idx}",
                              HPOS=str(abs_x), VPOS=str(abs_y),
                              WIDTH=str(abs_w), HEIGHT=str(abs_h),
                              TAGREFS=label)
        
        text_prefixes = [
            'Main', 'Margin', 'Running', 'Numbering', 'Quire', 
            'Title', 'Table', 'Drop', 'Stamp', 'Figure', 'Graphic'
        ]
        is_text = any(label.startswith(p) for p in text_prefixes) or \
                  label == 'GraphicZone-TextualContent'
        
        if is_text:
            try:
                # 1. CROP (RGB)
                crop_rgb = im_rgb.crop((abs_x, abs_y, abs_x+abs_w, abs_y+abs_h))
                
                # 2. SEGMENTATION
                with torch.no_grad():
                    res = blla_segment(crop_rgb, model=seg_model, device=device)
                
                has_lines = getattr(res, 'lines', []) if not isinstance(res, dict) else res.get('lines', [])
                stats['lines_found'] += len(has_lines)
                
                if not has_lines:
                    continue

                # 3. SANITIZATION
                valid_lines = []
                for l in has_lines:
                    if isinstance(l, dict):
                        boundary = l.get('boundary')
                        baseline = l.get('baseline')
                    else:
                        boundary = getattr(l, 'boundary', None)
                        baseline = getattr(l, 'baseline', None)
                    
                    if not boundary or len(boundary) < 3: continue
                    if not baseline or len(baseline) < 2: continue
                    
                    xs = [p[0] for p in boundary]
                    ys = [p[1] for p in boundary]
                    if max(xs) - min(xs) < 1 or max(ys) - min(ys) < 1: continue
                    
                    valid_lines.append(l)
                
                stats['lines_valid'] += len(valid_lines)
                
                if not valid_lines:
                    continue

                # 4. OCR PREP (CRITICAL CHANGE: Use nlbin)
                # Convert crop to Binary using Adaptive Thresholding
                # This prevents historical paper from turning purely black/white
                try:
                    crop_bin = nlbin(crop_rgb)
                except:
                    # Fallback only if nlbin crashes
                    crop_bin = crop_rgb.convert('1')
                
                if not isinstance(res, dict): res.lines = valid_lines
                else: res['lines'] = valid_lines
                
                # 5. RUN OCR
                with torch.no_grad():
                    pred_it = rpred(ocr_model, crop_bin, res)
                    
                    for l_idx, (l_data, ocr_record) in enumerate(zip(valid_lines, pred_it)):
                        if isinstance(l_data, dict): boundary = l_data['boundary']
                        else: boundary = l_data.boundary
                            
                        pg_bound = [(p[0]+abs_x, p[1]+abs_y) for p in boundary]
                        xs, ys = [p[0] for p in pg_bound], [p[1] for p in pg_bound]
                        lx, ly = min(xs), min(ys)
                        lw, lh = max(xs)-lx, max(ys)-ly
                        
                        text_line = ET.SubElement(block, f'{ANS}TextLine', ID=f"line_{idx}_{l_idx}",
                                                  HPOS=str(int(lx)), VPOS=str(int(ly)),
                                                  WIDTH=str(int(lw)), HEIGHT=str(int(lh)))
                        
                        text_content = ocr_record.prediction
                        
                        # DEBUG: Print text if it exists
                        # if len(text_content) > 0: print(f"OCR Output: {text_content[:20]}...")
                        
                        stats['text_generated'] += 1
                        # Always write string element, even if empty, to debug
                        ET.SubElement(text_line, f'{ANS}String', 
                                      CONTENT=text_content,
                                      HPOS=str(int(lx)), VPOS=str(int(ly)),
                                      WIDTH=str(int(lw)), HEIGHT=str(int(lh)))
                                          
            except Exception as e:
                print(f"[ERROR] Region {idx} failed: {e}")
                continue

    print(f"Page {os.path.basename(image_path)} Stats: {stats}")

    # 5. Save
    ET.ElementTree(root).write(output_xml, encoding='UTF-8', xml_declaration=True, pretty_print=True)
    tei_tree = alto_to_tei(root, os.path.basename(image_path))
    tei_tree.write(output_tei, encoding='UTF-8', xml_declaration=True, pretty_print=True)

# ... existing execution block ...

Python Version: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]
PyTorch Version: 2.7.1+cu118
CUDA Available: True
GPU Name: NVIDIA GeForce RTX 4060 Laptop GPU
Loaded 30 classes.


In [22]:
def run_full_pipeline():
    print("--- Starting Pipeline ---")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"   Running on: {device}")
    
    # OPTIONAL: Run binarization first
    # precompute_binarization()
    
    try:
        seg_model = vgsl.TorchVGSLModel.load_model(KRAKEN_SEG_MODEL)
        seg_model.to(device)
        ocr_model = vgsl.TorchVGSLModel.load_model(OCR_MODEL_PATH)
        ocr_model.to(device)
        seg_model.eval()
        ocr_model.eval()
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load models. {e}")
        return

    image_files = glob.glob(os.path.join(IMAGES_ROOT, "**", "*.png"), recursive=True)
    image_files += glob.glob(os.path.join(IMAGES_ROOT, "**", "*.jpg"), recursive=True)
    
    alto_dir = os.path.join(OUTPUT_DIR, 'alto')
    tei_dir = os.path.join(OUTPUT_DIR, 'tei')
    os.makedirs(alto_dir, exist_ok=True)
    os.makedirs(tei_dir, exist_ok=True)
    
    print(f"Found {len(image_files)} images.")
    
    for img_path in tqdm(image_files, desc="Processing Pages"):
        rel_path = os.path.relpath(img_path, IMAGES_ROOT)
        rel_txt_path = os.path.splitext(rel_path)[0] + ".txt"
        txt_path = os.path.join(LABELS_ROOT, rel_txt_path)
        
        flat_name = rel_path.replace(os.sep, "_")
        base_name = os.path.splitext(flat_name)[0]
        
        xml_path = os.path.join(alto_dir, f"{base_name}.xml")
        tei_path = os.path.join(tei_dir, f"{base_name}_tei.xml")
        
        if not os.path.exists(txt_path):
            continue
            
        process_single_page(img_path, txt_path, xml_path, tei_path, seg_model, ocr_model, device)
        
    print("--- Complete! ---")

# Uncomment to run
# run_full_pipeline()

In [26]:
import gc  # ADDED: For memory cleanup


In [2]:
import gc

In [8]:

# %%
def sanity_check_ocr():
    print("--- OCR Sanity Check ---")
    
    # 1. AGGRESSIVE MEMORY CLEANUP
    if 'ocr_model' in globals(): del globals()['ocr_model']
    if 'seg_model' in globals(): del globals()['seg_model']
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        print(f"   VRAM cleared. Allocated: {torch.cuda.memory_allocated()/1024**2:.2f} MB")

    # Define container class locally to ensure compatibility
    class SanityContainer:
        def __init__(self):
            self.type = 'baselines'
            self.imagename = 'sanity_check.png'
            self.lines = []
            self.regions = {}
            self.script_detection = False # The attribute that caused the crash

    try:
        # 2. Load Model
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        print(f"   Loading model on {device}...")
        ocr_model = vgsl.TorchVGSLModel.load_model(OCR_MODEL_PATH)
        ocr_model.to(device)
        ocr_model.eval()
        
        # 3. Create Synthetic Image
        from PIL import ImageDraw, ImageFont
        img = Image.new('L', (500, 100), color=255)
        draw = ImageDraw.Draw(img)
        
        try:
            # Try to load default font
            # font = ImageFont.load_default() # Sometimes fails to render readable text
            draw.text((10, 40), "Test Transcription 123", fill=0)
        except:
            draw.line((10, 50, 490, 50), fill=0, width=5)
            
        # 4. Binarize & Convert (Mimic Pipeline)
        img_bin = img.convert('1')
        img_input = img_bin.convert('L')
        
        print("   Running inference...")
        with torch.no_grad():
            # Construct the Object Container instead of Dict
            bounds = SanityContainer()
            bounds.lines = [{
                'boundary': [(0,0), (500,0), (500,100), (0,100)],
                'baseline': [(10,80), (490,80)],
                'text_direction': 'horizontal-lr',
                'script': 'default'
            }]
            
            pred_it = rpred(ocr_model, img_input, bounds)
            for record in pred_it:
                print(f"✅ SUCCESS! Prediction: '{record.prediction}'")
                
                del ocr_model
                torch.cuda.empty_cache()
                return

    except Exception as e:
        print(f"❌ FATAL: Sanity check failed. {e}")
        if device == 'cuda':
            print("   Tip: Try restarting the kernel to fully clear GPU memory.")

In [9]:
sanity_check_ocr()

--- OCR Sanity Check ---
   VRAM cleared. Allocated: 0.00 MB
   Loading model on cuda...
   Running inference...
❌ FATAL: Sanity check failed. 'MultiParamSequential' object has no attribute 'input'
   Tip: Try restarting the kernel to fully clear GPU memory.


In [23]:
run_full_pipeline()

--- Starting Pipeline ---
   Running on: cuda
Found 704 images.


Processing Pages:   0%|          | 0/704 [00:00<?, ?it/s]

Page page_0001.png Stats: {'regions': 1, 'lines_found': 0, 'lines_valid': 0, 'text_generated': 0}
Page page_0002.png Stats: {'regions': 0, 'lines_found': 0, 'lines_valid': 0, 'text_generated': 0}
Page page_0003.png Stats: {'regions': 0, 'lines_found': 0, 'lines_valid': 0, 'text_generated': 0}
Page page_0004.png Stats: {'regions': 3, 'lines_found': 0, 'lines_valid': 0, 'text_generated': 0}


Polygonizer failed on line 0: index 2 is out of bounds for axis 0 with size 0


[ERROR] Region 0 failed: 'MultiParamSequential' object has no attribute 'input'
[ERROR] Region 1 failed: 'MultiParamSequential' object has no attribute 'input'
[ERROR] Region 3 failed: 'MultiParamSequential' object has no attribute 'input'
Page page_0005.png Stats: {'regions': 4, 'lines_found': 50, 'lines_valid': 50, 'text_generated': 0}
Page page_0006.png Stats: {'regions': 1, 'lines_found': 0, 'lines_valid': 0, 'text_generated': 0}
[ERROR] Region 1 failed: 'MultiParamSequential' object has no attribute 'input'
[ERROR] Region 4 failed: 'MultiParamSequential' object has no attribute 'input'
[ERROR] Region 5 failed: 'MultiParamSequential' object has no attribute 'input'
[ERROR] Region 6 failed: 'MultiParamSequential' object has no attribute 'input'
Page page_0007.png Stats: {'regions': 8, 'lines_found': 58, 'lines_valid': 55, 'text_generated': 0}
[ERROR] Region 0 failed: 'MultiParamSequential' object has no attribute 'input'
[ERROR] Region 1 failed: CUDA error: out of memory
CUDA kernel

In [None]:
# %% [markdown]
# # End-to-End Document Layout Analysis & OCR Pipeline
# 
# **Goal:** Convert raw images and YOLO coordinates into standard archival formats (ALTO & TEI).
# 
# **Workflow:**
# 1.  **Setup:** Load class mappings from `data.yaml` and initialize Kraken models.
# 2.  **Inference (Optional):** Run YOLO to generate coordinate files (if missing).
# 3.  **Processing:** #     * Read YOLO coordinates.
#     * Crop regions.
#     * Run Kraken Line Segmentation & OCR on text regions.
#     * Generate ALTO XML (Physical Layout).
#     * Convert to TEI XML (Logical Structure).

# %%
import os
import glob
import time
import yaml
import torch
from pathlib import Path
from PIL import Image
from lxml import etree as ET
from ultralytics import YOLO

# Kraken imports
from kraken.lib import vgsl
from kraken.blla import segment as blla_segment
from kraken.rpred import rpred

# %% [markdown]
# ## 1. Configuration & Paths
# Define where your data lives and where output should go.



# --- DYNAMIC CLASS MAPPING ---
# Loads your 30 classes from data.yaml to ensure names match exactly
def load_class_mapping(yaml_path):
    if not os.path.exists(yaml_path):
        print(f"Warning: {yaml_path} not found. Using minimal fallback.")
        return {0: 'MainZone'}
        
    with open(yaml_path, 'r') as f:
        data = yaml.safe_load(f)
    # Returns dict: {0: 'DigitizationArtefactZone', 1: 'DropCapitalZone', ...}
    return {i: name for i, name in enumerate(data['names'])}

YOLO_TO_SEGMONTO = load_class_mapping(DATA_YAML_PATH)
print(f"Loaded {len(YOLO_TO_SEGMONTO)} classes.")

# %% [markdown]
# ## 2. TEI Conversion Logic
# Converts the physical ALTO layout into a logical TEI structure (`<sourceDoc>` + `<body>`).

# %%
def alto_to_tei(alto_tree, image_filename):
    NS_ALTO = {'a': 'http://www.loc.gov/standards/alto/ns-v4#'}
    NS_TEI = 'http://www.tei-c.org/ns/1.0'
    
    root = ET.Element(f'{{{NS_TEI}}}TEI', nsmap={None: NS_TEI})
    
    # --- 1. teiHeader ---
    header = ET.SubElement(root, 'teiHeader')
    file_desc = ET.SubElement(header, 'fileDesc')
    title_stmt = ET.SubElement(file_desc, 'titleStmt')
    ET.SubElement(title_stmt, 'title').text = f"Transcription of {image_filename}"
    
    # --- 2. sourceDoc (Facsimile/Physical Layer) ---
    source_doc = ET.SubElement(root, 'sourceDoc')
    surface = ET.SubElement(source_doc, 'surface')
    
    page_el = alto_tree.find('.//a:Page', NS_ALTO)
    if page_el is not None:
        surface.set('lry', page_el.get('HEIGHT', '0'))
        surface.set('lrx', page_el.get('WIDTH', '0'))
    surface.set('xml:id', f"page_{image_filename}")
    
    # Copy ALTO blocks to TEI Zones
    for block in alto_tree.findall('.//a:TextBlock', NS_ALTO):
        zone = ET.SubElement(surface, 'zone')
        zone.set('xml:id', block.get('ID'))
        zone.set('type', block.get('TAGREFS', 'MainZone'))
        
        # Copy coordinates
        for attr in ['HPOS', 'VPOS', 'WIDTH', 'HEIGHT']:
            if block.get(attr): zone.set(attr.lower(), block.get(attr))
        
        # Copy lines
        for line in block.findall('.//a:TextLine', NS_ALTO):
            ET.SubElement(zone, 'line').set('xml:id', line.get('ID'))
    
    # --- 3. Text Body (Logical Layer) ---
    text = ET.SubElement(root, 'text')
    body = ET.SubElement(text, 'body')
    div = ET.SubElement(body, 'div', type="page")
    
    # Map SegmOnto zones to TEI semantic elements
    for block in alto_tree.findall('.//a:TextBlock', NS_ALTO):
        zone_type = block.get('TAGREFS', 'MainZone')
        
        # Mapping Logic based on your schema
        if 'RunningTitle' in zone_type:
            el_name, attrs = 'fw', {'type': 'header', 'place': 'top'}
        elif 'Numbering' in zone_type:
            el_name, attrs = 'fw', {'type': 'page-number', 'place': 'top'}
        elif 'QuireMarks' in zone_type:
            el_name, attrs = 'fw', {'type': 'signature', 'place': 'bottom'}
        elif 'MarginTextZone' in zone_type:
            # Extract 'Notes' from 'MarginTextZone-Notes'
            subtype = zone_type.split('-')[-1] if '-' in zone_type else 'margin'
            el_name, attrs = 'note', {'place': 'margin', 'type': subtype}
        elif 'MainZone-Head' in zone_type:
            el_name, attrs = 'head', {}
        elif 'TitlePage' in zone_type:
            el_name, attrs = 'div', {'type': 'titlepage'}
        elif 'Graphic' in zone_type or 'Figure' in zone_type:
            el_name, attrs = 'figure', {'type': zone_type}
        elif 'Table' in zone_type:
            el_name, attrs = 'table', {}
        elif 'MainZone' in zone_type:
            # Extract 'P' from 'MainZone-P'
            subtype = zone_type.split('-')[-1] if '-' in zone_type else 'main'
            el_name, attrs = 'ab', {'type': subtype}
        else:
            # Fallback for Artifacts, Stickers, etc.
            el_name, attrs = 'ab', {'type': zone_type}
            
        # Create logical block linked to facsimile
        struct_block = ET.SubElement(div, el_name, **attrs)
        struct_block.set('facs', f"#{block.get('ID')}")
        
        # Add lines and Text Content
        for line in block.findall('.//a:TextLine', NS_ALTO):
            lb = ET.SubElement(struct_block, 'lb')
            lb.set('facs', f"#{line.get('ID')}")
            
            # Get OCR content if it exists
            string_el = line.find('.//a:String', NS_ALTO)
            if string_el is not None and string_el.get('CONTENT'):
                lb.tail = string_el.get('CONTENT')
            
    return ET.ElementTree(root)

# %% [markdown]
# ## 3. Page Processing Logic
# The core function that processes one image + text file pair.

# %%
def process_single_page(image_path, txt_path, output_xml, output_tei, seg_model, ocr_model, device):
    # 1. Load Image
    try: im = Image.open(image_path)
    except Exception as e: 
        print(f"Error opening image {image_path}: {e}")
        return
    
    w_img, h_img = im.size
    
    if not os.path.exists(txt_path): 
        print(f"WARNING: Missing label file: {txt_path}")
        return

    # 2. Init ALTO Structure
    NS = {'': 'http://www.loc.gov/standards/alto/ns-v4#'}
    root = ET.Element('alto', xmlns="http://www.loc.gov/standards/alto/ns-v4#")
    page = ET.SubElement(ET.SubElement(root, 'Layout'), 'Page', 
                         WIDTH=str(w_img), HEIGHT=str(h_img), ID="p1")
    print_space = ET.SubElement(page, 'PrintSpace', HPOS="0", VPOS="0", 
                                WIDTH=str(w_img), HEIGHT=str(h_img))

    # 3. Process YOLO Regions
    with open(txt_path, 'r') as f: lines = f.readlines()
    
    for idx, line in enumerate(lines):
        parts = line.strip().split()
        if len(parts) < 5: continue
        
        # Parse YOLO Format (class cx cy w h)
        cls_id = int(parts[0])
        cx, cy, w, h = map(float, parts[1:5])
        
        # Denormalize to Pixels
        abs_w, abs_h = int(w * w_img), int(h * h_img)
        abs_x, abs_y = int((cx * w_img) - (abs_w/2)), int((cy * h_img) - (abs_h/2))
        
        # Clip to image bounds (safety)
        abs_x = max(0, abs_x)
        abs_y = max(0, abs_y)
        abs_w = min(abs_w, w_img - abs_x)
        abs_h = min(abs_h, h_img - abs_y)
        
        if abs_w < 5 or abs_h < 5: continue # Skip tiny noise

        # Get Class Name (Fallback to MainZone if unknown)
        label = YOLO_TO_SEGMONTO.get(cls_id, 'MainZone')
        
        # Create ALTO TextBlock
        block = ET.SubElement(print_space, 'TextBlock', ID=f"region_{idx}",
                              HPOS=str(abs_x), VPOS=str(abs_y),
                              WIDTH=str(abs_w), HEIGHT=str(abs_h),
                              TAGREFS=label)
        
        # 4. Kraken Segmentation & OCR
        # We only run expensive OCR on zones that actually contain text
        text_prefixes = [
            'Main', 'Margin', 'Running', 'Numbering', 'Quire', 
            'Title', 'Table', 'Drop', 'Stamp', 'Figure', 'Graphic'
        ]
        # Check if label starts with any prefix OR is explicitly GraphicZone-TextualContent
        is_text = any(label.startswith(p) for p in text_prefixes) or \
                  label == 'GraphicZone-TextualContent'
        
        if is_text:
            try:
                # Crop the region from the main image
                crop = im.crop((abs_x, abs_y, abs_x+abs_w, abs_y+abs_h))
                
                # A. Segment Lines (BLLA)
                res = blla_segment(crop, model=seg_model, device=device)
                
                # B. Run Recognition (OCR) if lines found
                if res and len(res['lines']) > 0:
                    pred_it = rpred(ocr_model, crop, res)
                    
                    for l_idx, (l_data, ocr_record) in enumerate(zip(res['lines'], pred_it)):
                        # Shift local crop coordinates back to global page coordinates
                        pg_bound = [(p[0]+abs_x, p[1]+abs_y) for p in l_data['boundary']]
                        xs, ys = [p[0] for p in pg_bound], [p[1] for p in pg_bound]
                        lx, ly = min(xs), min(ys)
                        lw, lh = max(xs)-lx, max(ys)-ly
                        
                        text_line = ET.SubElement(block, 'TextLine', ID=f"line_{idx}_{l_idx}",
                                                  HPOS=str(int(lx)), VPOS=str(int(ly)),
                                                  WIDTH=str(int(lw)), HEIGHT=str(int(lh)))
                        
                        # Add OCR Text Content
                        ET.SubElement(text_line, 'String', 
                                      CONTENT=ocr_record.prediction,
                                      HPOS=str(int(lx)), VPOS=str(int(ly)),
                                      WIDTH=str(int(lw)), HEIGHT=str(int(lh)))
            except Exception as e:
                print(f"Warning: Segmentation/OCR failed for region {idx} ({label}) in {os.path.basename(image_path)}: {e}")
                continue

    # 5. Save Results
    ET.ElementTree(root).write(output_xml, encoding='UTF-8', xml_declaration=True, pretty_print=True)
    
    tei_tree = alto_to_tei(root, os.path.basename(image_path))
    tei_tree.write(output_tei, encoding='UTF-8', xml_declaration=True, pretty_print=True)

# %% [markdown]
# ## 4. Main Execution Loop
# Runs the pipeline on your dataset.

# %%
def run_full_pipeline():
    print("--- Starting Pipeline ---")
    print("1. Loading Kraken Models (Seg + OCR)...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    try:
        seg_model = vgsl.TorchVGSLModel.load_model(KRAKEN_SEG_MODEL)
        seg_model.to(device)
        ocr_model = vgsl.TorchVGSLModel.load_model(OCR_MODEL_PATH)
        ocr_model.to(device)
    except Exception as e:
        print(f"CRITICAL ERROR: Could not load models. {e}")
        return

    # Find images recursively
    image_files = glob.glob(os.path.join(IMAGES_ROOT, "**", "*.png"), recursive=True)
    image_files += glob.glob(os.path.join(IMAGES_ROOT, "**", "*.jpg"), recursive=True)
    
    # Setup output structure
    alto_dir = os.path.join(OUTPUT_DIR, 'alto')
    tei_dir = os.path.join(OUTPUT_DIR, 'tei')
    os.makedirs(alto_dir, exist_ok=True)
    os.makedirs(tei_dir, exist_ok=True)
    
    print(f"Found {len(image_files)} images in {IMAGES_ROOT}")
    print(f"Results will be saved to {OUTPUT_DIR}")
    
    start_time = time.time()
    
    for img_path in image_files:
        # Logic to find corresponding label file
        # Calculates relative path to preserve subfolders if necessary
        rel_path = os.path.relpath(img_path, IMAGES_ROOT)
        rel_txt_path = os.path.splitext(rel_path)[0] + ".txt"
        
        # Construct absolute path to label
        txt_path = os.path.join(LABELS_ROOT, rel_txt_path)
        
        # Construct output filenames (flattened for simplicity in output folder)
        flat_name = rel_path.replace(os.sep, "_")
        base_name = os.path.splitext(flat_name)[0]
        
        xml_path = os.path.join(alto_dir, f"{base_name}.xml")
        tei_path = os.path.join(tei_dir, f"{base_name}_tei.xml")
        
        # Only process if we have coordinates
        if not os.path.exists(txt_path):
            # print(f"Skipping {base_name} (No coordinates found at {txt_path})")
            continue
            
        print(f"Processing {base_name}...")
        process_single_page(img_path, txt_path, xml_path, tei_path, seg_model, ocr_model, device)
        
    print(f"--- Complete! Total time: {time.time() - start_time:.2f}s ---")

# Uncomment to run
run_full_pipeline()

Loaded 30 classes.
--- Starting Pipeline ---
1. Loading Kraken Models (Seg + OCR)...
Found 704 images in Editions/Padova_1618_Cesare_Ripa
Results will be saved to runs/pipeline_output
Processing page_0001...


ValueError: Invalid attribute name 'xml:id'

In [3]:
pip install "numpy<2" --force-reinstall

Collecting numpy<2
  Obtaining dependency information for numpy<2 from https://files.pythonhosted.org/packages/3f/6b/5610004206cf7f8e7ad91c5a85a8c71b2f2f8051a0c0c4d5916b76d6cbb2/numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata
  Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl.metadata (61 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ------------ ------------------------- 20.5/61.0 kB 330.3 kB/s eta 0:00:01
     ------------------------------- ------ 51.2/61.0 kB 650.2 kB/s eta 0:00:01
     -------------------------------------- 61.0/61.0 kB 651.3 kB/s eta 0:00:00
Downloading numpy-1.26.4-cp311-cp311-win_amd64.whl (15.8 MB)
   ---------------------------------------- 0.0/15.8 MB ? eta -:--:--
   ---------------------------------------- 0.1/15.8 MB 8.2 MB/s eta 0:00:02
   - -------------------------------------- 0.5/15.8 MB 7.1 MB/s eta 0:00:03
   -- ------------------------------------- 0.9/15.8 MB 10.0 MB/s eta 0:00:02
   --- ----------------------

ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\lucia\\anaconda3\\Lib\\site-packages\\~-mpy.libs\\libscipy_openblas64_-9e3e5a4229c1ca39f10dc82bba9e2b2b.dll'
Consider using the `--user` option or check the permissions.

