In [2]:
import os
import json
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import ipywidgets as widgets
from PIL import Image
import sys

# Add parent directory to path for OCR utils
sys.path.append(os.path.abspath('..'))
from backend.utils.utils import run_paddle_ocr

# Paths
image_dir = "../data/invoices-donut/train"
json_dir = "../data/invoices-donut/donut_json/train"

# Debug: Check if directories exist
print(f"📁 Image directory: {image_dir} (exists: {os.path.exists(image_dir)})")
print(f"📁 JSON directory: {json_dir} (exists: {os.path.exists(json_dir)})")

# Load filenames.
# isdir (rather than exists) keeps os.listdir from raising when the path is a
# regular file, and the extension test is case-insensitive so files such as
# "scan.JPG" or "page.PNG" are not silently skipped.
if os.path.isdir(image_dir):
    image_files = sorted(
        f for f in os.listdir(image_dir)
        if f.lower().endswith((".png", ".jpg", ".jpeg"))
    )
    print(f"📄 Found {len(image_files)} image files")
else:
    image_files = []
    print("❌ Image directory not found!")

# Build every widget used by the annotation editor. Each widget receives its
# own Layout instance; Layout objects are never shared between widgets.


def _make_layout(width, height=None):
    """Return a new Layout with the given width and, optionally, height."""
    if height is None:
        return widgets.Layout(width=width)
    return widgets.Layout(width=width, height=height)


def _make_button(description, button_style, width):
    """Return a Button with the given caption, style and fixed width."""
    return widgets.Button(
        description=description,
        button_style=button_style,
        layout=_make_layout(width),
    )


# Header row: invoice selector plus progress readout.
dropdown = widgets.Dropdown(
    options=image_files,
    description='Invoice:',
    layout=_make_layout('60%'),
)
progress_label = widgets.HTML(
    value="<b>Progress: 0/0</b>",
    layout=_make_layout('40%'),
)

# Editable JSON annotation and read-only OCR token listing.
json_textarea = widgets.Textarea(
    description='JSON:',
    layout=_make_layout('100%', '340px'),
)
ocr_textarea = widgets.Textarea(
    description='OCR Tokens:',
    layout=_make_layout('100%', '340px'),
    disabled=True,
)

# Action buttons.
save_button = _make_button('💾 Save Changes', 'success', '120px')
prev_button = _make_button('← Previous', 'info', '100px')
next_button = _make_button('Next →', 'info', '100px')
refresh_ocr_button = _make_button('🔄 Refresh OCR', 'warning', '120px')

# Token picker; starts with only the placeholder entry.
quick_fill_dropdown = widgets.Dropdown(
    options=['Select token to copy...'],
    description='Quick Copy:',
    layout=_make_layout('200px'),
)

# Status line written by the JSON validator.
validation_status = widgets.HTML(
    value="<span style='color: gray;'>Ready</span>",
    layout=_make_layout('100%'),
)

# Checked by default; read by the save handler after a successful save.
auto_save_checkbox = widgets.Checkbox(
    value=True,
    description='Auto-save',
    layout=_make_layout('100px'),
)

def update_progress():
    """Refresh the progress label with the 1-based position of the selection.

    Does nothing when no image files were found.
    """
    if not image_files:
        return
    count = len(image_files)
    position = image_files.index(dropdown.value) + 1
    percent = position / count * 100
    progress_label.value = f"<b>Progress: {position}/{count} ({percent:.1f}%)</b>"

def update_quick_fill_options(tokens):
    """Rebuild the Quick Copy dropdown entries from OCR tokens.

    Args:
        tokens: Iterable of mapping-like tokens; each is queried with
            ``.get('text', '')``. Tokens whose text is blank are skipped.

    Each entry is labelled with the token's 1-based position in ``tokens``
    followed by its text on one line, cut to 30 characters plus "..." when
    longer. The placeholder entry always comes first.
    """
    labels = ['Select token to copy...']
    for number, entry in enumerate(tokens, start=1):
        raw = entry.get('text', '').strip()
        if not raw:
            continue
        flattened = raw.replace('\n', ' ').replace('\r', ' ').strip()
        shown = flattened if len(flattened) <= 30 else flattened[:30] + "..."
        labels.append(f"{number:2d}. {shown}")
    quick_fill_dropdown.options = labels

def validate_json():
    """Validate the JSON editor's text and report the result in the status line.

    Returns:
        True when the text parses as a JSON object. Missing or empty required
        fields only produce an orange warning and still return True.
        False when the text is not valid JSON, is valid JSON but not an
        object, or any other error occurs; the status line turns red.
    """
    try:
        data = json.loads(json_textarea.value)
        # A list, string or number parses fine but has no fields to check;
        # say so explicitly instead of surfacing an AttributeError message.
        if not isinstance(data, dict):
            validation_status.value = (
                "<span style='color: red;'>❌ Error: top-level JSON value must be "
                f"an object, got {type(data).__name__}</span>"
            )
            return False
        required_fields = ['supplier_name', 'invoice_number', 'invoice_total']
        # A field counts as missing when absent or falsy (e.g. "").
        missing_fields = [field for field in required_fields if not data.get(field)]
        if missing_fields:
            validation_status.value = f"<span style='color: orange;'>⚠️ Missing: {', '.join(missing_fields)}</span>"
        else:
            validation_status.value = "<span style='color: green;'>✅ Valid JSON with required fields</span>"
        return True
    except json.JSONDecodeError as e:
        validation_status.value = f"<span style='color: red;'>❌ Invalid JSON: {str(e)}</span>"
        return False
    except Exception as e:
        validation_status.value = f"<span style='color: red;'>❌ Error: {str(e)}</span>"
        return False

def go_prev(b):
    """Select the invoice before the current one, if there is one.

    Args:
        b: The clicked button (unused; supplied by ``on_click``).
    """
    try:
        current_idx = image_files.index(dropdown.value)
    except ValueError:
        # No images loaded or nothing selected: there is nothing to step to.
        return
    if current_idx > 0:
        dropdown.value = image_files[current_idx - 1]

def go_next(b):
    """Select the invoice after the current one, if there is one.

    Args:
        b: The clicked button (unused; supplied by ``on_click``).
    """
    try:
        current_idx = image_files.index(dropdown.value)
    except ValueError:
        # No images loaded or nothing selected: there is nothing to step to.
        return
    if current_idx < len(image_files) - 1:
        dropdown.value = image_files[current_idx + 1]

def refresh_ocr(b):
    """Re-run OCR on the selected invoice and refresh the token views.

    Args:
        b: The clicked button (unused; supplied by ``on_click``).

    Fills the OCR text area with a numbered listing of the non-blank tokens
    and rebuilds the Quick Copy dropdown. If anything in that process raises,
    the text area shows the error and the dropdown is reset to its
    placeholder.
    """
    filename = dropdown.value
    img_path = os.path.join(image_dir, filename)
    try:
        tokens = run_paddle_ocr(img_path)
        header = f"📄 OCR TOKENS FOR {filename} ({len(tokens)} tokens)"
        numbered = []
        for number, entry in enumerate(tokens, start=1):
            raw = entry.get('text', '').strip()
            if not raw:
                continue
            flattened = raw.replace('\n', ' ').replace('\r', ' ').strip()
            numbered.append(f"{number:2d}. {flattened}")
        footer = "\n" + "💡 TIP: Use Quick Copy dropdown or copy directly from here"
        ocr_textarea.value = '\n'.join([header, "=" * 50, *numbered, footer])
        update_quick_fill_options(tokens)
    except Exception as e:
        ocr_textarea.value = f"❌ Error running OCR: {str(e)}"
        update_quick_fill_options([])

def on_quick_fill_change(change):
    """Print the text of the token picked in the Quick Copy dropdown.

    Args:
        change: Change notification; only ``change.new`` (the selected
            label) is read.

    Labels ending in "..." may have been truncated, so OCR is re-run on the
    selected invoice to recover the full token text. That recovery is best
    effort: on failure a warning is printed and the label text is used.
    """
    selected = change.new
    if not selected or selected == 'Select token to copy...':
        return
    # Labels look like " 3. some text"; keep everything after the number.
    text = selected.split('. ', 1)[-1]
    if text.endswith('...'):
        try:
            token_idx = int(selected.split('.')[0]) - 1
            filename = dropdown.value
            img_path = os.path.join(image_dir, filename)
            tokens = run_paddle_ocr(img_path)
            if token_idx < len(tokens):
                text = tokens[token_idx].get('text', '').strip()
        except Exception as e:
            # Catch Exception, not everything: KeyboardInterrupt and
            # SystemExit must still propagate. Fall back to the label text.
            print(f"⚠️ Could not recover full token text ({e}); using truncated label")
    print(f"📋 Copied: {text}")
    print("💡 Paste this into the JSON field above")

# Attach each navigation/OCR button to its click handler.
for _btn, _on_click in (
    (prev_button, go_prev),
    (next_button, go_next),
    (refresh_ocr_button, refresh_ocr),
):
    _btn.on_click(_on_click)

# Run the Quick Copy handler whenever a different entry is selected.
quick_fill_dropdown.observe(on_quick_fill_change, names='value')

# Output area that hosts the rendered invoice image.
output_widget = widgets.Output()

@output_widget.capture()
def show_data(change):
    """Render the newly selected invoice: image, JSON annotation, OCR tokens.

    Args:
        change: Change notification; only ``change.new`` (the image
            filename) is read.

    Side effects: clears and redraws ``output_widget``, fills
    ``json_textarea`` and ``ocr_textarea``, rebuilds the Quick Copy options,
    stores the annotation path on ``json_textarea.json_path`` (read by
    ``save_json``), and refreshes the progress and validation labels.
    """
    output_widget.clear_output()
    filename = change.new
    img_path = os.path.join(image_dir, filename)
    # The annotation sits in json_dir under the image's base name.
    json_path = os.path.join(json_dir, os.path.splitext(filename)[0] + ".json")
    # Record the save target before anything can fail, so a later save can
    # never write this invoice's edits to the previously shown file.
    json_textarea.json_path = json_path
    # Display image
    with output_widget:
        # The context manager releases the file handle once the figure is drawn.
        with Image.open(img_path) as img:
            plt.figure(figsize=(7, 5))
            plt.imshow(img)
            plt.axis('off')
            plt.title(f"{filename} ({img.size[0]}x{img.size[1]})", fontsize=9)
            plt.tight_layout()
            plt.show()
    # Load and show JSON
    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding='utf-8') as f:
                raw_text = f.read()
        except (OSError, UnicodeDecodeError) as e:
            # Unreadable file: leave the editor empty so validation blocks an
            # accidental overwrite, and report why.
            print(f"⚠️ Could not read {json_path}: {e}")
            json_text = ""
        else:
            try:
                json_text = json.dumps(json.loads(raw_text), indent=2, ensure_ascii=False)
            except ValueError:
                # Malformed JSON: show the raw text so it can be repaired in
                # place; validate_json() below flags it as invalid.
                json_text = raw_text
    else:
        json_text = '{\n  "supplier_name": "",\n  "supplier_address": "",\n  "customer_name": "",\n  "customer_address": "",\n  "invoice_number": "",\n  "invoice_date": "",\n  "due_date": "",\n  "tax_amount": "",\n  "tax_rate": "",\n  "invoice_subtotal": "",\n  "invoice_total": "",\n  "items": []\n}'
    json_textarea.value = json_text
    # Load and show OCR tokens
    try:
        tokens = run_paddle_ocr(img_path)
        token_lines = [f"📄 OCR TOKENS FOR {filename} ({len(tokens)} tokens)"]
        token_lines.append("=" * 50)
        for token in tokens:
            text = token.get('text', '').strip()
            if text:
                clean_text = text.replace('\n', ' ').replace('\r', ' ').strip()
                token_lines.append(clean_text)
        token_lines.append("\n" + "💡 TIP: Use Quick Copy dropdown or copy directly from here")
        ocr_textarea.value = '\n'.join(token_lines)
        update_quick_fill_options(tokens)
    except Exception as e:
        ocr_textarea.value = f"❌ Error loading OCR tokens: {str(e)}"
        update_quick_fill_options([])
    update_progress()
    # Validate explicitly: the textarea observer does not fire when the new
    # text equals the previous text.
    validate_json()

def save_json(b):
    """Write the JSON editor's contents to the current invoice's JSON file.

    Args:
        b: The clicked button (unused; supplied by ``on_click``).

    Nothing is written unless ``validate_json()`` succeeds. The target path is
    read from ``json_textarea.json_path`` and its parent directory is created
    if needed. After a successful save, when the "Auto-save" checkbox is
    ticked, the dropdown advances to the next invoice if there is one.
    Failures are reported with ``print`` rather than raised.
    """
    try:
        if not validate_json():
            print("❌ Cannot save: JSON validation failed")
            return
        payload = json.loads(json_textarea.value)
        target = json_textarea.json_path
        os.makedirs(os.path.dirname(target), exist_ok=True)
        with open(target, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, indent=2, ensure_ascii=False)
        print(f"✅ Saved changes to {os.path.basename(target)}")
        if not auto_save_checkbox.value:
            return
        position = image_files.index(dropdown.value)
        if position + 1 < len(image_files):
            dropdown.value = image_files[position + 1]
    except json.JSONDecodeError as e:
        print(f"❌ Invalid JSON: {str(e)}")
    except Exception as e:
        print(f"❌ Error saving file: {str(e)}")

def on_json_change(change):
    """Re-run validation when the JSON editor's text changes.

    Registered as an observer on ``json_textarea``'s ``value`` trait. The
    ``change`` payload is not inspected and the result of ``validate_json``
    is ignored; only the status line it updates matters here.
    """
    validate_json()

# Choosing another invoice re-renders the image, JSON and OCR panels.
dropdown.observe(show_data, names='value')
# Every edit of the JSON text is re-validated.
json_textarea.observe(on_json_change, names='value')
# Clicking save writes the edited JSON to disk.
save_button.on_click(save_json)

# Layout: two side-by-side panels. The left one holds the selector, the
# action buttons and the image preview; the right one holds the JSON editor
# and the OCR tools.
header_row = widgets.HBox([dropdown, progress_label])
nav_buttons = widgets.HBox(
    [prev_button, next_button, refresh_ocr_button, save_button, auto_save_checkbox]
)
quick_tools = widgets.HBox([quick_fill_dropdown])

_json_heading = widgets.HTML("<h4>📝 JSON Data</h4>")
_ocr_heading = widgets.HTML("<h4>🔤 OCR Tokens</h4>")

left_panel = widgets.VBox(
    [header_row, nav_buttons, output_widget],
    layout=widgets.Layout(width='42%'),
)
right_panel = widgets.VBox(
    [
        _json_heading,
        validation_status,
        json_textarea,
        _ocr_heading,
        quick_tools,
        ocr_textarea,
    ],
    layout=widgets.Layout(width='58%'),
)

widget_box = widgets.HBox(
    [left_panel, right_panel],
    layout=widgets.Layout(width='100%', height='750px'),
)

display(widget_box)

# Stylesheet injected into the notebook page: readable monospace text areas,
# a capped widget height, compact headings and checkbox spacing.
_EDITOR_CSS = """
<style>
.widget-textarea textarea {
    color: black !important;
    background: white !important;
    font-family: 'Courier New', monospace;
    font-size: 10px;
    padding: 6px;
    line-height: 1.3;
}
.jupyter-widgets-view {
    width: 100%;
    max-height: 750px;
    overflow: hidden;
}
.widget-html h4 {
    color: #333;
    font-weight: bold;
    margin: 5px 0 3px 0;
    font-size: 14px;
}
.widget-checkbox input {
    margin-right: 5px;
}
</style>
"""

display(HTML(_EDITOR_CSS))

# Show the first invoice on start-up.
from types import SimpleNamespace

if image_files:
    first_image = image_files[0]
    if dropdown.value != first_image:
        # Assigning a different value fires the observer, which renders it.
        dropdown.value = first_image
    else:
        # The dropdown already selects its first option on construction, so
        # re-assigning the same value fires no change event. Render the first
        # invoice explicitly; show_data only reads ``change.new``.
        show_data(SimpleNamespace(new=first_image))
else:
    print("❌ No image files found!")

📁 Image directory: ../data/invoices-donut/train (exists: True)
📁 JSON directory: ../data/invoices-donut/donut_json/train (exists: True)
📄 Found 420 image files


HBox(children=(VBox(children=(HBox(children=(Dropdown(description='Invoice:', layout=Layout(width='60%'), opti…