In [None]:
from pydantic import BaseModel
from fungarium_ocr.core import FungariumOCR

# Run configuration for the OCR batch call further down in this cell.

# Directory holding the sample images; passed as the first positional
# argument to batch_ocr.
input_dir = 'sample_images'

# Name of the vision-capable model to request. The variable keeps the
# 'vsion' spelling because the batch_ocr call reads this exact name and
# passes it to a keyword argument spelled the same way.
vsion_model = 'gpt-4o'

# Instructions describing the label layout and the fields to extract.
# NOTE: this is a regular (non-raw) string, so the backslash in the final
# rule is escaped ('\\n'); unescaped, it would become a real line break and
# the model would never see the literal "\n" it is asked to strip.
system_prompt = """
        The goal is to extract structured text from Fungi sample images. The rules are:
        1. Each image contains two sections of text chunks. One is the bar code, and the other is the sample information.
        
        2. The languages are only in English or German.
        
        3. The bar code chunk starts with 'Herbarium der ETH Zurich (ZT)' followed by a barcode with text such as 'ZT Myc 0105537'. The task here is to extract the barcode text.
        
        4. The sample information chunk starts with a division separator, such as 'Dr. F. Petrak, Mycotheca generalis.'
        This will become the value for the 'division' column. After the division separator, there are other structures defined by the following:
        
        5. Exicata Number and Species: Lines. For example, '204. Acetabula vulgaris Fuck', contains two pieces of information: * Exicata number → the number before the period (e.g., 204) * Species name → everything after the period (e.g., Acetabula vulgaris Fuck.) 
        
        6. Matrix and Locality: A line that holds the information (e.g., Ungarn; Comit. Gyor: Bonyretalap). If a sample is missing a Matrix and Locality line, leave it blank. Extract as it is, no more added information
        
        7. Date: A line that has a Roman numeral month plus year (e.g., V.1920, X.1924, XII.1924), from which you split out: * Month → Roman numeral (e.g., V, X, XII) * Year → numeric year (e.g., 1920, 1924). Note, sometimes the month can be a normal English month with abbreviations
        
        8. Collector: A line beginning with 'leg.' indicates the collector (e.g., leg. J. Cogolludo.)

        9. Image name: I will define later
        
        The output should be a JSON like structured dictionary with keys (image_name, barcode, division, exicata_number, species, matrix_locality, date, collector) 
        Remove unnecessary '\\n' etc. Dont output anything else.
        """

# Per-request user message sent alongside the system prompt.
user_prompt = 'Directly extract information with your own vision capabilities, not Python packages such as pytesseract'

# Placeholder only: replace with a real OpenAI API key before running, and
# avoid committing the real key together with the notebook.
openai_apikey = 'Input your own API keys here'

# Structured-output schema handed to batch_ocr as `response_format`.
# The field names mirror the keys listed at the end of system_prompt; every
# value is kept as plain text.
# Documented with comments rather than a class docstring on purpose: pydantic
# copies a class docstring into the generated JSON schema as its description,
# which would change what is passed along as the response format.
class OutputFormat(BaseModel):
    # NOTE(review): the prompt only says the image name is "defined later";
    # presumably FungariumOCR supplies it per image - confirm.
    image_name: str
    # Barcode text from the herbarium header chunk (prompt example: 'ZT Myc 0105537').
    barcode: str
    # The separator line that opens the sample-information chunk (prompt rule 4).
    division: str
    # Number before the period on the species line; typed as str, not int.
    exicata_number: str
    # Everything after the period on the species line (prompt rule 5).
    species: str
    # Matrix/locality line copied as-is; the prompt asks for it to be left
    # blank when the label has none.
    matrix_locality: str
    # NOTE(review): prompt rule 7 describes splitting month and year, but the
    # schema carries a single `date` string - confirm the intended format.
    date: str
    # Collector taken from the line beginning with 'leg.' (prompt rule 8).
    collector: str

# Build the OCR client with the API key and process the images in input_dir,
# asking for responses shaped like OutputFormat. The recorded run of this
# cell reported an Excel file saved inside the input directory.
FungariumOCR(openai_apikey=openai_apikey).batch_ocr(
    input_dir,
    vsion_model=vsion_model,
    system_prompt=system_prompt,
    user_prompt=user_prompt,
    response_format=OutputFormat,
)

Successfully saved Excel file to: sample_images\sample_images.xlsx
