In [1]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path

import glob
import json

import sys

# Repository root, resolved relative to the notebook's working directory.
module_path = os.path.abspath(os.path.join('..'))

# Directory added to sys.path so the `ai_ocr` imports that follow can be resolved
# (presumably where the ai_ocr package lives). os.path.join keeps this portable,
# so no separate Windows variant is needed.
functionapp_path = os.path.join(module_path, "src", "functionapp")

# Test for the entry that is actually appended. Checking `module_path` instead
# would re-append the directory on every run, or skip it once the repo root
# itself has been added to sys.path.
if functionapp_path not in sys.path:
    sys.path.append(functionapp_path)

from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
from ai_ocr.azure.images import convert_pdf_into_image
from ai_ocr.model import Config
from ai_ocr.chains import get_structured_data
from ai_ocr.azure.doc_intelligence import get_ocr_results

from langchain_core.output_parsers.json import parse_json_markdown

from dotenv import load_dotenv
load_dotenv()

True

### Prepare images

In [2]:
# Sanity check: list any page images already present in the temp folder that is
# configured through the TEMP_IMAGES_OUTDIR environment variable.

input_path = '../demo/default-dataset/Invoice Sample.pdf'

# Keep everything up to and including the last "/". rpartition is used instead of
# str.replace(file_name, "") because replace would also strip the file name if it
# happened to occur earlier in the path.
parent_dir, separator, _file_name = input_path.rpartition("/")
pdf_path = parent_dir + separator
print(pdf_path)

# With TEMP_IMAGES_OUTDIR unset this resolves to the current working directory.
imgs_path = os.path.join(os.getcwd(), os.getenv("TEMP_IMAGES_OUTDIR", ""))
imgs = glob.glob(f"{imgs_path}/page*.jpeg")
print(imgs)

../demo/default-dataset/
[]


### Run the Solution once on the demo to produce an output.json

In [3]:
# Load the system prompt and the output schema that are passed to the extraction chain.
with open('../demo/default-dataset/system_prompt.txt', 'r') as file_sys_prompt:
    system_prompt = file_sys_prompt.read()

with open('../demo/default-dataset/output_schema.json', 'r') as file_output_schema:
    output_schema = file_output_schema.read()

input_path = '../demo/default-dataset/Invoice Sample.pdf'

max_images = 8                # upper bound on page images passed to the model
max_size = 20 * 1024 * 1024   # 20MB cap on the total size of the base64 images


def _page_number(image_path):
    """Return the number embedded in a 'page_<n>.<ext>' file name (0 if there is none)."""
    digits = "".join(ch for ch in os.path.basename(image_path) if ch.isdigit())
    return int(digits) if digits else 0


def _find_page_images(directory):
    """Return the page images found in `directory`, ordered by page number.

    Both .png and .jpeg are matched: the recorded run of this cell reports
    'Saved image: /tmp/page_1.png', which a jpeg-only pattern never picks up.
    glob returns matches in arbitrary order, hence the explicit sort.
    """
    matches = []
    for extension in ("png", "jpeg"):
        matches.extend(glob.glob(os.path.join(directory, f"page*.{extension}")))
    return sorted(matches, key=_page_number)


ocr_start_time = datetime.now()
ocr_result = get_ocr_results(input_path)

# Ensure the /tmp/ directory exists before the PDF pages are rendered
imgs_path = "/tmp/"
os.makedirs(imgs_path, exist_ok=True)

# Extract images from the PDF
convert_pdf_into_image(input_path)

try:
    # Keep only the first pages, then load them (get_size_of_base64_images below
    # suggests load_image returns base64 data — TODO confirm)
    imgs = _find_page_images(imgs_path)[:max_images]
    imgs = [load_image(img) for img in imgs]

    # Drop trailing pages until the total payload is within the size cap
    while imgs and get_size_of_base64_images(imgs) > max_size:
        imgs.pop()

    # Get structured data
    structured = get_structured_data(ocr_result.content, system_prompt, output_schema, imgs)
finally:
    # Delete all generated images, even when the extraction above failed
    for img_path in _find_page_images(imgs_path):
        try:
            os.remove(img_path)
            print(f"Deleted image: {img_path}")
        except OSError as e:
            print(f"Error deleting image {img_path}: {e}")

# Parse structured data and serialise it as JSON
x = parse_json_markdown(structured.content)
response = json.dumps(x)

print(f'Response: {response}')

# Persist the model output; the evaluation cells read it back from this path
actual_output_path = "/tmp/output.json"
with open(actual_output_path, 'w') as f:
    f.write(response)

Saved image: /tmp/page_1.png
Response: {"Customer Name": "Henry Ross", "Invoice Number": "1234", "Date": "November 30, 2022", "Billing info": {"Customer": "Henry Ross", "Customer ID": "8675309", "Address": "123 Avenue A, Metropolis", "Phone": "(123) 456-7890"}, "Payment Due": "December 30, 2022", "Salesperson": "Luca Richter", "Payment Terms": "Cash or check", "Shipping info": {"Recipient": "Henry Ross", "Address": "123 Avenue A, Metropolis", "Phone": "(123) 456-7890"}, "Delivery Date": "December 7, 2022", "Shipping Method": "Ground", "Shipping Terms": "Returns not accepted", "Table": {"Items": [{"Qty": "10", "Item#": "123", "Description": "Baby chicks", "Unit price": "5.00", "Discount": "10%", "Line total": "45.00"}, {"Qty": "2", "Item#": "444", "Description": "Heat lamps", "Unit price": "24.00", "Discount": "", "Line total": "48.00"}, {"Qty": "6", "Item#": "120", "Description": "Chicken roosts", "Unit price": "30.00", "Discount": "", "Line total": "180.00"}], "Total Discount": "5.00"

### Load the actual output (from the previous LLM run) and the ground truth, then compile them into a JSONL file


In [4]:
import sys
if module_path not in sys.path:
    sys.path.append(module_path)

import json
import time
from pprint import pprint

def compile_jsonl(ground_truth_path, actual_output_path, output_file, schema_path=None):
    """Bundle ground truth, actual output and evaluation schema into a one-record JSONL file.

    Args:
        ground_truth_path: Path to the JSON file holding the expected values.
        actual_output_path: Path to the JSON file holding the output to evaluate.
        output_file: Path of the JSONL file to write (overwritten if it exists).
        schema_path: Path to the evaluation schema JSON file. When omitted, the
            module-level ``eval_schema_path`` is used, which keeps existing
            three-argument calls working.

    Raises:
        OSError: If one of the input files cannot be opened or the output
            cannot be written.
        json.JSONDecodeError: If one of the input files is not valid JSON.
    """
    if schema_path is None:
        # Backward-compatible fallback to the notebook-level variable.
        schema_path = eval_schema_path

    # Read the ground truth JSON file
    with open(ground_truth_path, 'r') as gt_file:
        ground_truth = json.load(gt_file)

    with open(schema_path, 'r') as eval_schema_file:
        eval_schema = json.load(eval_schema_file)

    # Load the actual output BEFORE opening the output file: opening with 'w'
    # truncates it, so a failed read must not leave an empty file behind.
    with open(actual_output_path, 'r') as af:
        actual_data = json.load(af)

    # Combine ground truth, actual data and schema into one object
    combined_data = {"ground_truth": ground_truth, "actual": actual_data, "eval_schema": eval_schema}

    # Write the combined data as a single line in the jsonl file
    with open(output_file, 'w') as out_file:
        out_file.write(json.dumps(combined_data) + '\n')


# Locations of the demo dataset files used for the evaluation.
_dataset_dir = f"{module_path}/demo/default-dataset"
ground_truth_path = f"{_dataset_dir}/ground_truth.json"
eval_data_path = f"{_dataset_dir}/eval_data.jsonl"
# compile_jsonl looks this name up at module level, so it has to be assigned
# before the call below.
eval_schema_path = f"{_dataset_dir}/evaluation_schema.json"

# Merge ground truth, the saved model output and the schema into eval_data.jsonl.
compile_jsonl(
    ground_truth_path=ground_truth_path,
    actual_output_path=actual_output_path,
    output_file=eval_data_path,
)



### Evaluate using ground truth

In [5]:
from promptflow.evals.evaluate import evaluate
from src.evaluators.json_evaluator import JsonEvaluator

eval_data_path = f"{module_path}/demo/default-dataset/eval_data.jsonl"

# The file is JSON Lines (one JSON document per line). Parse the first non-empty
# record instead of calling json.load on the whole file, which raises as soon as
# the file holds more than one record.
with open(eval_data_path) as file:
    first_record = next(line for line in file if line.strip())
    data = json.loads(first_record)
    ground_truth = data["ground_truth"]
    evaluation_schema = data["eval_schema"]

# Register the JSON evaluator and map its inputs to the columns of the JSONL data.
evaluators = {}
evaluator_config = {}
default_match_evaluator_config = {}
json_evaluator = JsonEvaluator(
    default_match_evaluator_config
)
evaluators["json_evaluator"] = json_evaluator
evaluator_config["json_evaluator"] = {
    "actual": "${data.actual}",
    "ground_truth": "${data.ground_truth}",
    "eval_schema": "${data.eval_schema}"
}

# Timestamped result file so that successive runs do not overwrite each other.
timestamp = time.strftime("%m_%d.%H.%M.%S")
output_path = f"{module_path}/notebooks/outputs/output_{timestamp}.json"
# Make sure the target directory exists before the results are written to it.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

results = evaluate(
    evaluation_name="test_eval_1",
    data=eval_data_path,
    evaluators=evaluators,
    evaluator_config=evaluator_config,
    output_path=output_path
)
pprint(results)


[2024-07-31 19:47:03 +0000][promptflow._sdk._orchestrator.run_submitter][INFO] - Submitting run src_evaluators_json_evaluator_jsonevaluator_fmxanugs_20240731_194703_639783, log path: /home/vscode/.promptflow/.runs/src_evaluators_json_evaluator_jsonevaluator_fmxanugs_20240731_194703_639783/logs.txt


Prompt flow service has started...
You can view the traces in local from http://127.0.0.1:23333/v1.0/ui/traces/?#run=src_evaluators_json_evaluator_jsonevaluator_fmxanugs_20240731_194703_639783
2024-07-31 19:47:06 +0000   93810 execution.bulk     INFO     Process 93830 terminated.


[2024-07-31 19:47:07 +0000][promptflow.evals.evaluate._utils][ERROR] - Unable to log traces as trace destination was not defined.


2024-07-31 19:47:03 +0000   93619 execution.bulk     INFO     Current thread is not main thread, skip signal handler registration in BatchEngine.
2024-07-31 19:47:03 +0000   93619 execution.bulk     INFO     Set process count to 1 by taking the minimum value among the factors of {'default_worker_count': 4, 'row_count': 1}.
2024-07-31 19:47:05 +0000   93619 execution.bulk     INFO     Process name(ForkProcess-2:1)-Process id(93830)-Line number(0) start execution.
2024-07-31 19:47:05 +0000   93619 execution.bulk     INFO     Process name(ForkProcess-2:1)-Process id(93830)-Line number(0) completed.
2024-07-31 19:47:06 +0000   93619 execution.bulk     INFO     Finished 1 / 1 lines.
2024-07-31 19:47:06 +0000   93619 execution.bulk     INFO     Average execution time for completed lines: 3.0 seconds. Estimated time for incomplete lines: 0.0 seconds.
2024-07-31 19:47:06 +0000   93619 execution.bulk     INFO     The thread monitoring the process [93830-ForkProcess-2:1] will be terminated.
2024