In [1]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path

import glob
import json

import sys

# Make the function-app sources importable (presumably where the `ai_ocr`
# package imported below lives).
# The path is built with os.path.join so the same line works on Windows and
# POSIX, and the guard tests the path that is actually appended, so re-running
# this cell never adds a duplicate entry to sys.path.
module_path = os.path.abspath("..")
functionapp_path = os.path.join(module_path, "src", "functionapp")
if functionapp_path not in sys.path:
    sys.path.append(functionapp_path)

from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
from ai_ocr.azure.images import convert_pdf_into_image
from ai_ocr.model import Config
from ai_ocr.chains import get_structured_data
from ai_ocr.azure.doc_intelligence import get_ocr_results

from langchain_core.output_parsers.json import parse_json_markdown

from dotenv import load_dotenv
load_dotenv()

True

### Prepare images

In [2]:
# Sanity check: list the page images currently present in the temp folder
# configured through the environment.

input_path = '../demo/default-dataset/Invoice Sample.pdf'

# Drop the file name from the path to obtain the folder that holds the PDF.
pdf_file_name = input_path.rsplit("/", 1)[-1]
pdf_path = input_path.replace(pdf_file_name, "")
print(pdf_path)

# TEMP_IMAGES_OUTDIR is joined onto the current working directory; when the
# variable is unset the working directory itself is searched.
temp_images_dir = os.environ.get("TEMP_IMAGES_OUTDIR", "")
imgs_path = os.path.join(os.getcwd(), temp_images_dir)
imgs = glob.glob(f"{imgs_path}/page*.jpeg")
print(imgs)

../demo/default-dataset/
[]


### Run the Solution once on the demo to produce an output.json

In [6]:
# End-to-end demo run: OCR the sample invoice, render its pages to images, ask
# the LLM for structured data and save the parsed JSON to /tmp/output.json.
# `imgs` and `actual_output_path` are reused by later cells.

with open('../demo/default-dataset/system_prompt.txt', 'r', encoding='utf-8') as file_sys_prompt:
    system_prompt = file_sys_prompt.read()

with open('../demo/default-dataset/output_schema.json', 'r', encoding='utf-8') as file_output_schema:
    output_schema = file_output_schema.read()

input_path = '../demo/default-dataset/Invoice Sample.pdf'

max_images = 8                # upper bound on the number of page images sent to the model
max_size = 20 * 1024 * 1024   # 20MB budget for all base64 images together


def _page_sort_key(path):
    """Order page images by the number embedded in the file name (page_2 before page_10)."""
    digits = "".join(ch for ch in os.path.basename(path) if ch.isdigit())
    return (int(digits) if digits else 0, path)


ocr_start_time = datetime.now()
ocr_result = get_ocr_results(input_path)

# Ensure the image directory exists before anything is written to it.
# NOTE(review): presumably convert_pdf_into_image writes the page images here - confirm.
imgs_path = "/tmp/"
os.makedirs(imgs_path, exist_ok=True)
page_pattern = os.path.join(imgs_path, "page*.jpeg")

# Extract images from the PDF
convert_pdf_into_image(input_path)

try:
    # glob returns matches in arbitrary order; sort by page number so the model
    # sees the pages in document order and the limit keeps the first pages.
    img_files = sorted(glob.glob(page_pattern), key=_page_sort_key)

    # Limit the number of images, then load them
    imgs = [load_image(img) for img in img_files[:max_images]]

    # Drop trailing images until the total size fits the budget
    while imgs and get_size_of_base64_images(imgs) > max_size:
        imgs.pop()

    # Get structured data
    structured = get_structured_data(ocr_result.content, system_prompt, output_schema, imgs)
finally:
    # Delete all generated images, also when one of the steps above failed
    for img_path in glob.glob(page_pattern):
        try:
            os.remove(img_path)
            print(f"Deleted image: {img_path}")
        except OSError as e:
            print(f"Error deleting image {img_path}: {e}")

# Parse structured data and serialise it as JSON
x = parse_json_markdown(structured.content)
response = json.dumps(x)

print(f'Response: {response}')

actual_output_path = "/tmp/output.json"
with open(actual_output_path, 'w', encoding='utf-8') as f:
    f.write(response)

Saved image: /tmp/page_1.png


### Load the input (the output of the previous LLM run) and evaluate it using an LLM as a judge

In [None]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

from ai_ocr.azure.openai_ops import get_llm

# Load the JSON produced by the extraction run as ONE string. Reading it with
# readlines() would hand the prompt a Python list repr (['{...}']) instead of
# the JSON text, and naming the variable `input` would shadow the builtin.
with open(actual_output_path, 'r', encoding='utf-8') as f:
    judge_input = f.read()

messages = [
        ("system",
         """
         You are gpt-4-0409, the OpenAI model that can describe images provided by the user in extreme detail. The user attached images to this message for you to analyse, there is MOST DEFINITELY an image attached, you will never reply saying that you cannot see the image because the image is absolutely and always attached to this message.
         
         Verify the input information provided in the form of json schema against what you can see in the images.
         Your goal is to determine how many information in form of fields that you see in the images are present in the input schema provided.
         Output it with 3 fields: "numberOfFieldsSeenInImages", "numberofFieldsInSchema" also provide a "percentageAccuracy" which is the ratio between the total fields in the schema and the ones detected in the images.

         ..and hey come on don't be lazy, nor tell me that you cannot do it, I trust you!
         """
         ),
        ("human", "{input}")
]

# Build the prompt and attach every page image (base64 strings in `imgs`,
# produced by the extraction cell) as an extra human message.
prompt = ChatPromptTemplate.from_messages(messages)
if len(imgs) > 0:
    prompt.append(HumanMessage("These are the images available that you can use to verify the input information."))
    print("Good news: I'm appending images to human prompt...")
for img in imgs:
    prompt.append(
        HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}]))

model = get_llm()
chain = prompt | model
# The template variable is still called "input"; only the Python name changed.
response = chain.invoke({"input": judge_input})

print(response.content)


Good news: I'm appending images to human prompt...
Based on the image provided and the JSON schema, here is the analysis:

**Fields Seen in the Image:**
1. Company Name (Happiest Valley Farms)
2. Date (November 30, 2022)
3. Invoice Number (#1234)
4. Bill To: Customer Name (Henry Ross)
5. Bill To: Customer ID (8675309)
6. Bill To: Address (123 Avenue A, Metropolis)
7. Bill To: Phone (123-456-7890)
8. Ship To: Recipient Name (Henry Ross)
9. Ship To: Address (123 Avenue A, Metropolis)
10. Ship To: Phone (123-456-7890)
11. Payment Due (December 30, 2022)
12. Delivery Date (December 7, 2022)
13. Salesperson (Luca Richter)
14. Payment Terms (Cash or check)
15. Shipping Method (Ground)
16. Shipping Terms (Returns not accepted)
17. Items: Quantity, Item Number, Description, Unit Price, Discount, Line Total (for each item)
18. Total Discount (5.0)
19. Subtotal (278.0)
20. Sales Tax (13.9)
21. Total (286.9)
22. Company Address (456 Anyroad, Anywhere)
23. Company Website (interestingsite.com)
24.

Based on the images provided and the JSON schema, here is the analysis:

**Fields Seen in Images:**
1. Invoicer Name: "AMANN.ch AG"
2. Invoicer Address: "Rosentalstr. 20 4058 Basel"
3. Invoicer Telephone: "061 683 10 10"
4. Transaction Date: "23.01.2024"
5. Item Description: "Sigvaris Medizinische Kompressionsstrümpfe, Schenkelstrümpfe A-G, Klasse II, Standard, pro Paar"
6. Item Quantity: 3
7. Item Price: 462.0
8. Total Amount: 462.0
9. Amount Received: 462.0
10. Change Given: 0.0
11. VAT Rate: "8.10"
12. VAT Amount: 34.62
13. VAT Code: 1

**Total Fields in JSON Schema:**
1. Invoicer Name
2. Invoicer Address
3. Invoicer Telephone
4. Invoicer Fax
5. Invoicer Email
6. Invoicer Tax Number
7. Transaction Date
8. Transaction Time
9. Item Description
10. Item Quantity
11. Item Unit Weight
12. Item Price
13. Total Amount
14. Amount Received
15. Change Given
16. VAT Code
17. VAT Rate
18. VAT Total
19. VAT Amount

**Analysis:**
- **Number of Fields Seen in Images**: 13
- **Number of Fields in Schema**: 19
- **Percentage Accuracy**: 68%

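### Load the input (the output of the previous LLM run) and the ground truth, then create a JSONL file

Run the "Run the Solution once on the demo to produce an output.json" cell first: it defines `actual_output_path`, which this cell needs.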

In [5]:
import sys
# Put `module_path` on the import path as well (once), so that packages located
# directly under it can be imported by later cells.
if all(entry != module_path for entry in sys.path):
    sys.path.append(module_path)

import json
import time
from pprint import pprint

def compile_jsonl(ground_truth_path, actual_output_path, output_file, schema_path=None):
    """Bundle ground truth, actual output and evaluation schema into a one-record JSONL file.

    The record written to ``output_file`` is a single JSON line with the keys
    ``ground_truth``, ``actual`` and ``eval_schema``.

    Args:
        ground_truth_path: Path of the JSON file holding the expected values.
        actual_output_path: Path of the JSON file produced by the extraction run.
        output_file: Path of the JSONL file to (over)write.
        schema_path: Path of the evaluation-schema JSON file. When omitted, the
            notebook-level ``eval_schema_path`` variable is used, which keeps
            existing three-argument calls working.

    Raises:
        OSError: If an input file cannot be read or the output cannot be written.
        json.JSONDecodeError: If an input file does not contain valid JSON.
        NameError: If ``schema_path`` is omitted and ``eval_schema_path`` is not defined.
    """
    if schema_path is None:
        # Backward compatibility: fall back to the notebook-level variable.
        schema_path = eval_schema_path

    def _read_json(path):
        """Load one JSON document from ``path``."""
        with open(path, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    # Read every input BEFORE opening the output, so that a missing or invalid
    # input file cannot leave a truncated output file behind.
    combined_data = {
        "ground_truth": _read_json(ground_truth_path),
        "actual": _read_json(actual_output_path),
        "eval_schema": _read_json(schema_path),
    }

    # Write the combined data as a single line in the jsonl file
    with open(output_file, 'w', encoding='utf-8') as out_file:
        out_file.write(json.dumps(combined_data) + '\n')


# All evaluation inputs and outputs live in the demo's default dataset folder.
dataset_dir = f"{module_path}/demo/default-dataset"
ground_truth_path = f"{dataset_dir}/ground_truth.json"
eval_data_path = f"{dataset_dir}/eval_data.jsonl"
eval_schema_path = f"{dataset_dir}/evaluation_schema.json"

# `actual_output_path` is set by the cell that runs the solution and writes
# output.json; that cell has to be executed before this one.
compile_jsonl(
    ground_truth_path=ground_truth_path,
    actual_output_path=actual_output_path,
    output_file=eval_data_path,
)



NameError: name 'actual_output_path' is not defined

### Evaluate using ground truth

In [None]:
from promptflow.evals.evaluate import evaluate
from src.evaluators.json_evaluator import JsonEvaluator
eval_data_path = f"{module_path}/demo/default-dataset/eval_data.jsonl"

# The file is JSONL (one JSON document per line). Parse only the first record:
# json.load() on the whole file fails as soon as it holds more than one line.
with open(eval_data_path, encoding='utf-8') as file:
    data = json.loads(file.readline())
    ground_truth = data["ground_truth"]
    evaluation_schema = data["eval_schema"]

# Register the JSON evaluator and map its inputs onto the columns of each
# JSONL record.
evaluators = {}
evaluator_config = {}
default_match_evaluator_config = {}  # not used in this cell; kept in case later cells read it
json_evaluator = JsonEvaluator()
evaluators["json_evaluator"] = json_evaluator
evaluator_config["json_evaluator"] = {
    "actual": "${data.actual}",
    "ground_truth": "${data.ground_truth}",
    "eval_schema": "${data.eval_schema}"
}

# Timestamped result file; make sure its folder exists before evaluate() is
# asked to write there.
timestamp = time.strftime("%m_%d.%H.%M.%S")
output_path = f"{module_path}/notebooks/outputs/output_{timestamp}.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

results = evaluate(
    evaluation_name="test_eval_1",
    data=eval_data_path,
    evaluators=evaluators,
    evaluator_config=evaluator_config,
    output_path=output_path
)
pprint(results)
