In [None]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path

import glob
import json

import sys

# Make the function-app sources importable; the ai_ocr imports further down
# presumably resolve from <repo>/src/functionapp -- confirm against the repo layout.
module_path = os.path.abspath(os.path.join('..'))
functionapp_path = os.path.join(module_path, "src", "functionapp")
# Guard on the path that is actually appended so re-running this cell does not
# add duplicates. os.path.join uses the platform separator, so no separate
# Windows variant is needed.
if functionapp_path not in sys.path:
    sys.path.append(functionapp_path)

from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
from ai_ocr.azure.images import convert_pdf_into_image
from ai_ocr.model import Config
from ai_ocr.chains import get_structured_data
from ai_ocr.azure.doc_intelligence import get_ocr_results

from langchain_core.output_parsers.json import parse_json_markdown

from dotenv import load_dotenv
load_dotenv()

### Prepare images

In [None]:
# Sanity check: list the page images currently present in the temp folder
# configured through the TEMP_IMAGES_OUTDIR environment variable.

input_path = '../demo/default-dataset/Invoice Sample.pdf'
#input_path = '../demo/evaltest-dataset/claim1.pdf'
# Keep everything up to and including the last "/". Slicing (instead of
# str.replace on the file name) cannot damage a directory component that
# happens to contain the same text as the file name.
pdf_path = input_path[:input_path.rfind("/") + 1]
print(pdf_path)
imgs_path = os.path.join(os.getcwd(), os.getenv("TEMP_IMAGES_OUTDIR", ""))
# glob returns matches in arbitrary order; sort for a stable listing.
imgs = sorted(glob.glob(f"{imgs_path}/page*.jpeg"))
print(imgs)

### Run the Solution once on the demo to produce an output.json

In [None]:
# Load the system prompt and the output schema shipped with the demo dataset.
with open('../demo/default-dataset/system_prompt.txt', 'r') as file_sys_prompt:
    system_prompt = file_sys_prompt.read()

with open('../demo/default-dataset/output_schema.json', 'r') as file_output_schema:
    output_schema = file_output_schema.read()

input_directory = '../demo/default-dataset/'
#input_directory = '../demo/evaltest-dataset/'

# Create a dict with content key to store the OCR results
ocr_result = {
    "content": ""
}

# Loop over directory and process all PDFs.
# os.listdir returns entries in arbitrary order; sorting keeps the concatenated
# OCR text reproducible between runs.
for file in sorted(os.listdir(input_directory)):
    if file.endswith(".pdf"):
        ocr_result["content"] += get_ocr_results(input_directory+file).content

        # Extract images from the PDF
        convert_pdf_into_image(input_directory+file)

# Ensure the /tmp/ directory exists
imgs_path = "/tmp/"
os.makedirs(imgs_path, exist_ok=True)

# Collect the temporary page images.
# glob returns matches in arbitrary order, so without sorting the max_images
# cut below could keep any pages. Ordering by (length, name) places page_10
# after page_9 -- assumes names differ only by page number (e.g. page_1.png);
# TODO confirm against convert_pdf_into_image.
imgs = sorted(glob.glob(f"{imgs_path}/page*.png"), key=lambda path: (len(path), path))

# Limit images by config
config = Config()
print(f"Config img size: {config.max_images}")
imgs = imgs[:config.max_images]
imgs = [load_image(img) for img in imgs]
print(f"Images count: {len(imgs)}")

# Check and reduce images total size if over 20MB
#max_size = 20 * 1024 * 1024  # 20MB
#while get_size_of_base64_images(imgs) > max_size:
#    imgs.pop()

# Get structured data
structured = get_structured_data(ocr_result["content"], system_prompt, output_schema, imgs)

# Parse structured data and return as JSON
x = parse_json_markdown(structured.content)
response = json.dumps(x)

print(f'Response: {response}')

# Persist the extraction result so the evaluation cells can read it back.
actual_output_path = "/tmp/output.json"
with open(actual_output_path, 'w') as f:
    f.write(response)

### Load the input (an output from previous LLM run) and evaluate using LLM as a judge

In [None]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

from ai_ocr.azure.openai_ops import get_llm

# Read the previous run's output as a single string. readlines() would hand the
# prompt a Python list, whose repr (brackets, quotes, escapes) is not the JSON
# the judge is asked to verify. The variable is deliberately not called
# `input`, which would shadow the builtin for the rest of the notebook.
with open(actual_output_path, 'r') as f:
    judge_input = f.read()

#print(judge_input)

# System instructions for the judge plus a human turn carrying the JSON under
# the "{input}" template variable.
messages = [
        ("system",
         """
         You are the OpenAI model that can describe images provided by the user in extreme detail. 
         The user attached images to this message for you to analyse, there is MOST DEFINITELY an image attached, you will never reply saying that you cannot see the image because the image is absolutely and always attached to this message.
         
         Your tasks are the following:
         1. Verify the data provided in the json input fields against what you can see in the images.
         For each of the field in the input json give an accuracy score compared to the same field you see in the images (from 0 to 10 where 0 is wrong and 10 is '100%' correct).
         In the images, there may be empty or not populated fields, you can ignore these, pay attention to boxes or checkboxes where the value to extract is usually marked with a cross or an 'X'.
         Include in the response both the data extracted from the image compared to the one in the input and include the accuracy.
         
         2. Determine how many fields are present in the input providedcompared to the ones you see in the images.
         Output it with 4 fields: "numberOfFieldsSeenInImages", "numberofFieldsInSchema" also provide a "percentagePresenceAccuracy" which is the ratio between the total fields in the schema and the ones detected in the images, the last field "overallFieldAccuracy" is the sum of the accuracy you gave for each field in percentage.
         Don't include any other information in the response.
                  
         ..and take your time to complete the tasks.
         """
         ),
        ("human", "{input}")
]

prompt = ChatPromptTemplate.from_messages(messages)
# Attach every page image (base64 strings loaded earlier) as its own human message.
if len(imgs) > 0:
    prompt.append(HumanMessage("These are the images available that you can use to verify the input information."))
    print("Good news: I'm appending images to human prompt...")
for img in imgs:
    prompt.append(
        HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}]))

#print(prompt)

model = get_llm()
chain = prompt | model
response = chain.invoke({"input": judge_input})

print(response.content)


In [5]:
# Delete the page images generated during processing.
# Only page*.jpeg / page*.png are removed -- the same patterns this notebook
# uses to collect generated images -- so unrelated images that happen to sit
# in the temp directory are left untouched.
for pattern in ("page*.jpeg", "page*.png"):
    for image_path in glob.glob(os.path.join(imgs_path, pattern)):
        try:
            os.remove(image_path)
            print(f"Deleted image: {image_path}")
        except OSError as e:
            # Best effort: report the failure and carry on with the remaining files.
            print(f"Error deleting image {image_path}: {e}")

Deleted image: /tmp/page_1.png


### Load the input (an output from previous LLM run), ground truth and create a jsonl file

In [None]:
import sys
if module_path not in sys.path:
    sys.path.append(module_path)

import json
import time
from pprint import pprint

def compile_jsonl(ground_truth_path, actual_output_path, output_file, eval_schema_path=None):
    """Bundle ground truth, actual output and evaluation schema into a one-line JSONL file.

    The output file receives a single JSON object with the keys
    ``ground_truth``, ``actual`` and ``eval_schema``, followed by a newline.

    Args:
        ground_truth_path: Path of the JSON file holding the expected values.
        actual_output_path: Path of the JSON file produced by the extraction run.
        output_file: Path of the JSONL file to create or overwrite.
        eval_schema_path: Path of the evaluation-schema JSON file. When omitted,
            the module-level ``eval_schema_path`` variable is used, so existing
            three-argument calls keep working.

    Raises:
        NameError: If ``eval_schema_path`` is neither passed nor defined at
            module level.
        OSError: If any of the files cannot be opened.
        json.JSONDecodeError: If an input file does not contain valid JSON.
    """
    if eval_schema_path is None:
        # Backward-compatible fallback to the notebook-level variable.
        try:
            eval_schema_path = globals()["eval_schema_path"]
        except KeyError:
            raise NameError("name 'eval_schema_path' is not defined") from None

    def _load_json(path):
        # Parse one JSON document from disk.
        with open(path, 'r') as handle:
            return json.load(handle)

    # Read every input before opening the output file: opening with 'w'
    # truncates immediately, so a missing or malformed input would otherwise
    # leave an empty output file behind.
    combined_data = {
        "ground_truth": _load_json(ground_truth_path),
        "actual": _load_json(actual_output_path),
        "eval_schema": _load_json(eval_schema_path),
    }

    # Write the combined data as a single line in the jsonl file
    with open(output_file, 'w') as out_file:
        out_file.write(json.dumps(combined_data) + '\n')


# Resolve the demo dataset artifacts relative to the repository root, then
# combine ground truth, actual output and evaluation schema into eval_data.jsonl.
ground_truth_path, eval_data_path, eval_schema_path = (
    f"{module_path}/demo/default-dataset/{artifact}"
    for artifact in ("ground_truth.json", "eval_data.jsonl", "evaluation_schema.json")
)

compile_jsonl(
    ground_truth_path=ground_truth_path,
    actual_output_path=actual_output_path,
    output_file=eval_data_path,
)



### Evaluate using ground truth

In [None]:
from promptflow.evals.evaluate import evaluate
from src.evaluators.json_evaluator import JsonEvaluator
eval_data_path = f"{module_path}/demo/default-dataset/eval_data.jsonl"
# The file is JSON Lines: parse the first record explicitly. json.load() on the
# whole file only works while it holds exactly one line.
with open(eval_data_path) as file:
    data = json.loads(file.readline())
    ground_truth = data["ground_truth"]
    evaluation_schema = data["eval_schema"]

default_match_evaluator_config = {}
json_evaluator = JsonEvaluator()
evaluators = {"json_evaluator": json_evaluator}
# Map each evaluator argument to the matching column of a JSONL record.
evaluator_config = {
    "json_evaluator": {
        "actual": "${data.actual}",
        "ground_truth": "${data.ground_truth}",
        "eval_schema": "${data.eval_schema}"
    }
}

timestamp = time.strftime("%m_%d.%H.%M.%S")
output_path = f"{module_path}/notebooks/outputs/output_{timestamp}.json"
# Create the outputs folder up front so a fresh checkout does not fail when the
# results are written (presumably evaluate() does not create it -- confirm).
os.makedirs(os.path.dirname(output_path), exist_ok=True)

results = evaluate(
    evaluation_name="test_eval_1",
    data=eval_data_path,
    evaluators=evaluators,
    evaluator_config=evaluator_config,
    output_path=output_path
)
pprint(results)
