In [None]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path

import glob
import json

import sys

# Put the function-app source directory on sys.path; the project imports that
# follow are presumably resolved from there.
# os.path.join uses the platform's own separator, so the same lines work on
# Windows and POSIX without editing.
module_path = os.path.abspath(os.path.join('..'))
functionapp_path = os.path.join(module_path, "src", "functionapp")
# Test for the exact entry being added: checking `module_path` instead would
# let every re-run of this cell append another duplicate entry.
if functionapp_path not in sys.path:
    sys.path.append(functionapp_path)

from ai_ocr.azure.openai_ops import load_image, get_size_of_base64_images
from ai_ocr.azure.images import convert_pdf_into_image
from ai_ocr.model import Config
from ai_ocr.chains import get_structured_data
from ai_ocr.azure.doc_intelligence import get_ocr_results

from langchain_core.output_parsers.json import parse_json_markdown

from dotenv import load_dotenv
load_dotenv()

### Prepare images

In [None]:
# Sanity check: list the page images currently present in the temp folder
# named by the TEMP_IMAGES_OUTDIR environment variable.

input_path = '../demo/default-dataset/Invoice Sample.pdf'
# Remove the file name from the sample path, leaving its directory part.
pdf_filename = input_path.rsplit("/", 1)[-1]
pdf_path = input_path.replace(pdf_filename, "")
print(pdf_path)

# When the variable is unset the fallback "" makes this the working directory.
temp_images_dir = os.getenv("TEMP_IMAGES_OUTDIR", "")
imgs_path = os.path.join(os.getcwd(), temp_images_dir)
imgs = glob.glob(imgs_path + "/page*.jpeg")
print(imgs)

### Run the solution once on the demo dataset to produce an output.json

In [None]:
# Load the system prompt and the output schema of the demo dataset as text.
# The encoding is pinned so both files decode the same way on every platform;
# without it open() uses the locale encoding (e.g. cp1252 on Windows).
# NOTE(review): assumes both files are stored as UTF-8 — confirm.
with open('../demo/default-dataset/system_prompt.txt', 'r', encoding='utf-8') as file_sys_prompt:
    system_prompt = file_sys_prompt.read()

with open('../demo/default-dataset/output_schema.json', 'r', encoding='utf-8') as file_output_schema:
    output_schema = file_output_schema.read()

input_directory = '../demo/default-dataset/'

# Accumulates the OCR text of every processed PDF under the "content" key.
ocr_result = {
    "content": ""
}

# Process every PDF in the directory. os.listdir() returns entries in
# arbitrary order, so the names are sorted to make the concatenated OCR text
# reproducible from one run to the next.
for file_name in sorted(os.listdir(input_directory)):
    if not file_name.endswith(".pdf"):
        continue
    pdf_file = os.path.join(input_directory, file_name)
    ocr_result["content"] += get_ocr_results(pdf_file).content

    # Extract images from the PDF
    convert_pdf_into_image(pdf_file)
    
# Ensure the /tmp/ directory exists
imgs_path = "/tmp/"
os.makedirs(imgs_path, exist_ok=True)


def _page_number(image_file):
    """Return the digits in the file's stem as an int (0 when there are none)."""
    digits = "".join(ch for ch in Path(image_file).stem if ch.isdigit())
    return int(digits) if digits else 0


# Collect the page images. glob() returns matches in arbitrary order, so they
# are sorted by the number embedded in the file name (the name itself breaks
# ties). The truncation and size trimming below then always keep the leading
# pages rather than an arbitrary subset.
image_files = sorted(
    glob.glob(os.path.join(imgs_path, "page*.jpeg")),
    key=lambda image_file: (_page_number(image_file), image_file),
)

# Limit images by config
config = Config()
imgs = [load_image(image_file) for image_file in image_files[:config.max_images]]

# Drop trailing images until the total size fits the limit taken from
# config.gpt_vision_limit_mb (converted from MB to bytes). The `imgs and`
# guard keeps pop() from ever being called on an empty list.
max_size = config.gpt_vision_limit_mb * 1024 * 1024
while imgs and get_size_of_base64_images(imgs) > max_size:
    imgs.pop()

# Get structured data
structured = get_structured_data(ocr_result["content"], system_prompt, output_schema, imgs)

# Delete the page images generated for this run. Only files carrying the
# `page` prefix (the prefix the image glob in this cell looks for) are removed,
# so unrelated .jpeg/.png files that other programs keep in the shared temp
# directory are left alone.
# NOTE(review): assumes convert_pdf_into_image names every file it writes
# `page*` — confirm against its implementation.
for image_name in os.listdir(imgs_path):
    if not (image_name.startswith("page") and image_name.endswith((".jpeg", ".png"))):
        continue
    image_path = os.path.join(imgs_path, image_name)
    try:
        os.remove(image_path)
        print(f"Deleted image: {image_path}")
    except OSError as e:
        # Best effort: report the failure and carry on with the other files.
        print(f"Error deleting image {image_path}: {e}")
    
# Extract the JSON payload from the model reply and serialise it to a string.
x = parse_json_markdown(structured.content)
response = json.dumps(x)
print("Response: " + response)

# Save the JSON string as output.json in the working directory.
with open('output.json', 'w') as output_file:
    output_file.write(response)

### Load the input (the output of the previous LLM run) and the evaluator system prompt

In [None]:
from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

from ai_ocr.azure.openai_ops import get_llm

# Read the JSON written by the extraction run as a single string. readlines()
# would return a list, and formatting a list into the prompt sends its Python
# repr (brackets, quotes, escapes) to the model instead of the JSON itself.
# The variable is deliberately not called `input`, which would shadow the
# builtin for the rest of the kernel session.
with open('output.json', 'r') as f:
    extraction_json = f.read()

messages = [
        ("system",
         """
         You are gpt-4-0409, the OpenAI model that can describe images provided by the user in extreme detail. The user attached images to this message for you to analyse, there is MOST DEFINITELY an image attached, you will never reply saying that you cannot see the image because the image is absolutely and always attached to this message.
         
         Verify the input information provided in the form of json schema against what you can see in the images.
         Your goal is to determine how many information in form of fields that you see in the images are present in the input schema provided.
         Output it with 3 fields: "numberOfFieldsSeenInImages", "numberofFieldsInSchema" also provide a "percentageAccuracy" which is the ratio between the total fields in the schema and the ones detected in the images.

         ..and hey come on don't be lazy, nor tell me that you cannot do it, I trust you!
         """
         ),
        ("human", "{input}")
]

# Start from the system + human template, then append one extra human message
# per image so the model can compare the extracted fields with the pages.
prompt = ChatPromptTemplate.from_messages(messages)
if imgs:
    prompt.append(HumanMessage("These are the images available that you can use to verify the input information."))
    print("Good news: I'm appending images to human prompt...")
for img in imgs:
    prompt.append(
        HumanMessage(content=[{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img}"}}]))

model = get_llm()
chain = prompt | model
# "input" is the template variable declared in the human message above.
response = chain.invoke({"input": extraction_json})

print(response.content)


Based on the images provided and the JSON schema, here is the analysis:

**Fields Seen in Images:**
1. Invoicer Name: "AMANN.ch AG"
2. Invoicer Address: "Rosentalstr. 20 4058 Basel"
3. Invoicer Telephone: "061 683 10 10"
4. Transaction Date: "23.01.2024"
5. Item Description: "Sigvaris Medizinische Kompressionsstrümpfe, Schenkelstrümpfe A-G, Klasse II, Standard, pro Paar"
6. Item Quantity: 3
7. Item Price: 462.0
8. Total Amount: 462.0
9. Amount Received: 462.0
10. Change Given: 0.0
11. VAT Rate: "8.10"
12. VAT Amount: 34.62
13. VAT Code: 1

**Total Fields in JSON Schema:**
1. Invoicer Name
2. Invoicer Address
3. Invoicer Telephone
4. Invoicer Fax
5. Invoicer Email
6. Invoicer Tax Number
7. Transaction Date
8. Transaction Time
9. Item Description
10. Item Quantity
11. Item Unit Weight
12. Item Price
13. Total Amount
14. Amount Received
15. Change Given
16. VAT Code
17. VAT Rate
18. VAT Total
19. VAT Amount

**Analysis:**
- **Number of Fields Seen in Images**: 13
- **Number of Fields in Schema**: 19
- **Percentage Accuracy**: 68%