In [1]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

from cosmosdb_utils import CosmosDBManager
from request_log import RequestLog

from langchain_core.messages import HumanMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import SystemMessage
from langchain_core.prompts import HumanMessagePromptTemplate

from ai_ocr.azure.openai_ops import get_llm
from ai_ocr.azure.doc_intelligence import get_ocr_results


### Let GPT4T create a new schema for you based on an input document

In [4]:
# Document to analyse and the category key the generated schema is tagged with.
input_path = '../demo/claims/Invoice Sample.pdf'
category_id = 'invoice'

# Extract the document text; only `ocr_result.content` is forwarded to the model.
ocr_result = get_ocr_results(input_path)

# `{category_id}` and `{input}` are template variables that `chain.invoke` fills in
# below. Using a real placeholder (rather than concatenating `category_id` into the
# template text) means the value passed to `invoke` is actually used, and a category
# id containing curly braces cannot be mis-read as a template variable.
messages = [
        ("system",
         """
         From the input I provide you. Determine a JSON schema that fits all the data. 
         Be sure to include these following fields to start with:
         "id" : {category_id},
         "categorization" : "string",
         "title": "string",
         "type": "object",
         "properties" :
         then you continue creating the scehma with all the other data fields you see in the input under the "properties" object as root.
         No need to put the "required" array properties for each object.
         Don't fit the data of the input in the schema, just provde the schema definition with the fields and the types.
         Your response should only contain the schema definition starting and ending with curly brackets.
            
         """
         ),
        ("human", "{input}")
]

prompt = ChatPromptTemplate.from_messages(messages)

# Pipe the rendered prompt into the chat model and run it once on the OCR text.
model = get_llm()
chain = prompt | model
response = chain.invoke({"input": ocr_result.content, "category_id": category_id})

# print() keeps the JSON line breaks readable in the cell output.
print(response.content)

{
  "id": "invoice",
  "categorization": "string",
  "title": "string",
  "type": "object",
  "properties": {
    "invoiceNumber": {
      "type": "integer"
    },
    "date": {
      "type": "string"
    },
    "billTo": {
      "type": "object",
      "properties": {
        "customerName": {
          "type": "string"
        },
        "customerId": {
          "type": "integer"
        },
        "address": {
          "type": "string"
        },
        "phone": {
          "type": "string"
        }
      }
    },
    "shipTo": {
      "type": "object",
      "properties": {
        "recipientName": {
          "type": "string"
        },
        "address": {
          "type": "string"
        },
        "phone": {
          "type": "string"
        }
      }
    },
    "paymentDue": {
      "type": "string"
    },
    "deliveryDate": {
      "type": "string"
    },
    "salesperson": {
      "type": "string"
    },
    "paymentTerms": {
      "type": "string"
    },
    "shippi

#### CosmosDB - save a new schema for a specific category (key)

In [5]:
# Display the raw model reply as a Python string (escaped newlines visible) before it is parsed with json.loads.
response.content


'{\n  "id": "invoice",\n  "categorization": "string",\n  "title": "string",\n  "type": "object",\n  "properties": {\n    "invoiceNumber": {\n      "type": "integer"\n    },\n    "date": {\n      "type": "string"\n    },\n    "billTo": {\n      "type": "object",\n      "properties": {\n        "customerName": {\n          "type": "string"\n        },\n        "customerId": {\n          "type": "integer"\n        },\n        "address": {\n          "type": "string"\n        },\n        "phone": {\n          "type": "string"\n        }\n      }\n    },\n    "shipTo": {\n      "type": "object",\n      "properties": {\n        "recipientName": {\n          "type": "string"\n        },\n        "address": {\n          "type": "string"\n        },\n        "phone": {\n          "type": "string"\n        }\n      }\n    },\n    "paymentDue": {\n      "type": "string"\n    },\n    "deliveryDate": {\n      "type": "string"\n    },\n    "salesperson": {\n      "type": "string"\n    },\n    "payme

In [6]:

db = CosmosDBManager()

# The prompt asks for a bare JSON object, but chat models sometimes wrap the answer
# in a Markdown code fence or add surrounding prose. Keep only the outermost {...}
# span so json.loads sees just the schema; a clean reply is parsed unchanged.
raw_schema = response.content
obj_start = raw_schema.find("{")
obj_end = raw_schema.rfind("}")
if obj_start == -1 or obj_end < obj_start:
    # ValueError is also the base class of json.JSONDecodeError, so existing
    # handling of parse failures keeps working.
    raise ValueError("LLM response does not contain a JSON object; cannot save schema")
schema = json.loads(raw_schema[obj_start:obj_end + 1])

# Store the parsed schema under the category key chosen earlier in the notebook.
db.create_schema(category_id, schema)


#### CosmosDB - list all schemas

In [7]:
# Fetch the stored schemas through a fresh CosmosDBManager handle.
db = CosmosDBManager()

schemas = db.list_all_schema()
# Tabulate the result; the bare trailing `df` renders the frame as the cell output.
df = pd.DataFrame.from_dict(schemas)
df

Unnamed: 0,id,categorization,title,type,properties,_rid,_self,_etag,_attachments,_ts
0,invoice,string,string,object,"{'invoiceNumber': {'type': 'integer'}, 'date':...",HOEEAJKheGIBAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJKheGI=/docs/HOEEAJKhe...,"""080085a4-0000-4700-0000-664db8850000""",attachments/,1716369541


#### CosmosDB - delete all schemas (clean container)

In [2]:
# WARNING: destructive. Per the section heading this cleans the schema container,
# so run it deliberately rather than as part of "Run All".
db = CosmosDBManager()
db.delete_all_schema()

#### CosmosDB - list all processed requests (with a maximum count)

In [None]:

db = CosmosDBManager()
# NOTE(review): 10 is presumably the maximum number of requests to return (the
# heading says "with max") — confirm against CosmosDBManager.list_all_requests.
history = db.list_all_requests(10)
# Tabulate the result; the bare trailing `df` renders the frame as the cell output.
df = pd.DataFrame.from_dict(history)
df