In [1]:
import os
import json
import random
from datetime import datetime
import tempfile
import base64
from pathlib import Path
from dotenv import load_dotenv

from cosmosdb_utils import CosmosDBManager
from request_log import RequestLog

from ai_ocr.chains import get_structured_data, get_final_reasoning_from_markdown
from ai_ocr.process import process_pdf

## Static output schema - demo
### Load the demo assets, then call Document Intelligence and subsequently GPT-4T

In [8]:
# Static-schema demo: the prompt and the output schema are both read directly
# from the demo asset files and handed to process_pdf unchanged.
input_path = '../demo/medical/eyes_surgery_pre_1_4.pdf'

# System prompt text, passed to process_pdf as `prompt`.
system_prompt = Path('../demo/medical/system_prompt').read_text()

# Output schema kept as raw JSON text, passed to process_pdf as `json_schema`.
output_schema = Path('../demo/medical/surgery_output_schema.json').read_text()

ocr_response = process_pdf(
    file_to_ocr=input_path,
    prompt=system_prompt,
    json_schema=output_schema,
)
print(f'OCR Response: {ocr_response}')

# Optional follow-up, currently disabled (presumably the natural-language
# summary step — TODO confirm before re-enabling):
#nl_reponse = get_final_reasoning_from_markdown(ocr_response)
#print(f"NL Response: {nl_reponse}")

/home/aga/azureai/azure-doc-extraction-gbb-ai/backend/temp/imgs/page_1_img_1.jpeg
/home/aga/azureai/azure-doc-extraction-gbb-ai/backend/temp/imgs/page_2_img_1.jpeg
OCR Response: {"id": "medical_report", "categorization": "medical_report", "title": "Medical Report", "type": "object", "properties": {"doctor": {"type": "object", "properties": {"specialty": "Oculistica", "name": "Dottore di Ricerca Responsabile Chirurgia Oculistica Clinica Rugani", "clinic": "Clinica Rugani - Siena", "phone": "335-8118324", "fax": "0577-270652"}}, "patient": {"type": "object", "properties": {"name": "Gallo Alberto"}}, "post_surgery_follow_up": {"type": "array", "items": [{"type": "object", "properties": {"period": "1 SETTIMANA", "date": "2002-12-06", "ODv": "Berre", "ODT": "", "OSv": "8/10", "OST": "", "therapy": "Fluoton x4"}}, {"type": "object", "properties": {"period": "1 MESE", "date": "2003-01-21", "ODv": "9/10", "ODT": "17", "OSv": "10/10", "OST": "17", "therapy": "Fluoton 4 volte al d\u00ec"}}, {"ty

## Dynamic output schema - demo

#### (run once) Create the CosmosDB schema for each category

In [2]:
from cosmosdb_utils import CosmosDBManager

db = CosmosDBManager()

# Raw JSON text of the surgery-report output schema from the demo assets.
medical_report_schema = Path('../demo/medical/surgery_output_schema.json').read_text()

# Parse the text and hand it to create_schema under the 'medical_report' key.
# The section heading marks this cell as "run once"; what happens on a second
# call is not visible from this notebook.
parsed_schema = json.loads(medical_report_schema)
db.create_schema('medical_report', parsed_schema)



### Step by step approach
1. Extract the OCR text with Document Intelligence
2. Use GPT-4T to determine the category of the document  
3. Load the corresponding json_schema_output from db where categoryId = category determined  
4. Use GPT-4T to apply the schema to the markdown results of the OCR  
5. (optional) Use GPT-4T to provide a natural language summary of the results

### 1. & 2. Extract + Classification

In [3]:
from ai_ocr.azure.doc_intelligence import get_ocr_results
from ai_ocr.chains import classify_doc_with_llm

input_path = '../demo/medical/eyes_surgery_pre_1_4.pdf'

# Step 1: run OCR on the document. Only `ocr_result.content` is used below.
ocr_result = get_ocr_results(input_path)
# print(ocr_result.content)  # uncomment to inspect the OCR text

# Step 2: classify the OCR text with the LLM, guided by the classification
# prompt stored alongside the demo assets.
with open('../demo/medical/classification_system_prompt', 'r') as prompt_file:
    classification_system_prompt = prompt_file.read()

category = classify_doc_with_llm(ocr_result.content, classification_system_prompt)

# `category.content` holds the label; it is what the schema lookup keys on.
print(category.content)

medical_report


### 3. Load output schema for the right category from CosmosDB

In [10]:
from cosmosdb_utils import CosmosDBManager

# Look up the output schema stored for the category chosen by the classifier.
db = CosmosDBManager()
output_schema = db.read_schema(category.content)

# What read_schema returns for an unknown category is not visible from this
# notebook. The original check only matched the literal string 'None', so a
# real None (or an empty result) would fall through to the else branch and
# fail on output_schema['title']. Treat all of these as "schema missing".
if not output_schema or output_schema == 'None':
    print(f'Schema not found for category: {category.content}')
else:
    print(output_schema['title'])

Medical Report


### 4. Use GPT-4T to apply the schema to the OCR results

In [11]:
# System prompt text, passed to process_pdf as `prompt`.
system_prompt = Path('../demo/medical/system_prompt').read_text()

# `output_schema` is the object returned by the schema lookup; process_pdf is
# given its JSON text form, so serialize it first.
schema_json = json.dumps(output_schema)

ocr_response = process_pdf(
    file_to_ocr=input_path,
    prompt=system_prompt,
    json_schema=schema_json,
)

print(f'OCR Response: {ocr_response}')
            


/home/aga/azureai/azure-doc-extraction-gbb-ai/backend/temp/imgs/page_1_img_1.jpeg
/home/aga/azureai/azure-doc-extraction-gbb-ai/backend/temp/imgs/page_2_img_1.jpeg
OCR Response: {"id": "medical_report", "categorization": "", "title": "Medical Report", "type": "object", "properties": {"doctor": {"type": "object", "properties": {"specialty": "Oculistica", "name": "Dottore di Ricerca Responsabile Chirurgia Oculistica Clinica Rugani", "clinic": "Clinica Rugani - Siena", "phone": "335-8118324", "fax": "0577-270652"}}, "patient": {"type": "object", "properties": {"name": "Gallo Alberto"}}, "post_surgery_follow_up": [{"period": "1 SETTIMANA", "date": "2002-12-06", "ODv": "Berre", "ODT": "", "OSv": "8/10", "OST": "", "therapy": "Fluoton x4"}, {"period": "1 MESE", "date": "2003-01-21", "ODv": "9/10", "ODT": "17", "OSv": "10/10", "OST": "17", "therapy": "Fluoton"}, {"period": "2-3 MESI", "date": "2003-03-18", "ODv": "10/10", "ODT": "18", "OSv": "10/10", "OST": "18", "therapy": "Fluoton x3 volte"

In [12]:
# Persist the extraction result next to the notebook.
# The encoding is pinned to UTF-8 so the file does not depend on the platform's
# locale default (e.g. cp1252 on Windows), which can fail or produce a file
# that is not valid JSON interchange text when the result contains non-ASCII
# characters.
with open('output.json', 'w', encoding='utf-8') as output_file:
    output_file.write(ocr_response)

### DB management

In [3]:
import pandas as pd

# Fetch logged requests from CosmosDB. The argument 10 is presumably a cap on
# the number of entries returned — TODO confirm against CosmosDBManager.
db = CosmosDBManager()
history = db.list_all_requests(10)

# Build the frame straight from the returned records; the bare `df` on the
# last line renders it with the notebook's rich display.
df = pd.DataFrame(history)
df

Unnamed: 0,id,request_filename,request_timestamp,total_time_seconds,model_output,_rid,_self,_etag,_attachments,_ts
0,50692692,1.4.1.1 Zahnarzt.pdf,2024-04-30T13:07:51.262541,88.761757,"""The document is an invoice from Zahnklinik Us...",HOEEAJEW7ckBAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""c301d7cc-0000-4700-0000-6630d1610000""",attachments/,1714475361
1,63315044,1.4.1.1 Zahnarzt.pdf,2024-04-30T17:06:46.154020,86.945442,"""The document is a medical record from a clini...",HOEEAJEW7ckCAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""d501038c-0000-4700-0000-6631095e0000""",attachments/,1714489694
2,55312920,20220215_Austrittsbericht_Chirurgie_Stadtspita...,2024-04-30T17:09:14.051358,87.259308,"""The document is a medical record from a hospi...",HOEEAJEW7ckDAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""d501e8af-0000-4700-0000-663109f20000""",attachments/,1714489842
3,35705906,20170712_Operationsbericht_Chirurgie_Stadtspit...,2024-04-30T17:11:43.139089,95.006221,"""Esther Tester, born on 22nd September 1953, w...",HOEEAJEW7ckEAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""d50185d4-0000-4700-0000-66310a8f0000""",attachments/,1714489999
4,95108072,20150713_Austrittsbericht_Chirurgie_Stadtspita...,2024-05-03T09:21:28.230468,83.31432,"""Esther Tester, born on 22nd September 1953, w...",HOEEAJEW7ckFAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""0100bb98-0000-4700-0000-663490cb0000""",attachments/,1714720971
5,70373760,20220927_Befund_Radiologie.pdf,2024-05-03T09:28:31.253043,82.734924,"""Monika Tester, born on 05.12.1978, underwent ...",HOEEAJEW7ckGAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""0100199d-0000-4700-0000-663492710000""",attachments/,1714721393
6,96310883,20230120_MRI_Befund_Neurologie_USZ.pdf,2024-05-07T09:26:22.392612,151.342382,"""Testata Irina, born on 02.02.1990, underwent ...",HOEEAJEW7ckHAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""1100cdb0-0000-4700-0000-6639d8350000""",attachments/,1715066933
7,78751310,20150713_Austrittsbericht_Chirurgie_Stadtspita...,2024-05-07T09:30:45.949465,86.635126,"""Esther Tester, born on September 22, 1953, wa...",HOEEAJEW7ckIAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""1100f4b2-0000-4700-0000-6639d8fd0000""",attachments/,1715067133
8,71421922,885779_ACORD_129_2009_11_3p.pdf,2024-05-10T16:38:45.426824,226.3123,"""The document details an insurance policy for ...",HOEEAJEW7ckJAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""1e007c32-0000-4700-0000-663e32580000""",attachments/,1715352152
9,36379475,2.3.1.4 NoTarmed Beleg Ergotherapie.pdf,2024-05-14T14:45:19.787327,80.044383,"""The document is a \""Generic Invoice\"" categor...",HOEEAJEW7ckKAAAAAAAAAA==,dbs/HOEEAA==/colls/HOEEAJEW7ck=/docs/HOEEAJEW7...,"""1400a37f-0000-4700-0000-66435d300000""",attachments/,1715690800


In [4]:
# Bookkeeping columns that appear after `model_output` in the request-history
# frame (they were columns 5-9 when this cell was first written).
cosmos_system_columns = ['_rid', '_self', '_etag', '_attachments', '_ts']

# Drop by name and rebind instead of dropping by position in place: the cell
# can then be re-run safely (errors='ignore' makes the second run a no-op,
# where the positional version raised IndexError on the already-trimmed frame)
# and it no longer removes the wrong columns if the column order changes.
df = df.drop(columns=cosmos_system_columns, errors='ignore')
df

Unnamed: 0,id,request_filename,request_timestamp,total_time_seconds,model_output
0,50692692,1.4.1.1 Zahnarzt.pdf,2024-04-30T13:07:51.262541,88.761757,"""The document is an invoice from Zahnklinik Us..."
1,63315044,1.4.1.1 Zahnarzt.pdf,2024-04-30T17:06:46.154020,86.945442,"""The document is a medical record from a clini..."
2,55312920,20220215_Austrittsbericht_Chirurgie_Stadtspita...,2024-04-30T17:09:14.051358,87.259308,"""The document is a medical record from a hospi..."
3,35705906,20170712_Operationsbericht_Chirurgie_Stadtspit...,2024-04-30T17:11:43.139089,95.006221,"""Esther Tester, born on 22nd September 1953, w..."
4,95108072,20150713_Austrittsbericht_Chirurgie_Stadtspita...,2024-05-03T09:21:28.230468,83.31432,"""Esther Tester, born on 22nd September 1953, w..."
5,70373760,20220927_Befund_Radiologie.pdf,2024-05-03T09:28:31.253043,82.734924,"""Monika Tester, born on 05.12.1978, underwent ..."
6,96310883,20230120_MRI_Befund_Neurologie_USZ.pdf,2024-05-07T09:26:22.392612,151.342382,"""Testata Irina, born on 02.02.1990, underwent ..."
7,78751310,20150713_Austrittsbericht_Chirurgie_Stadtspita...,2024-05-07T09:30:45.949465,86.635126,"""Esther Tester, born on September 22, 1953, wa..."
8,71421922,885779_ACORD_129_2009_11_3p.pdf,2024-05-10T16:38:45.426824,226.3123,"""The document details an insurance policy for ..."
9,36379475,2.3.1.4 NoTarmed Beleg Ergotherapie.pdf,2024-05-14T14:45:19.787327,80.044383,"""The document is a \""Generic Invoice\"" categor..."
