In [2]:
import os
from dotenv import load_dotenv

In [3]:
from openai import OpenAI

# OpenAI() reads OPENAI_API_KEY from the environment at construction time, so the
# .env file must be loaded *before* the client is created. Without this, a fresh
# "Restart Kernel -> Run All" fails here when the key only lives in .env.
# load_dotenv() is idempotent, so calling it again in a later cell is harmless.
load_dotenv()
client = OpenAI()


In [4]:
load_dotenv()

True

In [48]:
from typing import List

from pydantic import BaseModel, Field, field_validator

class BillItem(BaseModel):
    """One billable row extracted from an invoice table.

    The numeric fields default to 0.0 when the key is missing from the JSON.
    An explicit ``null`` (or a blank string) is treated the same way: the LLM
    occasionally emits e.g. ``"item_rate": null`` despite the prompt, and
    without this coercion that single value fails validation and the whole
    page is discarded.
    """

    item_name: str
    # if field missing in JSON, default to 0.0
    item_amount: float = 0.0
    item_rate: float = 0.0
    item_quantity: float = 0.0

    @field_validator("item_amount", "item_rate", "item_quantity", mode="before")
    @classmethod
    def coerce_missing_number(cls, value):
        """Map ``None`` / blank strings to 0.0 before float validation runs."""
        if value is None:
            return 0.0
        if isinstance(value, str) and not value.strip():
            return 0.0
        return value

class PageLineItems(BaseModel):
    """All bill items extracted for a single page of the invoice."""

    # Page number as an integer (required).
    page_no: int
    # Page classification label; defaults to "Bill Detail" when not supplied.
    page_type: str = "Bill Detail"
    # Line items for this page; a fresh empty list is created per instance.
    bill_items: List[BillItem] = Field(default_factory=list)

class TokenUsage(BaseModel):
    """Token counters for the LLM calls; every counter defaults to 0."""

    total_tokens: int = 0
    input_tokens: int = 0
    output_tokens: int = 0

class InvoiceData(BaseModel):
    """Payload of the extraction result: per-page items, item count and token usage."""

    # One entry per page; defaults to an empty list.
    pagewise_line_items: List[PageLineItems] = Field(default_factory=list)
    # Number of bill items; defaults to 0 (not derived automatically from the list).
    total_item_count: int = 0
    # Token counters; defaults to an all-zero TokenUsage.
    token_usage: TokenUsage = Field(default_factory=TokenUsage)

class FinalOutput(BaseModel):
    """Top-level response envelope: a success flag plus the invoice data.

    Both fields are required (neither declares a default).
    """

    is_success: bool
    data: InvoiceData


In [6]:
def validate_invoice_json(invoice_json: dict) -> FinalOutput:
    """Validate raw LLM JSON against the ``FinalOutput`` schema.

    Args:
        invoice_json: Parsed JSON object produced by the extraction step.

    Returns:
        The validated ``FinalOutput`` model.

    Raises:
        pydantic.ValidationError: If the payload does not match the schema.
            ``model_validate`` is used instead of ``FinalOutput(**invoice_json)``
            so that a payload that is not a JSON object (e.g. a list or a
            string) is also reported as a ``ValidationError`` rather than an
            unrelated ``TypeError`` from argument unpacking.
    """
    return FinalOutput.model_validate(invoice_json)


In [7]:
import os, json, base64
from io import BytesIO
import pdfplumber, shutil
from pdf2image import convert_from_path
from PIL import Image
from openai import OpenAI
from dotenv import load_dotenv
from pydantic import ValidationError

In [87]:



def extract_text_digital(pdf_path, page_index):
    """Return the embedded text of one PDF page, or "" when there is none.

    Opens ``pdf_path`` with pdfplumber, reads the page at the zero-based
    ``page_index`` and closes the file again before returning.
    """
    with pdfplumber.open(pdf_path) as document:
        extracted = document.pages[page_index].extract_text()
    return extracted if extracted else ""


In [54]:
PDF_PATH = "train_sample_10.pdf"

In [88]:
extract_text_digital(PDF_PATH, 2)

'Name : Mrs. GOPA DUTTA IP No : AMHLIP398580 Bill No : INT2043376\nPayer Payable\nSL. Service Service Name Alias Code Dr. Name Start Date End Date Qty/Duration RefTariff Dis( %)Amount(`)\nNo code ( Order No. ) ( in mins )\nConsultation(999311 )\nConsultation\n1 19631 IP CONSULTATION Dr. Soumen 1.00 1,000.00 0 1,000.00\nCHARGES Kar (\nORTHOPAEDIC\nS )\n2 19631 IP CONSULTATION Dr. Soumen 1.00 1,000.00 0 1,000.00\nCHARGES Kar (\nORTHOPAEDIC\nS )\n3 19631 IP CONSULTATION Dr. RANJAN 1.00 1,000.00 0 1,000.00\nCHARGES KAMILYA (\nORTHOPAEDIC\nS )\n4 19631 IP CONSULTATION Dr. Nilanchal 1.00 1,000.00 0 1,000.00\nCHARGES Chakraborty (\nCRITICAL\nCARE )\n5 19631 IP CONSULTATION Dr. Nilanchal 1.00 1,000.00 0 1,000.00\nCHARGES Chakraborty (\nCRITICAL\nCARE )\n6 19631 IP CONSULTATION Dr. RANJAN 1.00 1,000.00 0 1,000.00\nCHARGES KAMILYA (\nORTHOPAEDIC\nS )\n7 19631 IP CONSULTATION Dr. RANJAN 1.00 1,000.00 0 1,000.00\nCHARGES KAMILYA (\nORTHOPAEDIC\nS )\n8 19631 IP CONSULTATION Dr. Bikram Das 1.00 1,00

In [11]:
import fitz  # PyMuPDF
import base64
from io import BytesIO
from PIL import Image

In [12]:
import fitz  # PyMuPDF
import base64
from io import BytesIO
from PIL import Image

def _pil_to_jpeg_b64(pil_img):
    """Encode a PIL image as JPEG (quality 90) and return it as base64 text."""
    if pil_img.mode != "RGB":
        # JPEG cannot store alpha/palette modes; normalise before saving.
        pil_img = pil_img.convert("RGB")
    buf = BytesIO()
    pil_img.save(buf, format="JPEG", quality=90)
    return base64.b64encode(buf.getvalue()).decode()


def page_to_image_b64(pdf_path, page_index, all_images=None):
    """Return page ``page_index`` of a PDF as a base64-encoded image string.

    Args:
        pdf_path: Path of the PDF. Only opened when no usable cached image exists.
        page_index: Zero-based page index.
        all_images: Optional per-page cache. An entry may be already-encoded
            image bytes (base64-encoded unchanged) or a PIL image, e.g. from
            ``pdf2image.convert_from_path`` (encoded as JPEG here).

    Returns:
        Base64 text (``str``) of the image data.

    Raises:
        IndexError: If ``page_index`` is outside the cache or the document.
    """
    if all_images:
        cached = all_images[page_index]
        # Cached object is already-encoded bytes (e.g. from PyMuPDF).
        if isinstance(cached, bytes):
            return base64.b64encode(cached).decode()
        # Cached PIL image: previously ignored, which forced a redundant re-render.
        if isinstance(cached, Image.Image):
            return _pil_to_jpeg_b64(cached)

    # The context manager closes the document; the handle used to leak on every call.
    with fitz.open(pdf_path) as doc:
        pix = doc[page_index].get_pixmap()
        # Image.frombytes copies the pixel data, so the image outlives the document.
        pil_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

    return _pil_to_jpeg_b64(pil_img)

In [13]:
doc = fitz.open(PDF_PATH)

In [14]:
print(len(doc))

3


In [15]:
page_to_image_b64(PDF_PATH, 2)

'/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAMCAgMCAgMDAwMEAwMEBQgFBQQEBQoHBwYIDAoMDAsKCwsNDhIQDQ4RDgsLEBYQERMUFRUVDA8XGBYUGBIUFRT/2wBDAQMEBAUEBQkFBQkUDQsNFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBQUFBT/wAARCAMYAmQDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwD9U6KKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKACiiigAooooAKKKKAC

In [16]:
def ocr_page_with_vision(b64_img):
    """Ask the vision model to transcribe the invoice text in a base64 JPEG.

    Sends a single user message (a text instruction plus the image as a
    ``data:image/jpeg;base64,...`` URL) to ``gpt-4o-mini`` through the
    module-level ``client`` and returns the content of the first choice.
    """
    instruction_part = {"type": "text", "text": "Extract invoice text"}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/jpeg;base64,{b64_img}"},
    }
    user_message = {"role": "user", "content": [instruction_part, image_part]}

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [17]:
ocr_page_with_vision(page_to_image_b64(PDF_PATH, 1))

'Here is the extracted text from the invoice:\n\n| S/# | Description        | Cpt Code   | Date       | Qty | Rate   | Gross Amount | Discount |\n|-----|--------------------|------------|------------|-----|--------|--------------|----------|\n| 92  | Livi 300mg Tab     |            | 20/11/2025 | 14  | 32.00  | 448.00       | 0.00     |\n| 93  | Metunoro           |            | 20/11/2025 | 7   | 17.72  | 124.03       | 0.00     |\n| 94  | Pizat 4.5         |            | 20/11/2025 | 2   | 419.06 | 838.12       | 0.00     |\n| 95  | Supralite Os Syp   |            | 20/11/2025 | 1   | 289.69 | 289.69       | 0.00     |\n|     | **Category Total**  |            |            |     |        | **14,941.35** | **0.00** |'

In [76]:

def _strip_code_fence(raw):
    """Remove a surrounding Markdown code fence (``` or ```json) from ``raw``."""
    cleaned = raw.strip()
    if not cleaned.startswith("```"):
        return cleaned
    cleaned = cleaned.strip("`").lstrip()
    # Drop the language tag only when it directly follows the opening fence.
    # Replacing the first "json" anywhere would corrupt a payload that merely
    # contains that word when the fence carries no language tag.
    if cleaned[:4].lower() == "json":
        cleaned = cleaned[4:]
    return cleaned.strip()


def extract_structured_from_text(text):
    """Turn raw invoice text into a JSON string of structured bill items.

    Sends ``text`` inside an extraction prompt to ``gpt-4o-mini`` via the
    module-level ``client`` and parses the reply. When the reply is a JSON
    object and the API reported usage, the real token counts overwrite
    ``data.token_usage`` (the model itself can only echo the zeros from the
    template).

    Args:
        text: Invoice text (embedded PDF text or OCR output) for one page.

    Returns:
        A JSON string. If the reply cannot be parsed as JSON, the cleaned raw
        reply is returned unchanged so the caller can inspect it; callers
        must therefore be prepared for ``json.loads`` to fail on the result.
    """
    prompt = f"""
You are extracting structured medical/hospital billing data for insurance processing.
The input may be a digital PDF, scanned PDF, or handwritten invoice image.

First, determine the main bill details and the billing table content, then extract **only valid billing rows**.

### Table understanding:
Invoices are usually displayed as a table with columns similar to:
- S.No / Serial Number
- Service / Description / Item
- Rate / Unit Price
- Quantity / Qty / Units / Days
- Net Amount / Net Amt / Total Amount / Charge for that row

### Your task:
1. Treat each page as one bill document.
2. For each table row that represents a real billable charge, return ONE structured object inside `bill_items`.
3. Use only the "Net Amount / Net Amt" column for `item_amount`.
4. Use the "Rate / Unit Price" column for `item_rate`.
5. Use the "Quantity / Units / Days / Qty / Units" column for `item_quantity`.
6. IGNORE ALL of the following:
   - Section/heading rows (e.g., CONSULTATION, ROOM CHARGES, LAB CHARGES, RADIOLOGY, SURGERY).
   - Subtotal rows or calculated totals for a section (e.g., 1950.00, 4850.00, 12000.00 etc.).
   - Grand total, round off, discount, payable amount.
   - Any row whose description is only a total label (TOTAL, Totals, Grand Total, Subtotal, Net Total).
   - Duplicate or repeated rows referring to the same table entry.
   - Watermarks, logos, links, request formats, or unrelated handwritten notes.

7. If a row is a medical bill but rate or quantity is missing, set them to `0.0` (never null).
8. If a row is unrelated or invalid, do not include it in JSON.
9. Do **not** merge multiple rows into one. One row = one `bill_items` object.

Output JSON in this exact format:
{{
  "is_success": true or false,
  "data": {{
    "pagewise_line_items": [
      {{
        "page_no": 1,
        "page_type": "Bill Detail",
        "bill_items": [
          {{
            "item_name": "string",
            "item_amount": 0.0,
            "item_rate": 0.0 ,
            "item_quantity": 0.0
          }}
        ]
      }}
    ],
    "total_item_count": 0,
    "token_usage": {{
      "input_tokens": 0,
      "output_tokens": 0,
      "total_tokens": 0
    }}
  }}
}}
Only return JSON.





Rules:
- item_amount, item_rate, item_quantity must be numbers (no strings).
- If Rate or Quantity is missing on the row, use 0.0, not null.
- Do not include any row whose Description contains only words like "Total", "Totals", "Grand Total".

Only return valid JSON. No explanation.

Bill text (can include other noise, focus on the bill table):

\"\"\"{text}\"\"\"
"""

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        # JSON mode: the reply is constrained to a single JSON object, which
        # removes most fence/explanation noise. The prompt already mentions
        # "JSON", as this mode requires.
        response_format={"type": "json_object"},
    )

    usage = getattr(resp, "usage", None)

    # message.content can be None (e.g. refusals); treat that as an empty reply.
    raw = _strip_code_fence(resp.choices[0].message.content or "")

    try:
        data = json.loads(raw)
    except json.JSONDecodeError:
        # Hand the unparsed reply back so the caller can log/debug it.
        return raw

    # Only a JSON object can carry token usage; lists/scalars pass through as-is.
    if usage is not None and isinstance(data, dict):
        payload = data.get("data")
        if not isinstance(payload, dict):
            payload = data["data"] = {}
        token_usage = payload.get("token_usage")
        if not isinstance(token_usage, dict):
            token_usage = payload["token_usage"] = {}

        token_usage["input_tokens"] = usage.prompt_tokens
        token_usage["output_tokens"] = usage.completion_tokens
        token_usage["total_tokens"] = usage.total_tokens

    # Return a string again so the outer code still does json.loads(...)
    return json.dumps(data)

In [19]:
def fix_inconsistent_amounts(bill_items, tolerance=1e-2):
    """Overwrite amounts that disagree with ``rate * quantity``.

    For every item whose rate and quantity are both positive, the amount is
    replaced by ``round(rate * quantity, 2)`` when it differs from that
    product by more than ``tolerance``. Missing (falsy) values count as 0.0.
    Items are modified in place and the same list object is returned.
    """
    for entry in bill_items:
        unit_rate = entry.item_rate or 0.0
        quantity = entry.item_quantity or 0.0
        stated = entry.item_amount or 0.0

        if not (unit_rate > 0 and quantity > 0):
            continue

        computed = unit_rate * quantity
        # The computed product wins over a noisy extracted amount.
        if abs(computed - stated) > tolerance:
            entry.item_amount = round(computed, 2)
    return bill_items


# Words that mark a row as a total, and words that may accompany them in a
# pure total label such as "Grand Total" or "Total Amount Payable".
_TOTAL_WORDS = frozenset({"TOTAL", "TOTALS", "SUBTOTAL", "SUBTOTALS"})
_TOTAL_QUALIFIERS = frozenset({
    "GRAND", "SUB", "NET", "GROSS", "CATEGORY", "SECTION", "BILL", "INVOICE",
    "AMOUNT", "AMT", "PAYABLE", "PAID", "DUE", "FINAL", "OVERALL",
})


def _is_total_label(upper_name):
    """Return True when an upper-cased description is a total/subtotal label."""
    # Keep letters only, so "**CATEGORY TOTAL**" and "SUB-TOTAL:" split cleanly.
    words = "".join(ch if ch.isalpha() else " " for ch in upper_name).split()
    if not words:
        return False
    # "<section> TOTAL" style labels, e.g. "PHARMACY TOTAL", "GRAND TOTAL".
    if words[-1] in _TOTAL_WORDS:
        return True
    # Labels made only of total vocabulary, e.g. "TOTAL AMOUNT PAYABLE".
    has_total = any(word in _TOTAL_WORDS for word in words)
    return has_total and all(
        word in _TOTAL_WORDS or word in _TOTAL_QUALIFIERS for word in words
    )


def clean_items(bill_items):
    """Drop total rows and exact duplicates from a list of bill items.

    A row is dropped when its name is a total label ("Total", "Grand Total",
    "Pharmacy Total", "Total Amount", ...). A plain ``"TOTAL" in name`` test
    is deliberately not used: it would also discard genuine charges such as
    "Total Knee Replacement" or "Total Protein".

    Duplicates are rows with the same (upper-cased, stripped) name, amount,
    rate and quantity; only the first occurrence is kept.
    NOTE(review): a bill can legitimately repeat an identical charge (e.g. two
    identical consultations); confirm that dropping those is intended.

    Args:
        bill_items: Iterable of objects exposing ``item_name``,
            ``item_amount``, ``item_rate`` and ``item_quantity``.

    Returns:
        A new list with the surviving items in their original order.
    """
    seen = set()
    cleaned = []
    for item in bill_items:
        name = (item.item_name or "").upper().strip()
        key = (name, item.item_amount, item.item_rate, item.item_quantity)

        # drop total / subtotal label rows
        if _is_total_label(name):
            continue

        # drop exact duplicates
        if key in seen:
            continue
        seen.add(key)

        cleaned.append(item)
    return cleaned


In [55]:
doc = fitz.open(PDF_PATH)

In [56]:
print(len(doc))

3


In [57]:
doc

Document('train_sample_10.pdf')

In [21]:
pagewise = []
total_items = 0
total_input = total_output = total_tok = 0

In [58]:
PDF_PATH

'train_sample_10.pdf'

In [59]:
text = extract_text_digital(PDF_PATH, 2)

In [60]:
text

'Name : Mrs. GOPA DUTTA IP No : AMHLIP398580 Bill No : INT2043376\nPayer Payable\nSL. Service Service Name Alias Code Dr. Name Start Date End Date Qty/Duration RefTariff Dis( %)Amount(`)\nNo code ( Order No. ) ( in mins )\nConsultation(999311 )\nConsultation\n1 19631 IP CONSULTATION Dr. Soumen 1.00 1,000.00 0 1,000.00\nCHARGES Kar (\nORTHOPAEDIC\nS )\n2 19631 IP CONSULTATION Dr. Soumen 1.00 1,000.00 0 1,000.00\nCHARGES Kar (\nORTHOPAEDIC\nS )\n3 19631 IP CONSULTATION Dr. RANJAN 1.00 1,000.00 0 1,000.00\nCHARGES KAMILYA (\nORTHOPAEDIC\nS )\n4 19631 IP CONSULTATION Dr. Nilanchal 1.00 1,000.00 0 1,000.00\nCHARGES Chakraborty (\nCRITICAL\nCARE )\n5 19631 IP CONSULTATION Dr. Nilanchal 1.00 1,000.00 0 1,000.00\nCHARGES Chakraborty (\nCRITICAL\nCARE )\n6 19631 IP CONSULTATION Dr. RANJAN 1.00 1,000.00 0 1,000.00\nCHARGES KAMILYA (\nORTHOPAEDIC\nS )\n7 19631 IP CONSULTATION Dr. RANJAN 1.00 1,000.00 0 1,000.00\nCHARGES KAMILYA (\nORTHOPAEDIC\nS )\n8 19631 IP CONSULTATION Dr. Bikram Das 1.00 1,00

In [61]:
lst = []

In [32]:
# Process every page: pick digital text or Vision OCR, extract structured JSON,
# validate it, clean the items and accumulate the per-page results.
for idx in range(len(doc)):
    page = idx + 1
    text = extract_text_digital(PDF_PATH, idx)

    # More than 30 non-blank characters is taken as "the page has a text layer".
    if len(text.strip()) > 30:
        print(f"Page {page} → digital text detected")
        invoice_text = text
    else:
        print(f"Page {page} → scanned/handwritten detected, running Vision OCR")
        b64 = page_to_image_b64(PDF_PATH, idx, all_images=None)
        invoice_text = ocr_page_with_vision(b64)

    structured_json_str = extract_structured_from_text(invoice_text)
    print(f"Extracted JSON from Page {page}:\n", structured_json_str)

    try:
        # Parsing lives inside the try: the extractor returns the raw reply
        # when it is not valid JSON, and one bad page must not abort the run.
        structured_json = json.loads(structured_json_str)
        lst.append(structured_json)
        validated_output = validate_invoice_json(structured_json)
    except (json.JSONDecodeError, ValidationError, TypeError) as e:
        print(f"❌ Validation failed on page {page}:", e)
        print("Skipping this page due to bad format")
        continue

    pli = validated_output.data.pagewise_line_items

    if not pli:
        print(f"⚠️ Page {page}: no pagewise_line_items found, skipping this page")
        # This `continue` belongs to the `if`; one level further out it
        # skipped every page, valid or not.
        continue

    page_obj = pli[0]
    # The model numbers every page "1"; use the real page number instead.
    page_obj.page_no = page

    page_obj.bill_items = fix_inconsistent_amounts(page_obj.bill_items)
    page_obj.bill_items = clean_items(page_obj.bill_items)

    # Append once (the page used to be added twice).
    pagewise.append(page_obj)

    # Count the cleaned items rather than trusting the model's own tally.
    total_items += len(page_obj.bill_items)
    total_input += validated_output.data.token_usage.input_tokens
    total_output += validated_output.data.token_usage.output_tokens
    total_tok += validated_output.data.token_usage.total_tokens
    print(f"✅ Page {page} validated successfully")

Page 1 → scanned/handwritten detected, running Vision OCR
Extracted JSON from Page 1:
 {
  "is_success": true,
  "data": {
    "pagewise_line_items": [
      {
        "page_no": 1,
        "page_type": "Bill Detail",
        "bill_items": [
          {
            "item_name": "Consultation Charge | DR PREETHI MARY JOSEPH",
            "item_amount": 1950.0,
            "item_rate": 300.0,
            "item_quantity": 1.0
          },
          {
            "item_name": "Consultation Charge | DR VIDYA PREMACHANDRAN",
            "item_amount": 200.0,
            "item_rate": 200.0,
            "item_quantity": 1.0
          },
          {
            "item_name": "Doctors Visiting Fee | DR E SALL KUMAR",
            "item_amount": 1200.0,
            "item_rate": 300.0,
            "item_quantity": 4.0
          },
          {
            "item_name": "RR 2 - Room Rent Step Down Icu",
            "item_amount": 1000.0,
            "item_rate": 1000.0,
            "item_quantity": 1.0

In [77]:
# Reset the accumulators so this cell is safe to re-run.
pagewise = []
lst = []
total_input = total_output = total_tok = 0

# If you want images for OCR, precompute once (optional)
# all_images = convert_from_path(PDF_PATH, poppler_path=POPPLER_PATH)

for idx in range(len(doc)):   # or range(len(all_images)) depending on your source
    page = idx + 1
    text = extract_text_digital(PDF_PATH, idx)

    # More than 30 non-blank characters is taken as "the page has a text layer".
    if len(text.strip()) > 30:
        print(f"Page {page} → digital text detected")
        invoice_text = text
    else:
        print(f"Page {page} → scanned/handwritten detected, running Vision OCR")
        # If you have all_images precomputed, pass them here:
        # b64 = page_to_image_b64(PDF_PATH, idx, all_images=all_images)
        b64 = page_to_image_b64(PDF_PATH, idx, all_images=None)
        invoice_text = ocr_page_with_vision(b64)

    structured_json_str = extract_structured_from_text(invoice_text)
    print(f"Extracted JSON from Page {page}:\n", structured_json_str)

    try:
        # Parsing lives inside the try: the extractor returns the raw reply
        # when it is not valid JSON, and one bad page must not abort the run.
        structured_json = json.loads(structured_json_str)
        lst.append(structured_json)   # raw JSON if you want to inspect later
        validated_output = validate_invoice_json(structured_json)
    except (json.JSONDecodeError, ValidationError, TypeError) as e:
        print(f"❌ Validation failed on page {page}:", e)
        print("Skipping this page due to bad format")
        continue

    pli = validated_output.data.pagewise_line_items

    if not pli:
        print(f"⚠️ Page {page}: no pagewise_line_items found, skipping this page")
        continue

    # Take the first (and only) page entry for this page
    page_obj = pli[0]
    # The model numbers every page "1"; use the real page number instead.
    page_obj.page_no = page

    # Fix math and clean duplicates/totals
    page_obj.bill_items = fix_inconsistent_amounts(page_obj.bill_items)
    page_obj.bill_items = clean_items(page_obj.bill_items)

    # Store cleaned page result
    pagewise.append(page_obj)

    # Accumulate token usage from model (if you care about it)
    total_input += validated_output.data.token_usage.input_tokens
    total_output += validated_output.data.token_usage.output_tokens
    total_tok += validated_output.data.token_usage.total_tokens

    print(f"✅ Page {page} validated successfully")


Page 1 → digital text detected
Extracted JSON from Page 1:
 {"is_success": true, "data": {"pagewise_line_items": [{"page_no": "1", "page_type": "Bill Detail", "bill_items": [{"item_name": "Consultation(999311 )", "item_amount": 79750.0, "item_rate": 79750.0, "item_quantity": 1.0}, {"item_name": "Equipment(999311 )", "item_amount": 213250.0, "item_rate": 213250.0, "item_quantity": 1.0}, {"item_name": "Invasive Procedures(999311 )", "item_amount": 35720.0, "item_rate": 35720.0, "item_quantity": 1.0}, {"item_name": "Investigations(999311 )", "item_amount": 173010.0, "item_rate": 173010.0, "item_quantity": 1.0}, {"item_name": "Medical Records(999311 )", "item_amount": 600.0, "item_rate": 600.0, "item_quantity": 1.0}, {"item_name": "Non Invasive Procedure(999311 )", "item_amount": 141120.0, "item_rate": 141120.0, "item_quantity": 1.0}, {"item_name": "Nutritional and Functional Assessment Charges(999311 )", "item_amount": 600.0, "item_rate": 600.0, "item_quantity": 1.0}, {"item_name": "OT Ch

In [63]:
pagewise[1]

PageLineItems(page_no=2, page_type='Bill Detail', bill_items=[])

In [64]:
len(pagewise)

3

In [78]:
# Item count is derived from the cleaned pages, not from the model's own tally.
total_items = sum(map(lambda pg: len(pg.bill_items), pagewise))

# Assemble the response envelope in the shape FinalOutput expects.
final_json = dict(
    is_success=True,
    data=dict(
        pagewise_line_items=list(map(lambda pg: pg.model_dump(), pagewise)),
        total_item_count=total_items,
        token_usage=dict(
            total_tokens=total_tok,
            input_tokens=total_input,
            output_tokens=total_output,
        ),
    ),
)


In [79]:
# Validate the assembled dict against the FinalOutput schema
# (raises pydantic.ValidationError if it does not match).
final_validated = validate_invoice_json(final_json)
# Pretty-print the validated result as indented JSON.
print(json.dumps(final_validated.model_dump(), indent=2))


{
  "is_success": true,
  "data": {
    "pagewise_line_items": [
      {
        "page_no": 1,
        "page_type": "Bill Detail",
        "bill_items": [
          {
            "item_name": "Consultation(999311 )",
            "item_amount": 79750.0,
            "item_rate": 79750.0,
            "item_quantity": 1.0
          },
          {
            "item_name": "Equipment(999311 )",
            "item_amount": 213250.0,
            "item_rate": 213250.0,
            "item_quantity": 1.0
          },
          {
            "item_name": "Invasive Procedures(999311 )",
            "item_amount": 35720.0,
            "item_rate": 35720.0,
            "item_quantity": 1.0
          },
          {
            "item_name": "Investigations(999311 )",
            "item_amount": 173010.0,
            "item_rate": 173010.0,
            "item_quantity": 1.0
          },
          {
            "item_name": "Medical Records(999311 )",
            "item_amount": 600.0,
            "item_rate":

In [85]:
final_output = final_validated.model_dump()

In [84]:
print(final_validated.model_dump())

{'is_success': True, 'data': {'pagewise_line_items': [{'page_no': 1, 'page_type': 'Bill Detail', 'bill_items': [{'item_name': 'Consultation(999311 )', 'item_amount': 79750.0, 'item_rate': 79750.0, 'item_quantity': 1.0}, {'item_name': 'Equipment(999311 )', 'item_amount': 213250.0, 'item_rate': 213250.0, 'item_quantity': 1.0}, {'item_name': 'Invasive Procedures(999311 )', 'item_amount': 35720.0, 'item_rate': 35720.0, 'item_quantity': 1.0}, {'item_name': 'Investigations(999311 )', 'item_amount': 173010.0, 'item_rate': 173010.0, 'item_quantity': 1.0}, {'item_name': 'Medical Records(999311 )', 'item_amount': 600.0, 'item_rate': 600.0, 'item_quantity': 1.0}, {'item_name': 'Non Invasive Procedure(999311 )', 'item_amount': 141120.0, 'item_rate': 141120.0, 'item_quantity': 1.0}, {'item_name': 'Nutritional and Functional Assessment Charges(999311 )', 'item_amount': 600.0, 'item_rate': 600.0, 'item_quantity': 1.0}, {'item_name': 'OT Charges(999311 )', 'item_amount': 115000.0, 'item_rate': 115000.

In [86]:
type(final_output)

dict

In [47]:
pagewise

[PageLineItems(page_no='1', page_type='Bill Detail', bill_items=[BillItem(item_name='Sample Item', item_amount=100.0, item_rate=25.0, item_quantity=4.0)]),
 PageLineItems(page_no='1', page_type='Bill Detail', bill_items=[BillItem(item_name='Consultation Charge | DR PREETHI MARY JOSEPH--', item_amount=300.0, item_rate=300.0, item_quantity=1.0), BillItem(item_name='Consultation Charge | DR VIDYA PREMACHANDRAN--', item_amount=200.0, item_rate=200.0, item_quantity=1.0), BillItem(item_name='Doctors Visiting Fee | DR S SALIL KUMAR--', item_amount=250.0, item_rate=250.0, item_quantity=1.0), BillItem(item_name='Doctors Visiting Fee | DR S SALIL KUMAR--', item_amount=1200.0, item_rate=300.0, item_quantity=4.0)])]

In [41]:
page

3

In [35]:
pagewise

[PageLineItems(page_no='1', page_type='Bill Detail', bill_items=[BillItem(item_name='Sample Item', item_amount=100.0, item_rate=25.0, item_quantity=4.0)]),
 PageLineItems(page_no='1', page_type='Bill Detail', bill_items=[BillItem(item_name='Consultation Charge | DR PREETHI MARY JOSEPH--', item_amount=300.0, item_rate=300.0, item_quantity=1.0), BillItem(item_name='Consultation Charge | DR VIDYA PREMACHANDRAN--', item_amount=200.0, item_rate=200.0, item_quantity=1.0), BillItem(item_name='Doctors Visiting Fee | DR S SALIL KUMAR--', item_amount=250.0, item_rate=250.0, item_quantity=1.0), BillItem(item_name='Doctors Visiting Fee | DR S SALIL KUMAR--', item_amount=1200.0, item_rate=300.0, item_quantity=4.0)])]

In [36]:
len(pagewise)

2

In [50]:
validated_output

FinalOutput(is_success=False, data=InvoiceData(pagewise_line_items=[], total_item_count=0, token_usage=TokenUsage(total_tokens=0, input_tokens=0, output_tokens=0)))

In [56]:
len(lst)

3

In [72]:
lst

[{'is_success': True,
  'data': {'pagewise_line_items': [{'page_no': '1',
     'page_type': 'Bill Detail',
     'bill_items': [{'item_name': 'Consultation Charge - DR PREETHI MARY JOSEPH',
       'item_amount': 1950.0,
       'item_rate': 300.0,
       'item_quantity': 1.0},
      {'item_name': 'Consultation Charge - DR VIDYA PREMACHANDRAN',
       'item_amount': 200.0,
       'item_rate': 200.0,
       'item_quantity': 1.0},
      {'item_name': 'Doctors Visiting Fee - DR E SAIL KUMAR',
       'item_amount': 1200.0,
       'item_rate': 300.0,
       'item_quantity': 4.0},
      {'item_name': 'RR 2 -Room Rent Step Down Icu',
       'item_amount': 1000.0,
       'item_rate': 1000.0,
       'item_quantity': 1.0},
      {'item_name': 'Room Rent Bystander Room',
       'item_amount': 900.0,
       'item_rate': 900.0,
       'item_quantity': 1.0},
      {'item_name': 'SG201 -1. Room Rent Ward A',
       'item_amount': 900.0,
       'item_rate': 900.0,
       'item_quantity': 1.0},
      {'it

In [73]:
lst[0]

{'is_success': True,
 'data': {'pagewise_line_items': [{'page_no': '1',
    'page_type': 'Bill Detail',
    'bill_items': [{'item_name': 'Consultation Charge - DR PREETHI MARY JOSEPH',
      'item_amount': 1950.0,
      'item_rate': 300.0,
      'item_quantity': 1.0},
     {'item_name': 'Consultation Charge - DR VIDYA PREMACHANDRAN',
      'item_amount': 200.0,
      'item_rate': 200.0,
      'item_quantity': 1.0},
     {'item_name': 'Doctors Visiting Fee - DR E SAIL KUMAR',
      'item_amount': 1200.0,
      'item_rate': 300.0,
      'item_quantity': 4.0},
     {'item_name': 'RR 2 -Room Rent Step Down Icu',
      'item_amount': 1000.0,
      'item_rate': 1000.0,
      'item_quantity': 1.0},
     {'item_name': 'Room Rent Bystander Room',
      'item_amount': 900.0,
      'item_rate': 900.0,
      'item_quantity': 1.0},
     {'item_name': 'SG201 -1. Room Rent Ward A',
      'item_amount': 900.0,
      'item_rate': 900.0,
      'item_quantity': 1.0},
     {'item_name': 'SG204 -Nursing Ch

In [74]:
print(lst[0])

{'is_success': True, 'data': {'pagewise_line_items': [{'page_no': '1', 'page_type': 'Bill Detail', 'bill_items': [{'item_name': 'Consultation Charge - DR PREETHI MARY JOSEPH', 'item_amount': 1950.0, 'item_rate': 300.0, 'item_quantity': 1.0}, {'item_name': 'Consultation Charge - DR VIDYA PREMACHANDRAN', 'item_amount': 200.0, 'item_rate': 200.0, 'item_quantity': 1.0}, {'item_name': 'Doctors Visiting Fee - DR E SAIL KUMAR', 'item_amount': 1200.0, 'item_rate': 300.0, 'item_quantity': 4.0}, {'item_name': 'RR 2 -Room Rent Step Down Icu', 'item_amount': 1000.0, 'item_rate': 1000.0, 'item_quantity': 1.0}, {'item_name': 'Room Rent Bystander Room', 'item_amount': 900.0, 'item_rate': 900.0, 'item_quantity': 1.0}, {'item_name': 'SG201 -1. Room Rent Ward A', 'item_amount': 900.0, 'item_rate': 900.0, 'item_quantity': 1.0}, {'item_name': 'SG204 -Nursing Charge', 'item_amount': 1500.0, 'item_rate': 500.0, 'item_quantity': 3.0}, {'item_name': 'RR 2 - Stepdown Nursing Charge', 'item_amount': 1000.0, 'it

In [61]:
print(lst[1]) 

{'is_success': True, 'data': {'pagewise_line_items': [{'page_no': '1', 'page_type': 'Bill Detail', 'bill_items': [{'item_name': 'Livi 300mg Tab', 'item_amount': 448.0, 'item_rate': 32.0, 'item_quantity': 14.0}, {'item_name': 'Metunoro', 'item_amount': 124.03, 'item_rate': 17.72, 'item_quantity': 7.0}, {'item_name': 'Pizat 4.5', 'item_amount': 838.12, 'item_rate': 419.06, 'item_quantity': 2.0}, {'item_name': 'Supralite Os Syp', 'item_amount': 289.69, 'item_rate': 289.69, 'item_quantity': 1.0}]}], 'total_item_count': 4, 'token_usage': {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0}}}


In [59]:
lst[1]

{'is_success': True,
 'data': {'pagewise_line_items': [{'page_no': '1',
    'page_type': 'Bill Detail',
    'bill_items': [{'item_name': 'Livi 300mg Tab',
      'item_amount': 448.0,
      'item_rate': 32.0,
      'item_quantity': 14.0},
     {'item_name': 'Metunoro',
      'item_amount': 124.03,
      'item_rate': 17.72,
      'item_quantity': 7.0},
     {'item_name': 'Pizat 4.5',
      'item_amount': 838.12,
      'item_rate': 419.06,
      'item_quantity': 2.0},
     {'item_name': 'Supralite Os Syp',
      'item_amount': 289.69,
      'item_rate': 289.69,
      'item_quantity': 1.0}]}],
  'total_item_count': 4,
  'token_usage': {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0}}}

In [None]:
lst[0]

FinalOutput(is_success=False, data=InvoiceData(pagewise_line_items=[], total_item_count=0, token_usage=TokenUsage(total_tokens=0, input_tokens=0, output_tokens=0)))

In [67]:
lst[1]

{'is_success': True,
 'data': {'pagewise_line_items': [{'page_no': '1',
    'page_type': 'Bill Detail',
    'bill_items': [{'item_name': 'Room Charges',
      'item_amount': 12000.0,
      'item_rate': 3000.0,
      'item_quantity': 4.0},
     {'item_name': 'Surgery Charges',
      'item_amount': 50000.0,
      'item_rate': None,
      'item_quantity': 1.0},
     {'item_name': 'Anesthesia Charges',
      'item_amount': 15000.0,
      'item_rate': None,
      'item_quantity': 1.0}]}],
  'total_item_count': 3,
  'token_usage': {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0}}}

In [66]:
lst[2]

{'is_success': True,
 'data': {'pagewise_line_items': [{'page_no': '1',
    'page_type': 'Bill Detail',
    'bill_items': [{'item_name': 'Painkillers',
      'item_amount': 2000.0,
      'item_rate': 200.0,
      'item_quantity': 10.0},
     {'item_name': 'Antibiotics',
      'item_amount': 1500.0,
      'item_rate': 300.0,
      'item_quantity': 5.0},
     {'item_name': 'Blood Test',
      'item_amount': 1000.0,
      'item_rate': 1000.0,
      'item_quantity': 1.0},
     {'item_name': 'X-Ray',
      'item_amount': 2000.0,
      'item_rate': 2000.0,
      'item_quantity': 1.0},
     {'item_name': 'Nursing Service',
      'item_amount': 2000.0,
      'item_rate': 500.0,
      'item_quantity': 4.0},
     {'item_name': 'Equipment Usage',
      'item_amount': 3000.0,
      'item_rate': 3000.0,
      'item_quantity': 1.0}]}],
  'total_item_count': 6,
  'token_usage': {'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0}}}