In [51]:
import json  # Import json module for parsing the response
import os
import re

import openai
import pandas as pd
import pytesseract
from pdf2image import convert_from_path

In [52]:
# Set up OpenAI API key
# The key is read from the OPENAI_API_KEY environment variable so that the
# secret is never stored in the notebook or its version history.
# NOTE(review): a key was previously committed in this cell; it should be
# revoked and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY environment variable is not set; "
        "export it before running this notebook."
    )

In [53]:
# Step 1: Extract Text from Scanned PDF using OCR
def extract_text_from_pdf(pdf_path):
    """Return the OCR text of every page in the PDF at ``pdf_path``.

    The PDF is converted to one image per page, each image is passed through
    ``pytesseract.image_to_string``, and the per-page results are concatenated
    in page order with a newline appended after each page.
    """
    page_images = convert_from_path(pdf_path)
    # One "<page text>\n" fragment per page, joined in a single pass.
    page_texts = (pytesseract.image_to_string(page) + "\n" for page in page_images)
    return "".join(page_texts)

In [54]:
def parse_individual_json_objects(content):
    """Extract every top-level JSON object found in an LLM reply.

    Markdown code-fence markers are stripped first. The text is then scanned
    for ``{`` characters and a real JSON decoder is used to read one complete
    object starting at each of them, so objects that contain nested objects
    or braces inside string values are parsed correctly (a ``\\{.*?\\}`` regex
    cuts those short at the first ``}``).

    Args:
        content: Raw text returned by the model; may be a JSON array, several
            objects separated by other text, or be wrapped in a ```json fence.

    Returns:
        A list of dicts, one per successfully decoded object, in the order
        they appear. Malformed or truncated objects are reported with a
        printed message and skipped; an input with no objects yields ``[]``.
    """
    # Remove markdown artifacts and whitespace
    content = re.sub(r"^```json|```$", "", content, flags=re.MULTILINE).strip()

    decoder = json.JSONDecoder()
    parsed_items = []
    pos = content.find("{")
    while pos != -1:
        try:
            obj, end = decoder.raw_decode(content, pos)
        except json.JSONDecodeError as err:
            print("Failed to decode JSON object:", content[pos:pos + 80])
            # Resume after the point where decoding broke so that fragments
            # nested inside the malformed object are not picked up as items.
            pos = content.find("{", max(pos + 1, err.pos))
            continue
        parsed_items.append(obj)
        pos = content.find("{", end)

    return parsed_items

def _split_text_on_line_boundaries(text, chunk_size):
    """Split ``text`` into pieces of at most ``chunk_size`` chars, keeping lines whole."""
    chunks = []
    current = ""
    for line in text.splitlines(keepends=True):
        # A single line longer than the limit cannot be kept whole; hard-split it.
        while len(line) > chunk_size:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(line[:chunk_size])
            line = line[chunk_size:]
        if len(current) + len(line) > chunk_size:
            chunks.append(current)
            current = ""
        current += line
    if current:
        chunks.append(current)
    return chunks


def parse_and_classify_items_with_llm(text, chunk_size=1000, max_tokens=500):
    """Extract and classify bill items from OCR text with the chat model.

    The text is split into chunks of at most ``chunk_size`` characters. Chunks
    end on line boundaries so that a bill line (name, quantity, price) is not
    cut in half between two prompts. Each non-blank chunk is sent to the model
    and the JSON objects in its reply are collected.

    Args:
        text: Bill text to analyse.
        chunk_size: Maximum number of characters per prompt chunk; must be
            positive.
        max_tokens: Completion token limit passed to the API. Raise it if
            replies are being cut off before the JSON is complete.

    Returns:
        A list of item dicts gathered from all chunks, in chunk order. A chunk
        whose request or parsing fails is reported with a printed message and
        contributes no items.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    # Split text into manageable chunks; whitespace-only chunks carry no items
    # and would only waste an API call.
    text_chunks = [
        chunk
        for chunk in _split_text_on_line_boundaries(text, chunk_size)
        if chunk.strip()
    ]
    all_items = []

    for index, chunk in enumerate(text_chunks, start=1):
        prompt = (
            "Extract item details (name, quantity, price) and classify each item as "
            "medical or non-medical from the following bill text:\n"
            f"{chunk}\n"
            "Provide output in JSON format as an array of objects, with fields: name, quantity, price, category (medical/non-medical). "
            "Do not use additional formatting or text."
        )
        try:
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that extracts and classifies items from text."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=max_tokens,
                temperature=0.2
            )

            # Extract and parse individual JSON objects from the content
            content = response['choices'][0]['message']['content'].strip()
            all_items.extend(parse_individual_json_objects(content))

        except Exception as e:
            # Deliberately best-effort: one failed chunk (API error or bad
            # reply) must not discard the items from the other chunks.
            print(f"Error processing chunk {index}/{len(text_chunks)}: {e}")

    return all_items


In [55]:
# Step 3: Structure the Data
def create_dataframe(items):
    """Build a pandas DataFrame from ``items`` by passing it to ``pd.DataFrame``."""
    return pd.DataFrame(items)

In [56]:
# Path of the scanned bill to process (specify the actual PDF file path)
pdf_path = "bill_of_items.pdf"

# Step 1: run OCR over the PDF and show the raw text that was recognised
text = extract_text_from_pdf(pdf_path)
print(f"Extracted Text: {text}")

Extracted Text: Final Bill

Name: Mrs. BHUVANESHWARI V ai € y
Age/Gender: 27 Y F & Child
Address: NO,23, 2NDMAIN, 2ND STAGE, WOC ROAD, MAHALAKSHMIPURAM POST MR No: fren ‘¥

Location: BANGALORE,KARNATAKA Visit ID;

Doctor: Dr. BHARATHI RAJANNA Admission Date: seetl

Department: Obstetrics & Gynaecology Ward/Bed DEL

Rate Plan: Cradle Rajaji Nagar Gen_25012023 Discharge Date:

Sponsor: Referred By: i} 08 | ATH] RAJANNA.

Bill No: RRAJ-ICR-822(Bill Later) Bill Date:08-06-2024

Charges Ord# Head Description Rate Qty Amount

Packages

08-06-2024 Package Charges Lscs 106,000.00 i 106,000.00
Sub Total: 106,000.00

Diagnostics

08-06-2024 16166221 Lab Tests HCV Tri Dot 495,00 1,00 495.00

08-06-2024 16166221 Lab Tests HIV RAPID 1,731.00 1,00 1,731.00

08-06-2024 16166221 Lab Tests (VDRL) RPR QUALITATIVE- SERUM 440.00 1.00 440.00

08-06-2024 16166221 Lab Tests HBS AG SCREENING(RAPID) 1,327.00 1,00 1,327.00
Sub Total: 3,993.00

Services & Procedures

11-06-2024 Service LACTATION CONSULTATION CHA

In [57]:
# Step 2: send the OCR text to the LLM to extract and classify the bill items
items = parse_and_classify_items_with_llm(text)
print(f"Extracted and Classified Items: {items}")

Extracted and Classified Items: [{'name': 'Lscs', 'quantity': 1, 'price': 106000.0, 'category': 'medical'}, {'name': 'HCV Tri Dot', 'quantity': 1, 'price': 495.0, 'category': 'medical'}, {'name': 'HIV RAPID', 'quantity': 1, 'price': 1731.0, 'category': 'medical'}, {'name': '(VDRL) RPR QUALITATIVE- SERUM', 'quantity': 1, 'price': 440.0, 'category': 'medical'}, {'name': 'HBS AG SCREENING(RAPID)', 'quantity': 1, 'price': 1327.0, 'category': 'medical'}, {'name': 'LACTATION CONSULTATION CHARGES', 'quantity': 1, 'price': 1100.0, 'category': 'medical'}, {'name': 'PHYSIOTHERAPY CONSULTATION', 'quantity': 1.0, 'price': 1100.0, 'category': 'medical'}, {'name': 'DIET CHARGES', 'quantity': 3.0, 'price': 3000.0, 'category': 'non-medical'}, {'name': 'DOCUMENT PROCESSING FEE', 'quantity': 1.0, 'price': 1000.0, 'category': 'non-medical'}, {'name': 'DIET COUNCELING CHARGE', 'quantity': 1.0, 'price': 1100.0, 'category': 'non-medical'}, {'name': 'INFECTION PREVENTION IN IPD', 'quantity': 3.0, 'price': 60

In [58]:
# Step 3: turn the classified items into a DataFrame for structured data
df = create_dataframe(items)
print(f"Structured Data: {df}")

# Persist the table as CSV (without the index column)
df.to_csv("classified_items.csv", index=False)

Structured Data:                                                  name  quantity     price  \
0                                                Lscs       1.0  106000.0   
1                                         HCV Tri Dot       1.0     495.0   
2                                           HIV RAPID       1.0    1731.0   
3                       (VDRL) RPR QUALITATIVE- SERUM       1.0     440.0   
4                             HBS AG SCREENING(RAPID)       1.0    1327.0   
5                      LACTATION CONSULTATION CHARGES       1.0    1100.0   
6                          PHYSIOTHERAPY CONSULTATION       1.0    1100.0   
7                                        DIET CHARGES       3.0    3000.0   
8                             DOCUMENT PROCESSING FEE       1.0    1000.0   
9                              DIET COUNCELING CHARGE       1.0    1100.0   
10                        INFECTION PREVENTION IN IPD       3.0     600.0   
11                                    MEDICAL RECORDS      