In [28]:
import os
from google.cloud import vision
import io
import fitz  # PyMuPDF
from PIL import Image

In [31]:
# --- CONFIGURATION ---
# Path to the Google Cloud credentials JSON file; a later cell exports it via
# the GOOGLE_APPLICATION_CREDENTIALS environment variable.
# NOTE(review): all three paths are absolute and specific to one machine; they
# must be edited before this notebook can run anywhere else.
CREDENTIALS_FILE = '/Users/ashishrathore/Aarogya-AI/crack-decorator-468911-s1-5ab46e3aea4b.json'
# We will test two files: one PDF report and one JPEG image.
PDF_PATH = '/Users/ashishrathore/Aarogya-AI/data/raw_reports/ilide.info-mr-arvind-yadavx27s-lipid-profile-liver-panel-and-kidney-panel-test-result-pr_4546c09626534389bb97fca7eea8e907.pdf'
IMAGE_PATH = '/Users/ashishrathore/Aarogya-AI/data/raw_reports/dab0ecaa3094e050eefc8ec711bc4b42.jpg' 

In [34]:
# Export the credentials file path through the environment variable that the
# Google client libraries read, then build the Vision client that
# extract_text_from_file uses as a module-level global.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=CREDENTIALS_FILE)
client = vision.ImageAnnotatorClient()

In [37]:
# --- OCR HELPER ---
def _ocr_image_bytes(image_bytes):
    """Run Google Vision document text detection on one encoded image.

    Uses the module-level ``client`` created in an earlier cell.

    Args:
        image_bytes: Raw bytes of an encoded image (the callers pass PNG bytes
            rendered from a PDF page, or the raw contents of an image file).

    Returns:
        The ``text`` of the response's full-text annotation.

    Raises:
        RuntimeError: If the Vision response carries a non-empty error message.
    """
    image = vision.Image(content=image_bytes)
    response = client.document_text_detection(image=image)

    if response.error.message:
        raise RuntimeError(response.error.message)

    return response.full_text_annotation.text


# --- THE MASTER FUNCTION ---
def extract_text_from_file(file_path):
    """
    Takes a file path (PDF or Image), extracts text using Google Vision API.

    PDFs are rendered page by page to PNG at 300 DPI and each page is sent to
    the Vision API separately; the per-page texts are joined with a
    "--- PAGE BREAK ---" separator. Image files (.jpg, .jpeg, .png) are sent
    as-is. The file type is decided from the (case-insensitive) extension.

    Progress messages are printed to stdout.

    Args:
        file_path: Path to the PDF or image file to process.

    Returns:
        The extracted text on success. Failures are NOT raised: the function
        returns a string starting with "Error processing PDF: " or
        "Error processing Image: ", or "Unsupported file type: <ext>" when the
        extension is not recognised. Callers must inspect the returned string
        to tell errors apart from real text.
    """
    print(f"\n--- Processing file: {file_path} ---")

    file_extension = os.path.splitext(file_path)[1].lower()

    # CASE 1: FILE IS A PDF
    if file_extension == '.pdf':
        try:
            # The context manager guarantees the document is closed even when
            # rendering or OCR of some page fails part-way through the loop.
            with fitz.open(file_path) as doc:
                page_count = len(doc)
                all_text = []
                print(f"PDF detected with {page_count} pages. Converting pages to images...")

                for page_num in range(page_count):
                    page = doc.load_page(page_num)
                    pix = page.get_pixmap(dpi=300)  # Higher DPI for better quality
                    all_text.append(_ocr_image_bytes(pix.tobytes("png")))
                    print(f"Page {page_num + 1} processed.")

            return "\n\n--- PAGE BREAK ---\n\n".join(all_text)

        except Exception as e:
            # Deliberate best-effort: report the failure as text, don't crash the notebook.
            return f"Error processing PDF: {e}"

    # CASE 2: FILE IS AN IMAGE
    elif file_extension in ('.jpg', '.jpeg', '.png'):
        try:
            print("Image file detected.")
            with open(file_path, 'rb') as image_file:
                content = image_file.read()

            return _ocr_image_bytes(content)

        except Exception as e:
            return f"Error processing Image: {e}"

    # CASE 3: UNSUPPORTED FILE
    return f"Unsupported file type: {file_extension}"

In [41]:
# --- LET'S TEST OUR MASTER FUNCTION ---

# Test 1: run the PDF report through the extractor and show the result
# between a banner line and a closing rule.
pdf_extracted_text = extract_text_from_file(PDF_PATH)
print(
    "\n\n--- ✅ FINAL EXTRACTED TEXT FROM PDF ---",
    pdf_extracted_text,
    "--------------------------------------",
    sep="\n",
)



--- Processing file: /Users/ashishrathore/Aarogya-AI/data/raw_reports/ilide.info-mr-arvind-yadavx27s-lipid-profile-liver-panel-and-kidney-panel-test-result-pr_4546c09626534389bb97fca7eea8e907.pdf ---
PDF detected with 7 pages. Converting pages to images...
Page 1 processed.
Page 2 processed.
Page 3 processed.
Page 4 processed.
Page 5 processed.
Page 6 processed.
Page 7 processed.


--- ✅ FINAL EXTRACTED TEXT FROM PDF ---
Dr Lal PathLabs
S61-LPL-HANUMAN ROAD (MAIN LAB)
ESKAY HOUSE, 54, HANUMAN ROAD, NEW
DELHI-110001
DELHI
(Hony) Brig. Dr. Arvind Lal
M.B.B.S., D.C.P.
Paduna Sheri
FMR HONORARY PHYSICIAN TO THE PRESIDENT OF INDIA,
National Reference Lab: Sector-18, Block-E, Rohini, New Delhi - 110 085
Main Lab: 54, Eskay House, Hanuman Road, New Delhi - 110 001
Tel: 011-3040-3210, Fax: 011 - 3040-3204
E-mail: lalpathlabs@lalpathlabs.com Web: www.lalpathlabs.com
NAHL ACCREDITED
M0061
cap
ACCREDITED
BSI
1509001:2008
FS 60411
Vandana lal
Dr. Vandana Lal
M.D (PATH), IFCAP
Chief of Pathology
S

In [43]:
# Test 2: run the JPEG report through the extractor and show the result
# between a banner line and a closing rule.
image_extracted_text = extract_text_from_file(IMAGE_PATH)
print(
    "\n\n--- ✅ FINAL EXTRACTED TEXT FROM IMAGE ---",
    image_extracted_text,
    "---------------------------------------",
    sep="\n",
)


--- Processing file: /Users/ashishrathore/Aarogya-AI/data/raw_reports/dab0ecaa3094e050eefc8ec711bc4b42.jpg ---
Image file detected.


--- ✅ FINAL EXTRACTED TEXT FROM IMAGE ---
DRLOGY PATHOLOGY LAB
Accurate Caring | Instant
0123456789 | 0912345678
drlogypathlab@drlogy.com
105-108, SMART VISION COMPLEX, HEALTHCARE ROAD, OPPOSITE HEALTHCARE COMPLEX. MUMBAI - 689578
Yashvi M. Patel
Age: 21 Years
Sex: Female
UHID: 556
Investigation
Sample Type
AST (SGOT)
IFCC without P5P
Sample Collected At:
125, Shiv complex, S G Road, Mumbai
Sample Collected By: Mr Suresh
Ref. By: Dr. Hiren Shah
LIVER FUNCTION TEST (LFT)
Result
Serum (2 ml)
45.00
High
Reference Value
www.drlogy.com
035545 62336 78 1
Registered on: 02:31 PM 02 Dec, 2X
Collected on: 03:11 PM 02 Dec, 2X
Reported on: 04:35 PM 02 Dec, 2X
TAT: 1 day (Normal: 1-3 days)
15.00 -40.00
Unit
U/L
ALT (SGPT)
55.00
High
10.00-49.00
U/L
IFCC without P5P
AST ALT Ratio
2.50
High
< 1.00
Calculated
GGTP
75.00
High
0.00-73.00
U/L
IFCC
Alkaline Phosphatase (A