In [3]:
import os
import re
import json
import time
import pandas as pd
import PIL.Image
import google.generativeai as genai
from tqdm import tqdm  # progress bar

GEMINI_API_KEY = "AIzaSyCB6i38pbDWstBa8X6q26_GPMkYvkIH1UU"
genai.configure(api_key=GEMINI_API_KEY)

def load_image(image_path):
    return PIL.Image.open(image_path)

plain_text_prompt = """
You are an OCR and information extraction engine.
From the given image, extract and organize the information into these fields:
- Patient Name
- Diagnosis
- Test Results
- Date of Examination
- Doctor’s Name

If any field is missing in the image, just write "Not Found" for that field.
Return the output in clean JSON format.
"""

def extract_plain_text_with_retry(image_path, retries=3, delay=5):
    model = genai.GenerativeModel(model_name="gemini-1.5-pro")
    image = load_image(image_path)

    for attempt in range(retries):
        try:
            response = model.generate_content([plain_text_prompt, image])
            return response.text.strip()
        except Exception as e:
            if "429" in str(e) and attempt < retries - 1:
                print(f"⚠️ Rate limit hit. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print(f"❌ Failed after {attempt+1} attempts: {e}")
                return None

if __name__ == "__main__":
    folder_path = "/content/drive/MyDrive/archive/data/"
    all_data = []

    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

    for filename in tqdm(image_files, desc="Processing Images"):  # Progress bar
        image_path = os.path.join(folder_path, filename)
        extracted_text = extract_plain_text_with_retry(image_path)

        if extracted_text:
            match = re.search(r'\{.*\}', extracted_text, re.DOTALL)
            if match:
                extracted_json = match.group(0)
                try:
                    data = json.loads(extracted_json)
                    data['Filename'] = filename
                    all_data.append(data)
                except json.JSONDecodeError:
                    print(f"⚠️ Failed to decode JSON for {filename}")
            else:
                print(f"⚠️ No JSON found in the extracted text for {filename}")
        else:
            print(f"❌ Extraction failed for {filename}")

    df = pd.DataFrame(all_data)

    if not df.empty:
        columns = ['Filename'] + [col for col in df.columns if col != 'Filename']
        df = df[columns]

    print("✅ Final Extracted DataFrame:")
    print(df)

    output_excel_path = "/content/drive/MyDrive/archive/extracted_data.xlsx"
    df.to_excel(output_excel_path, index=False)
    print(f"✅ Data saved to {output_excel_path}")


Processing Images:   8%|▊         | 1/12 [00:02<00:30,  2.78s/it]

❌ Failed after 1 attempts: 400 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: Request contains an invalid argument.
❌ Extraction failed for 1.jpg




⚠️ Rate limit hit. Retrying in 5 seconds...




⚠️ Rate limit hit. Retrying in 5 seconds...


Processing Images:  42%|████▏     | 5/12 [00:28<00:54,  7.79s/it]

❌ Failed after 3 attempts: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
❌ Extraction failed for Copy of 11.jpg




⚠️ Rate limit hit. Retrying in 5 seconds...




⚠️ Rate limit hit. Retrying in 5 seconds...


Processing Images:  50%|█████     | 6/12 [00:51<01:18, 13.04s/it]

❌ Failed after 3 attempts: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
❌ Extraction failed for Copy of 8.jpg




⚠️ Rate limit hit. Retrying in 5 seconds...




⚠️ Rate limit hit. Retrying in 5 seconds...




⚠️ Rate limit hit. Retrying in 5 seconds...




⚠️ Rate limit hit. Retrying in 5 seconds...


Processing Images: 100%|██████████| 12/12 [01:33<00:00,  7.81s/it]

❌ Failed after 3 attempts: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-pro:generateContent?%24alt=json%3Benum-encoding%3Dint: You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
❌ Extraction failed for Copy of 12.jpg
✅ Final Extracted DataFrame:
         Filename     Patient Name  \
0           2.jpg       M. PRATHNA   
1           3.jpg     C.B. Forward   
2           9.jpg  Joseph McIntyre   
3  Copy of 10.jpg   Mr. CH. SAMUEL   
4   Copy of 9.jpg  Joseph McIntyre   
5  Copy of 15.jpg      Todd Blazys   
6  Copy of 14.jpg       AJAY SETHI   
7  Copy of 13.jpg   John Beckworth   

                                           Diagnosis Test Results  \
0                            Acute GR\nE dehydration    Not Found   
1                                          Not Found    Not Found   
2                                          Not Foun


