In [6]:
# OCR Extraction Demo
# - Loads images (multiple)
# - Preprocesses images (utils/preprocess.py)
# - Runs EasyOCR and extracts text
# - Uses utils/extract_fields, layout_match, dedup for field extraction & checks
# - Stores the extracted outputs as a pickle at `data/ocr_output.pkl`

In [15]:
# ------------------------------
# 📌 1. IMPORTS
# ------------------------------
# Standard library and third-party packages.
import os
import cv2
import pickle
import numpy as np

# OCR pipeline helpers (presumably project-local modules under models/ and utils/).
from models.ocr import extract_text_from_image
from utils.extract_fields import extract_all_fields
from utils.preprocess import preprocess_image

# NOTE(review): cv2, np and preprocess_image are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.

# Only reached when every import above succeeded.
print("OCR modules loaded successfully.")

OCR modules loaded successfully.


In [16]:
# ------------------------------
# 📌 2. PATHS
# ------------------------------
# Every location hangs off the parent of the current working directory,
# so the notebook is expected to run from a direct child of backend/.
BASE = os.path.abspath(os.pardir)            # backend/
DATA_DIR = os.path.join(BASE, "data")
BILLS_DIR = os.path.join(DATA_DIR, "bills")  # input bill images
OUTPUT_PATH = os.path.join(DATA_DIR, "ocr_output.pkl")

# Echo the resolved locations so a wrong working directory is easy to spot.
for label, location in (
    ("BASE:", BASE),
    ("DATA_DIR:", DATA_DIR),
    ("BILLS_DIR:", BILLS_DIR),
    ("Saving to:", OUTPUT_PATH),
):
    print(label, location)

# Create the data folder up front; a no-op when it already exists.
os.makedirs(DATA_DIR, exist_ok=True)

BASE: D:\Desktop\insurance-claim-checker\backend
DATA_DIR: D:\Desktop\insurance-claim-checker\backend\data
BILLS_DIR: D:\Desktop\insurance-claim-checker\backend\data\bills
Saving to: D:\Desktop\insurance-claim-checker\backend\data\ocr_output.pkl


In [10]:
# NOTE(review): leftover cell from an earlier session (its execution count is
# out of order). `results` is not defined by any visible cell, so on a fresh
# kernel an unguarded dump raises NameError and breaks "Restart & Run All".
# The guard keeps the old behaviour when `results` exists and skips otherwise.
save_path = "../data/ocr_output.pkl"

if "results" in globals():
    with open(save_path, "wb") as f:
        pickle.dump(results, f)
    print("Saved:", save_path)
else:
    print("Skipped: `results` is not defined in this kernel; nothing written to", save_path)


Saved: ../data/ocr_output.pkl


In [17]:
# ------------------------------
# 📌 3. LOAD IMAGES
# ------------------------------
# File extensions (compared case-insensitively) that count as bill images.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# os.listdir returns entries in arbitrary order, so sort them to make the
# processing order (and the printed list) reproducible between runs.
# Directories whose names happen to end in an image extension are skipped,
# because the OCR cell opens every entry as a file.
bill_files = sorted(
    name
    for name in os.listdir(BILLS_DIR)
    if name.lower().endswith(IMAGE_EXTENSIONS)
    and os.path.isfile(os.path.join(BILLS_DIR, name))
)

# FileNotFoundError is still an Exception, so existing handlers keep working;
# the message now includes the folder that was actually searched.
if not bill_files:
    raise FileNotFoundError(
        f"❌ No images found in data/bills folder! (searched: {BILLS_DIR})"
    )

print("Found files:", bill_files)

Found files: ['Fraud.jpeg', 'Fraud1.jpeg']


In [20]:
# ------------------------------
# 📌 4. RUN OCR ON EACH IMAGE
# ------------------------------
# Result store: file name -> {"clean_text": <OCR text>, "fields": <extracted fields>}
ocr_data = {}

for filename in bill_files:
    print("Processing:", filename)

    # The OCR helper is handed the file's raw bytes, so read it in binary mode.
    full_path = os.path.join(BILLS_DIR, filename)
    with open(full_path, "rb") as f:
        image_bytes = f.read()

    # OCR first, then field extraction on the recognised text.
    text = extract_text_from_image(image_bytes)
    fields = extract_all_fields(text)

    ocr_data[filename] = dict(clean_text=text, fields=fields)

print("\nOCR Completed for", len(ocr_data), "files.")


Processing: Fraud.jpeg
Processing: Fraud1.jpeg

OCR Completed for 2 files.


In [22]:
# ------------------------------
# 📌 5. SAVE OCR OUTPUT
# ------------------------------
# `os` and `pickle` are already imported in the imports cell at the top.
#
# The pickle is written to <project root>/data, one level above backend/.
# The root is derived from the working directory instead of a hard-coded
# machine-specific absolute path, so the notebook runs on other checkouts.
# NOTE(review): this rebinds BASE / DATA_DIR / OUTPUT_PATH, which the PATHS
# cell pointed at backend/data — confirm which location consumers read from.
BASE = os.path.dirname(os.path.abspath(".."))
DATA_DIR = os.path.join(BASE, "data")
OUTPUT_PATH = os.path.join(DATA_DIR, "ocr_output.pkl")

os.makedirs(DATA_DIR, exist_ok=True)

# Persist the per-file OCR results gathered in `ocr_data`.
with open(OUTPUT_PATH, "wb") as f:
    pickle.dump(ocr_data, f)

print("OCR saved to:", OUTPUT_PATH)


OCR saved to: D:\Desktop\insurance-claim-checker\data\ocr_output.pkl
