In [None]:
# tasks.py (Celery)
from celery import Celery
import cv2
import pytesseract
from pdf2image import convert_from_path
import re
from datetime import datetime

def extract_match(pattern, text):
    """Return capture group 1 of the first case-insensitive match, or None.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: String to search.

    Returns:
        The text captured by group 1 of the first match, or ``None`` when
        the pattern does not match anywhere in ``text``.
    """
    match = re.search(pattern, text, re.IGNORECASE)
    return match.group(1) if match else None


def extract_transaction_data(text):
    """Pull the amount, date and reference number out of OCR'd receipt text.

    Each field is located by a label (e.g. ``Total Amount``) followed, within
    a bounded number of arbitrary characters, by a value of the expected
    shape. Pipe characters are replaced with spaces before matching.

    Args:
        text: Raw OCR text of the receipt.

    Returns:
        dict with keys ``"amount"``, ``"date"`` and ``"reference"``; each
        value is the matched string or ``None`` when the field is not found.
    """
    text = text.replace('|', ' ')

    ref_pattern = r'(?:Reference\s*Code|Transaction\s*Number)[\s\S]{0,50}?(\d{6,})'

    # The gap after the label is lazy, so without an anchor the number could
    # start in the middle of a digit run: "6500.00" used to be captured as
    # "500.00". The two lookbehinds force the capture to begin at the first
    # digit of the number, and the body accepts ungrouped digits as well as
    # any comma grouping (e.g. "6,500.00" or "1,00,000.00").
    amount_pattern = (
        r'(?:Total\s*Amount|Txn\s*Amount)[\s\S]{0,100}?'
        r'(?<!\d)(?<!\d,)(\d+(?:,\d+)*\.\d{2})'
    )

    date_pattern = r'(?:Payment\s*Time|Date\s*/\s*Time)[\s\S]{0,50}?(\d{1,2}\s+[A-Za-z]{3}\s+\d{4},?\s+[\s\S]{0,10}?\d{1,2}:\d{2}\s*(?:AM|PM))'

    return {
        "amount": extract_match(amount_pattern, text),
        "date": extract_match(date_pattern, text),
        "reference": extract_match(ref_pattern, text),
    }


# app = Celery('tasks', broker='redis://redis:6379/0')

# @app.task
def process_document(file_path, file_type):
    """Run OCR on a receipt file and extract its transaction fields.

    Args:
        file_path: Path to the PDF or image file.
        file_type: ``'pdf'`` to rasterise and read the first page of a PDF;
            any other value loads the file as an image with ``cv2.imread``.

    Returns:
        The dict produced by ``extract_transaction_data`` for the OCR text.

    Raises:
        ValueError: If the PDF yields no pages or the image cannot be read.
    """
    if file_type == 'pdf':
        # Only the first page is processed, so only that page is rendered.
        pages = convert_from_path(file_path, first_page=1, last_page=1)
        if not pages:
            raise ValueError(f"No pages could be rendered from PDF: {file_path}")
        # pdf2image returns PIL images, which cv2.cvtColor does not accept;
        # use PIL's own grayscale conversion (pytesseract takes PIL images).
        gray = pages[0].convert('L')
    else:
        image = cv2.imread(str(file_path))
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise ValueError(f"Could not read image file: {file_path}")
        # Preprocess image (improves OCR accuracy)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Run OCR
    text = pytesseract.image_to_string(gray)

    print("Extracted Text:\n", text)

    # Extract fields
    return extract_transaction_data(text)


In [62]:
from pathlib import Path
from app.services.ocr_service import OCRService

# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'app'", i.e. the import above did not
# resolve in the kernel that produced it. Presumably the notebook must be run
# with the project root on sys.path — confirm.

# Folder with the sample receipt images, relative to the kernel's working
# directory.
receipts_dir = Path.cwd() / "receipts"

# file_path = receipts_dir / "Screenshot_20251230-163701 - Rajendra Bhandari.jpg"
file_path = receipts_dir / "WhatsApp Image 2025-12-30 at 06.56.49.jpeg"

# OCRService is defined outside this notebook; judging by the method name the
# call returns the parsed transaction fields for the image — TODO confirm.
data = OCRService.extract_transaction_data(str(file_path))

print(data)

ModuleNotFoundError: No module named 'app'

In [None]:
import re
import os
import pytesseract
import cv2
from pytesseract import Output
from pathlib import Path


def normalize_row_text(text):
    """Collapse an OCR row into a single, single-spaced line.

    Carriage returns are dropped outright; every other run of whitespace
    (including newlines) becomes one space, and leading/trailing whitespace
    is removed.
    """
    without_cr = text.replace("\r", "")
    # str.split() with no argument splits on any whitespace run, newlines
    # included, and discards empty pieces at both ends.
    words = without_cr.split()
    return " ".join(words)


def extract_match(pattern, text):
    """Return capture group 1 of the first case-insensitive match, or None.

    Args:
        pattern: Regular expression containing at least one capture group.
        text: String to search.

    Returns:
        The text captured by group 1 of the first match, or ``None`` when
        the pattern does not match anywhere in ``text``.
    """
    match = re.search(pattern, text, re.IGNORECASE)
    return match.group(1) if match else None


def extract_transaction_data(text):
    """Extract amount, charge, date and reference from one piece of OCR text.

    Each field is found by a label immediately followed by its value. The
    date is first looked up after a date label; if that fails, any
    date-and-time shaped token in the text is accepted. Pipe characters are
    replaced with spaces before matching.

    Args:
        text: OCR text, typically a single normalised receipt row.

    Returns:
        dict with keys ``"amount"``, ``"charge"``, ``"date"`` and
        ``"reference"``; each value is the matched string or ``None``.
    """
    text = text.replace("|", " ")

    # A number must start with a digit and may contain commas only between
    # digit groups, so a lone "," after a label (or a trailing comma) is
    # never returned as a value.
    number = r"(\d+(?:,\d+)*(?:\.\d{2})?)"
    # Optional AM/PM suffix. The whitespace belongs to the suffix so that a
    # 24-hour time does not capture trailing spaces, and the \b stops the
    # case-insensitive "AM" from matching the start of a word like "amount".
    meridiem = r"(?:\s*(?:AM|PM)\b)?"

    ref_pattern = r"(?:Reference Code|Transaction Number|Transaction ID|TXN ID)\s*[:\-]?\s*([A-Za-z0-9\-_/]+)"
    amount_pattern = (
        r"(?:Transaction Amount|Txn Amount|Total Amount|Amount|NPR)\s*(?:\([A-Z]{3}\))?\s*"
        + number
    )
    charge_pattern = r"(?:Charge|Change)\s*(?:\([A-Z]{3}\))?\s*" + number

    # Label-based date
    label_based = (
        r"(?:Payment Time|Date\s*/?\s*Time|Transaction Date)\s*[:\-]?\s*"
        r"([0-9]{1,2}[-\s][A-Za-z]{3}[-\s][0-9]{4},?\s*[0-9]{1,2}:[0-9]{2}"
        + meridiem
        + r")"
    )

    # Label-optional fallback
    label_optional = (
        r"\b([0-9]{1,2}[-/\.\s]?[A-Za-z]{3}[-/\.\s]?[0-9]{4},?\s*[0-9]{1,2}:[0-9]{2}"
        + meridiem
        + r")\b"
    )

    date = extract_match(label_based, text) or extract_match(label_optional, text)

    return {
        "amount": extract_match(amount_pattern, text),
        "charge": extract_match(charge_pattern, text),
        "date": date,
        "reference": extract_match(ref_pattern, text),
    }


def _group_words_into_rows(ocr_data, row_tolerance):
    """Cluster non-empty OCR word indices into rows by their ``top`` value.

    A word joins the first existing row whose anchor ``y`` (the ``top`` of
    the row's first word) is less than ``row_tolerance`` pixels away;
    otherwise it starts a new row.
    """
    rows = []
    for index, word in enumerate(ocr_data["text"]):
        if not word.strip():
            continue
        top = ocr_data["top"][index]
        for row in rows:
            if abs(row["y"] - top) < row_tolerance:
                row["indices"].append(index)
                break
        else:
            rows.append({"y": top, "indices": [index]})
    return rows


def _extract_details(ocr_data, row_tolerance):
    """Parse every OCR row and merge the non-empty fields into one dict.

    Later rows overwrite earlier ones when both yield the same field.
    """
    details = {}
    for row in _group_words_into_rows(ocr_data, row_tolerance):
        row_text = " ".join(ocr_data["text"][i] for i in row["indices"])
        row_text = normalize_row_text(row_text)
        for key, value in extract_transaction_data(row_text).items():
            if value:
                details[key] = value
    return details


def main():
    """OCR every file in ``./receipts`` and print its extracted fields.

    Files are processed in sorted name order. Entries that OpenCV cannot
    decode (non-image files, sub-directories) are reported and skipped.

    Returns:
        dict mapping each processed file name to its dict of extracted
        fields.
    """
    row_tolerance = 10  # pixels

    receipts_dir = Path.cwd() / "receipts"
    custom_config = r"--oem 3 --psm 6"

    results = {}

    # Sorted so that the output order does not depend on the filesystem.
    for image_file in sorted(os.listdir(receipts_dir)):
        # str(): a pathlib.Path argument is not accepted by every OpenCV build.
        img = cv2.imread(str(receipts_dir / image_file))
        # cv2.imread returns None instead of raising when decoding fails.
        if img is None:
            print(f"Skipping {image_file}: not a readable image")
            continue

        gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        data = pytesseract.image_to_data(
            gray_img, config=custom_config, output_type=Output.DICT
        )

        # A fresh dict per receipt: sharing one across the loop made fields
        # from earlier images show up in the output of later ones.
        details = _extract_details(data, row_tolerance)
        print(details)
        results[image_file] = details

    return results


if __name__ == "__main__":
    main()

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import cv2

# Load the image in this cell so it runs on a fresh kernel: no cell in this
# notebook assigns a module-level `img` (the only `img` is a local variable
# inside main()).
# NOTE(review): the file below is the sample receipt used in the OCRService
# cell — change it if a different image was meant to be shown.
image_path = Path.cwd() / "receipts" / "WhatsApp Image 2025-12-30 at 06.56.49.jpeg"
img = cv2.imread(str(image_path))
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError(f"Could not read image: {image_path}")

# Convert image from BGR (OpenCV default) to RGB (Matplotlib expects RGB)
img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

plt.figure(figsize=(12, 8))
plt.imshow(img_rgb)
plt.axis("off")  # turn off axes
plt.show()


In [None]:
Payment Time | Date / Time                                                28 Dec 2025, 02:37 PM
Total Amount (NPR) | Amount(NPR) | Amount (NPR) | Txn Amount (NPR)        6,500.00
Reference Code | Transaction Number                                       12345678

In [None]:
Payment Time | Date / Time                                                
Total Amount (NPR) | Amount(NPR) | Amount (NPR) | Txn Amount (NPR)        
Reference Code | Transaction Number                                       