In [2]:
pip install pytesseract pillow opencv-python pandas


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import pytesseract

# Tell pytesseract which Tesseract executable to run. This is a hard-coded
# Windows install path; change it when running on a different machine.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

In [9]:
from PIL import Image
import cv2
import re

def preprocess_image(image_path, threshold_value=150):
    """Load an image and binarize it for OCR.

    The image is read with OpenCV, converted to grayscale and thresholded so
    that pixels brighter than ``threshold_value`` become 255 and all others 0.

    Args:
        image_path: Path of the image file to load.
        threshold_value: Grayscale cut-off handed to ``cv2.threshold``.
            Defaults to 150, the value that used to be hard-coded here.

    Returns:
        The thresholded single-channel image returned by ``cv2.threshold``.

    Raises:
        ValueError: If OpenCV could not read ``image_path`` (missing file,
            unreadable file, or unsupported format).
    """
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    # cv2.imread reports failure by returning None rather than raising; without
    # this check the problem only shows up as an opaque error inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, threshold_value, 255, cv2.THRESH_BINARY)
    return binary

def extract_text(image_path):
    """Run OCR on an image file and return the recognized text.

    The image is binarized by ``preprocess_image`` first, and the result is
    passed to ``pytesseract.image_to_string``.
    """
    return pytesseract.image_to_string(preprocess_image(image_path))


In [16]:
# classify_receipt.py
import os
import pandas as pd
# from utils import extract_text




def classify_type(text):
    """Label OCR text as "Invoice", "Bill", "Receipt" or "Unknown".

    Matching is a case-insensitive substring search. Rules are checked in a
    fixed order, so text containing several keywords gets the first label
    whose keyword is present.
    """
    lowered = text.lower()
    rules = (
        ("Invoice", ("invoice",)),
        ("Bill", ("bill to", "bill no")),
        ("Receipt", ("receipt",)),
    )
    for label, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return label
    return "Unknown"

def extract_fields(text):
    """Pull the date, amount and spending category out of OCR text.

    Args:
        text: Raw OCR output. ``None`` is treated as an empty string.

    Returns:
        A dict with the keys ``"date"``, ``"amount"`` and ``"category"``.
        ``date`` and ``amount`` hold the matched substring or ``"Not found"``;
        ``category`` is "Food", "Travel", "Entertainment" or "Other".
    """
    text = text or ""
    text_lower = text.lower()

    # --- 📅 DATE Patterns ---
    # Tried in order; the first pattern with a match wins.
    date_patterns = [
        r'\b\d{2}[\/\-\.]\d{2}[\/\-\.]\d{2,4}\b',      # 12/04/2024 or 12-04-2024
        r'\b\d{4}[\/\-\.]\d{2}[\/\-\.]\d{2}\b',        # 2024/04/12
        r'\b\d{2} [A-Za-z]{3,9} \d{2,4}\b',            # 12 April 2024
        r'\b[A-Za-z]{3,9} \d{1,2},? \d{4}\b',          # April 12, 2024
    ]
    date = None
    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            date = match.group()
            break

    # --- 💰 AMOUNT Patterns ---
    # Digits, optional comma-separated groups (1,234 or 1,23,456) and an
    # optional two-digit fraction. A comma must be followed by digits, so
    # trailing punctuation ("45, thanks") is no longer captured. The old
    # `[,\d{3}]*` was a character class that also accepted '{', '}' and a
    # dangling ','.
    number = r'(\d+(?:,\d+)*(?:\.\d{2})?)'
    amount_patterns = [
        r'\b(?:total|amount|amt|grand total|balance)\s*[:\-]?\s*₹?\$?\s*' + number,
        r'₹\s?' + number,                               # ₹1,234.56
        r'\$\s?' + number,                              # $1,234.56 (commas used to be cut off)
    ]
    amount = None
    for pattern in amount_patterns:
        match = re.search(pattern, text_lower, re.IGNORECASE)
        if match:
            amount = match.group(1)
            break

    # --- 🏷️ CATEGORY Detection ---
    # NOTE: these are plain substring tests, so a keyword also matches inside
    # longer words (e.g. "bus" inside "business").
    if any(keyword in text_lower for keyword in ["restaurant", "food", "dining", "cafe", "meal"]):
        category = "Food"
    elif any(keyword in text_lower for keyword in ["flight", "uber", "taxi", "bus", "travel", "trip", "train"]):
        category = "Travel"
    elif any(keyword in text_lower for keyword in ["movie", "theater", "concert", "netflix", "event", "entertainment"]):
        category = "Entertainment"
    else:
        category = "Other"

    return {
        "date": date if date else "Not found",
        "amount": amount if amount else "Not found",
        "category": category
    }


def analyze_receipt(image_path):
    """OCR one image and return its file name, document type and parsed fields.

    The returned dict starts with the keys ``"file"`` and ``"type"``, followed
    by whatever ``extract_fields`` returns for the recognized text.
    """
    ocr_text = extract_text(image_path)
    label = classify_type(ocr_text)
    details = extract_fields(ocr_text)

    summary = {"file": os.path.basename(image_path), "type": label}
    summary.update(details)
    return summary

def run_batch(folder="input_images", output_csv="extracted_data/output.csv"):
    """Analyze every image in ``folder`` and write the results to a CSV file.

    Args:
        folder: Directory whose .png/.jpg/.jpeg files (case-insensitive,
            non-recursive) are analyzed.
        output_csv: Destination CSV path. Its parent directory is created
            when it does not exist yet. Any existing file is overwritten.
    """
    # os.listdir returns names in arbitrary order; sorting keeps the CSV rows
    # in the same order from run to run.
    image_names = sorted(
        name for name in os.listdir(folder)
        if name.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    results = [analyze_receipt(os.path.join(folder, name)) for name in image_names]

    # DataFrame.to_csv does not create missing directories, so on a fresh
    # checkout the write used to fail before any output was saved.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    pd.DataFrame(results).to_csv(output_csv, index=False)
    print(f"✅ Data extracted and saved to {output_csv}")

# Process the default "input_images" folder. Inside a notebook cell __name__ is
# "__main__", so the batch starts as soon as this cell is executed.
if __name__ == "__main__":
    run_batch()


✅ Data extracted and saved to extracted_data/output.csv


In [None]:
import cv2
import pytesseract
from PIL import Image
import os
import pandas as pd
import re

# Point pytesseract at the Tesseract executable. The path below is a Windows
# install location; adjust it to wherever Tesseract is installed locally.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# 📸 Step 1: Capture receipt from webcam
def capture_image():
    """Show a live webcam preview and save one frame as the receipt image.

    SPACE saves the current frame to ``input_images/captured_receipt.jpg``;
    ESC cancels.

    Returns:
        The path of the saved image, or ``None`` when the capture was
        cancelled, the camera could not be opened or read, or the frame could
        not be written.
    """
    cap = cv2.VideoCapture(0)
    if not cap.isOpened():
        cap.release()
        print("❌ Could not open the webcam.")
        return None

    print("📸 Press SPACE to capture the receipt, or ESC to exit")
    try:
        while True:
            ret, frame = cap.read()
            # A failed grab yields ret=False and frame=None; showing that frame
            # would raise inside imshow, so stop cleanly instead.
            if not ret or frame is None:
                print("❌ Failed to read a frame from the webcam.")
                return None

            cv2.imshow("Receipt Camera", frame)

            # Mask to the low byte: some platforms set extra high bits on the
            # key code returned by waitKey.
            key = cv2.waitKey(1) & 0xFF
            if key == 27:  # ESC to cancel
                print("❌ Capture cancelled.")
                return None
            if key == 32:  # SPACE to capture
                os.makedirs("input_images", exist_ok=True)
                img_path = os.path.join("input_images", "captured_receipt.jpg")
                # imwrite returns False instead of raising when it cannot save.
                if not cv2.imwrite(img_path, frame):
                    print(f"❌ Could not save the receipt to {img_path}")
                    return None
                print(f"✅ Receipt saved as {img_path}")
                return img_path
    finally:
        # Runs on every exit path, including exceptions, so the camera and the
        # preview window are always released.
        cap.release()
        cv2.destroyAllWindows()

# 🧠 Step 2: OCR and classification
def extract_text(image_path):
    """OCR the image at ``image_path`` and return the recognized text."""
    # Image.open leaves the underlying file open; the context manager closes
    # it once Tesseract has consumed the image, so handles are not leaked.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

def classify_type(text):
    """Return the document label for OCR text.

    The lower-cased text is searched for each keyword in insertion order and
    the label of the first keyword found is returned; "Unknown" when none of
    them occurs.
    """
    normalized = text.lower()
    label_by_keyword = {
        "invoice": "Invoice",
        "bill to": "Bill",
        "bill no": "Bill",
        "receipt": "Receipt",
    }
    return next(
        (label for keyword, label in label_by_keyword.items() if keyword in normalized),
        "Unknown",
    )

def extract_fields(text):
    """Extract the date, amount and spending category from OCR text.

    Args:
        text: Raw OCR output. ``None`` is treated as an empty string.

    Returns:
        A dict with the keys ``"date"``, ``"amount"`` and ``"category"``.
        ``date`` and ``amount`` hold the matched substring or ``"Not found"``;
        ``category`` is "Food", "Travel", "Entertainment" or "Other".
    """
    text = text or ""
    text_lower = text.lower()

    # --- 📅 DATE Patterns ---
    # Tried in order; the first pattern with a match wins.
    date_patterns = [
        r'\b\d{2}[\/\-\.]\d{2}[\/\-\.]\d{2,4}\b',      # 12/04/2024 or 12-04-2024
        r'\b\d{4}[\/\-\.]\d{2}[\/\-\.]\d{2}\b',        # 2024/04/12
        r'\b\d{2} [A-Za-z]{3,9} \d{2,4}\b',            # 12 April 2024
        r'\b[A-Za-z]{3,9} \d{1,2},? \d{4}\b',          # April 12, 2024
    ]
    date_hits = (re.search(pattern, text) for pattern in date_patterns)
    date = next((hit.group() for hit in date_hits if hit), None)

    # --- 💰 AMOUNT Patterns ---
    # Digits, optional comma-separated groups (1,234 or 1,23,456) and an
    # optional two-digit fraction. Each comma must be followed by digits, so
    # trailing punctuation such as "45," is not captured. The previous
    # `[,\d{3}]*` was a character class that also swallowed '{', '}' and a
    # dangling ','.
    number = r'(\d+(?:,\d+)*(?:\.\d{2})?)'
    amount_patterns = [
        r'\b(?:total|amount|amt|grand total|balance)\s*[:\-]?\s*₹?\$?\s*' + number,
        r'₹\s?' + number,                               # ₹1,234.56
        r'\$\s?' + number,                              # $1,234.56 (commas used to be cut off)
    ]
    amount_hits = (re.search(pattern, text_lower, re.IGNORECASE) for pattern in amount_patterns)
    amount = next((hit.group(1) for hit in amount_hits if hit), None)

    # --- 🏷️ CATEGORY Detection ---
    # NOTE: keywords are plain substring tests, so they also match inside
    # longer words (e.g. "bus" inside "business"). The first category with a
    # hit wins.
    category_keywords = (
        ("Food", ("restaurant", "food", "dining", "cafe", "meal")),
        ("Travel", ("flight", "uber", "taxi", "bus", "travel", "trip", "train")),
        ("Entertainment", ("movie", "theater", "concert", "netflix", "event", "entertainment")),
    )
    category = "Other"
    for name, keywords in category_keywords:
        if any(keyword in text_lower for keyword in keywords):
            category = name
            break

    return {
        "date": date if date else "Not found",
        "amount": amount if amount else "Not found",
        "category": category
    }


def analyze_receipt(image_path):
    """Run OCR on one image and combine the classification with parsed fields.

    Returns a dict whose first keys are ``"file"`` (base name of
    ``image_path``) and ``"type"``, followed by the entries produced by
    ``extract_fields``.
    """
    recognized = extract_text(image_path)
    doc_type, parsed = classify_type(recognized), extract_fields(recognized)
    header = dict(file=os.path.basename(image_path), type=doc_type)
    return {**header, **parsed}

# 💾 Step 3: Save to CSV
def save_to_csv(data, output_csv="extracted_data/output.csv"):
    """Append one result record to the output CSV, creating it when needed.

    Args:
        data: Mapping of column name to value for a single row.
        output_csv: CSV file to append to. Its parent directory is created
            when it does not exist yet.
    """
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the header whenever the file has no content yet. Checking only for
    # existence left a zero-byte file without column names forever.
    needs_header = not os.path.exists(output_csv) or os.path.getsize(output_csv) == 0

    # Mode 'a' creates the file when it is missing, so one call covers both cases.
    pd.DataFrame([data]).to_csv(output_csv, mode='a', header=needs_header, index=False)
    print(f"✅ Data saved to {output_csv}")

# 🔁 Run the whole process
if __name__ == "__main__":
    # capture_image() returns None when the user cancels with ESC; in that
    # case nothing is analyzed or saved.
    img_path = capture_image()
    if img_path:
        result = analyze_receipt(img_path)
        print("🔍 Analysis Result:", result)
        # Store the result as one row of extracted_data/output.csv.
        save_to_csv(result)


📸 Press SPACE to capture the receipt, or ESC to exit
✅ Receipt saved as input_images\captured_receipt.jpg
🔍 Analysis Result: {'file': 'captured_receipt.jpg', 'type': 'Unknown', 'date': 'Not found', 'amount': 'Not found', 'category': 'Other'}
✅ Data saved to extracted_data/output.csv


In [17]:
import pandas as pd

# Input CSV written by the extraction cells, and the Excel file to create from it
csv_file = 'extracted_data/output.csv'
excel_file = 'extracted_data/output.xlsx'

# Convert to Excel. Writing .xlsx requires an Excel engine (e.g. openpyxl) to be
# installed alongside pandas.
df = pd.read_csv(csv_file)
df.to_excel(excel_file, index=False)

print("✅ CSV converted to Excel:", excel_file)


✅ CSV converted to Excel: extracted_data/output.xlsx
