<a href="https://colab.research.google.com/github/AKDW26/Invoice-Extraction/blob/main/OCR_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pytesseract pillow kaggle

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13


In [2]:
pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m58.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.3


In [3]:
import pytesseract
import re
import os
import json
from PIL import Image
from pathlib import Path

In [4]:
# Setup Kaggle credentials
def setup_kaggle():
    """Prepare Kaggle API credentials without embedding secrets in the notebook.

    Credentials are taken from the ``KAGGLE_USERNAME`` and ``KAGGLE_KEY``
    environment variables. When both are set they are written to
    ``~/.kaggle/kaggle.json`` with owner-only permissions. When they are not
    set, an already existing ``~/.kaggle/kaggle.json`` is left untouched.

    SECURITY NOTE: earlier revisions of this cell hardcoded an API key. Any
    key that was committed must be treated as leaked and revoked in the
    Kaggle account settings.

    Raises:
        RuntimeError: if neither the environment variables nor an existing
            ``kaggle.json`` provide credentials.
    """
    kaggle_dir = Path.home() / '.kaggle'
    kaggle_file = kaggle_dir / 'kaggle.json'

    username = os.environ.get('KAGGLE_USERNAME')
    key = os.environ.get('KAGGLE_KEY')

    if username and key:
        kaggle_dir.mkdir(parents=True, exist_ok=True)

        # Create the file with mode 0600 from the start so the key is never
        # readable by other users, not even briefly.
        fd = os.open(kaggle_file, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, 'w') as f:
            json.dump({"username": username, "key": key}, f)

        # A pre-existing file keeps its old mode on open(), so tighten it explicitly.
        os.chmod(kaggle_file, 0o600)
    elif not kaggle_file.exists():
        raise RuntimeError(
            "Kaggle credentials not found. Set the KAGGLE_USERNAME and KAGGLE_KEY "
            f"environment variables or create {kaggle_file}."
        )

    print("Kaggle setup complete!")

In [5]:
# Download dataset
def get_data(dataset='urbikn/sroie-datasetv2', path='./data'):
    """Download and unzip a Kaggle dataset.

    Args:
        dataset: Kaggle dataset slug in ``owner/name`` form.
        path: Directory the archive is downloaded to and unpacked in.

    Any failure (missing credentials, missing ``kaggle`` package, network or
    API error) is reported on stdout together with a manual download link;
    nothing is raised to the caller.
    """
    try:
        setup_kaggle()

        # Imported here, after credentials are in place and inside the try:
        # the kaggle package is expected to authenticate as soon as it is
        # imported, so a credential problem would otherwise escape the
        # fallback message below.
        import kaggle

        kaggle.api.dataset_download_files(dataset, path=path, unzip=True)
        print("Data downloaded!")
    except Exception as e:
        print(f"Download failed: {e}")
        print(f"You can manually download from: https://www.kaggle.com/datasets/{dataset}")

In [6]:
# Extract text from image
def get_text(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle once OCR is done,
    # which matters when many receipts are processed in one kernel session.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

In [7]:
# find invoice number
def find_invoice_number(text):
    """Return the first digit run that follows an invoice marker.

    The lowercased text is probed, in order, for digits after ``invoice``,
    after ``inv``, and after ``#``. The first probe that hits wins; when none
    hits the string ``"Not found"`` is returned.
    """
    lowered = text.lower()
    probes = (
        re.search(regex, lowered)
        for regex in (r'invoice.*?(\d+)', r'inv.*?(\d+)', r'#(\d+)')
    )
    # The generator is lazy, so later regexes only run if earlier ones miss.
    hit = next((found for found in probes if found), None)
    return hit.group(1) if hit else "Not found"

In [8]:
# Find date
def find_date(text):
    """Return the first ``d/m/y``-shaped token in ``text``.

    A token is 1-2 digits, a ``/`` or ``-``, 1-2 digits, a ``/`` or ``-``,
    then 2-4 digits. ``"Not found"`` is returned when nothing matches.
    """
    date_regex = re.compile(r'(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})')
    hit = date_regex.search(text)
    return "Not found" if hit is None else hit.group(1)

In [9]:
# Find total amount
def find_total(text):
    """Return the total amount found in ``text`` as a string.

    The lowercased text is searched first for a number following the word
    ``total`` on the same line, then for a number following a ``$`` sign.

    * ``subtotal`` / ``sub total`` / ``sub-total`` are skipped so that the
      subtotal line is not mistaken for the grand total.
    * Amounts with thousands separators (``1,234.50``) are captured whole and
      returned without the commas (``1234.50``).

    Returns:
        The matched amount, or ``"Not found"`` when nothing matches.
    """
    # Either a comma-grouped number or the plain integer/decimal form.
    amount = r'(\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+\.?\d*)'
    patterns = [
        # Fixed-width lookbehinds reject "subtotal" and "sub total"/"sub-total".
        r'(?<!sub)(?<!sub[\s-])total.*?' + amount,
        r'\$' + amount,
    ]

    lowered = text.lower()
    for pattern in patterns:
        match = re.search(pattern, lowered)
        if match:
            return match.group(1).replace(',', '')
    return "Not found"

In [10]:
# Main extraction function
def extract_invoice_data(image_path):
    """OCR an invoice image and collect the extracted fields in a dict.

    Returns a dict with the keys ``invoice_number``, ``date``, ``total``,
    ``line_items`` and ``full_text``. ``full_text`` holds the OCR text, cut to
    its first 200 characters plus ``"..."`` when the text is longer than that.
    """
    text = get_text(image_path)

    # Keep only a short preview of the OCR output in the result.
    preview = text
    if len(text) > 200:
        preview = text[:200] + "..."

    return {
        'invoice_number': find_invoice_number(text),
        'date': find_date(text),
        'total': find_total(text),
        'line_items': find_line_items(text),
        'full_text': preview,
    }

In [11]:
# Extract line items
def find_line_items(text):
    """Parse line items out of OCR text, one candidate per non-blank line.

    Two layouts are tried on each stripped line, in this order:

    1. ``Item Qty Price Total``
    2. ``Item $Price x Qty = $Total``

    The first layout that matches produces one dict with the string fields
    ``item``, ``qty``, ``price`` and ``total``. Lines matching neither layout
    are ignored.
    """
    field_names = ('item', 'qty', 'price', 'total')
    # Each entry: (regex, capture-group index for item, qty, price, total).
    layouts = (
        (r'([A-Za-z\s]+)\s+(\d+)\s+(\d+\.?\d*)\s+(\d+\.?\d*)', (1, 2, 3, 4)),
        (r'([A-Za-z\s]+)\s+\$(\d+\.?\d*)\s+x\s+(\d+)\s+=\s+\$(\d+\.?\d*)', (1, 3, 2, 4)),
    )

    parsed = []
    for raw_line in text.split('\n'):
        row = raw_line.strip()
        if not row:
            continue

        for regex, group_order in layouts:
            hit = re.search(regex, row)
            if hit is None:
                continue
            record = dict(zip(field_names, (hit.group(idx) for idx in group_order)))
            record['item'] = record['item'].strip()
            parsed.append(record)
            break  # first matching layout wins

    return parsed

In [12]:
# Convert PDF to images
def pdf_to_images(pdf_path):
    """Render every page of a PDF to ``page_<n>.png`` in the working directory.

    Args:
        pdf_path: Path of the PDF to convert.

    Returns:
        The list of written image paths in page order, or ``[]`` when PyMuPDF
        is not installed or the conversion fails. Failures are reported on
        stdout rather than raised.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Only a missing package gets the install hint; real conversion
        # errors are reported with their own message below.
        print("Install PyMuPDF: pip install PyMuPDF")
        return []

    images = []
    try:
        doc = fitz.open(pdf_path)
        try:
            for page_num in range(len(doc)):
                page = doc[page_num]
                pix = page.get_pixmap()
                img_path = f"page_{page_num + 1}.png"
                pix.save(img_path)
                images.append(img_path)
        finally:
            # Release the document even if rendering a page fails.
            doc.close()
    except Exception as e:
        print(f"PDF conversion failed for {pdf_path}: {e}")
        return []

    print(f"Converted PDF to {len(images)} images")
    return images

In [13]:
# Add custom field extraction
def add_custom_field(text, field_name, pattern):
    """Extract a user-defined field from ``text`` with a regular expression.

    Args:
        text: Text to search. It is lowercased before matching, so ``pattern``
            must be written in lowercase and the result is lowercase too.
        field_name: Label of the field. Currently unused by the function; it
            is kept so existing callers keep working.
        pattern: Regular expression. When it has a capture group, group 1 is
            returned; when it has none, the whole match is returned.

    Returns:
        The extracted string, or ``"Not found"`` when the pattern does not
        match or its first group did not take part in the match.
    """
    match = re.search(pattern, text.lower())
    if not match:
        return "Not found"

    # Patterns without a capture group would raise IndexError on group(1).
    value = match.group(1) if match.re.groups else match.group(0)
    return value if value is not None else "Not found"

In [14]:
# Train model (simple pattern learning)
def train_model(training_data):
    """Learn simple extraction patterns from labelled examples.

    Training data format: ``[{'text': '...', 'labels': {'field': 'value'}}]``

    For every label whose value occurs in the example text (compared
    case-insensitively), the word immediately before the first occurrence is
    used as an anchor and the pattern ``<anchor>.*?([a-z0-9\\s]+)`` is
    recorded for that field. Patterns are written for lowercased text.

    Returns:
        Dict mapping each field name seen in the labels to its list of learned
        patterns. The list is empty when no example yielded a pattern.
    """
    learned_patterns = {}

    for example in training_data:
        lowered_text = example['text'].lower()

        for field, value in example['labels'].items():
            field_patterns = learned_patterns.setdefault(field, [])

            # Compare in lowercase on both sides; an empty needle would make
            # str.split raise, so it is skipped.
            needle = str(value).lower()
            if not needle or needle not in lowered_text:
                continue

            words_before = lowered_text.split(needle, 1)[0].split()
            if words_before:
                # Escape the anchor: OCR words often contain regex
                # metacharacters such as "(" or "." that must match literally.
                anchor = re.escape(words_before[-1])
                field_patterns.append(anchor + r".*?([a-z0-9\s]+)")

    return learned_patterns

In [15]:
# Use trained patterns
def extract_with_trained_model(text, learned_patterns):
    """Apply learned patterns to ``text`` and return one value per field.

    For each field the patterns are tried in order against the lowercased
    text. The first match supplies the value (capture group 1, stripped);
    fields with no matching pattern map to ``"Not found"``.
    """
    results = {}
    for field, patterns in learned_patterns.items():
        # Lazy generator: patterns after the first hit are never evaluated.
        attempts = (re.search(regex, text.lower()) for regex in patterns)
        first_hit = next((found for found in attempts if found), None)
        results[field] = first_hit.group(1).strip() if first_hit else "Not found"
    return results

In [17]:
# Example usage:
if __name__ == "__main__":
    # Download data (unpacked into ./data by default)
    get_data()

    # Resolve the sample image relative to the download directory instead of
    # an absolute Colab-only path, so the demo also runs outside /content.
    image_path = Path("data") / "SROIE2019" / "test" / "img" / "X51005200931.jpg"

    if not image_path.exists():
        # Typically means the download above failed; avoid an opaque traceback.
        print(f"Sample image not found: {image_path}")
    else:
        # Extract basic fields
        data = extract_invoice_data(image_path)
        print("Extracted Data:", data)

        # Add custom field (example: company name)
        text = get_text(image_path)
        company = add_custom_field(text, "company", r'company.*?([a-z\s]+)')
        print("Company:", company)

Kaggle setup complete!
Dataset URL: https://www.kaggle.com/datasets/urbikn/sroie-datasetv2
Data downloaded!
Extracted Data: {'invoice_number': 'Not found', 'date': '09/02/2078', 'total': '411.50', 'line_items': [{'item': 'GA', 'qty': '4', 'price': '85.00', 'total': '6.00'}, {'item': 'Ge', 'qty': '4', 'price': '1900', 'total': '4'}], 'full_text': ' \n\nPERNIAGAAN ZHENG HU!\nJMOs25956-V\nNO.8@ JALAN PERMAS 9/6\nBANDAR BARU PERMAS JAYA\n81780 JOHOR BAHRU\nTEL: 07-386 7524 FAX : 07-388 3798\nGST NO : DoNROOBGa24\n\nSIMPLIFIED TAX INVOICE\n\n \n\n  \n\n \n\nSalesper...'}
Company: Not found
