In [88]:
import pdfplumber
import re

In [89]:
loc_1 = r"/Users/azerty/Downloads/invoice1-word-example.pdf"

In [90]:
with pdfplumber.open(loc_1) as pdf:
    page = pdf.pages[0]
    text = page.extract_text()
print(text)

INVOICE
INVOICE #1000
DATE: 05/05/2025
BILLED TO: PAYABLE TO:
John Smith Jane Doe
35 Hill Street Name of Company
Jacksonville, FL 62541 Jacksonville, FL 62971
Phone (234) 387-3987 Phone (120) 234-9876
DESCRIPTION QTY PRICE LINE TOTAL
Product Item 1 $10.00 $10.00
Product Item 1 $20.00 $20.00
Product Item 1 $30.00 $30.00
Product Item 1 $40.00 $40.00
Subtotal $100.00
Discount $0.00
Taxes $0.00
Total $100.00
Terms:


In [91]:
from rapidfuzz import process, fuzz

In [92]:
def normalize(s):
    s = s.lower()
    s = re.sub(r"[^a-z0-9\s]", " ", s)
    return s

def fuzzy_extract(text, keywords, score_threshold=50):
    norm_text = normalize(text)
    norm_keywords = [normalize(k) for k in keywords]

    lines = [line.strip() for line in norm_text.splitlines() if line.strip()]

    best_match = process.extractOne(
        query=" ".join(norm_keywords),
        choices=lines,
        scorer=fuzz.partial_ratio
    )

    if best_match and best_match[1] >= score_threshold:
        return best_match[0]

    return None

def extract_number(line):
    if not line:
        return None
    match = re.search(r"\d+[.,]?\d*", line.replace(",", "."))
    return match.group() if match else None

In [93]:
def extract_amount(text, keywords):
    text_lower = text.lower()
    for key in keywords:
        # pattern matches: "Subtotal $100.00" or "Taxes: 0.00"
        pattern = rf"\b{key.lower()}\b\s*[:\-]?\s*\$?\s*(\d+[.,]?\d*)"
        match = re.search(pattern, text_lower)
        if match:
            return match.group(1).replace(",", ".")
    return None

In [107]:
supplier = fuzzy_extract(
    text,
    keywords=["supplier", "vendor", "company", "issued by"]
)
supplier_number = extract_number(supplier)
print("Supplier Number:", supplier_number)

Supplier Number: None


In [95]:
def extract_invoice_number(text):
    text = text.lower()

    patterns = [
        r"\binvoice\s*#\s*(\d+)\b",                 # invoice #1000
        r"\binvoice\s*(no|number|n)?\.?\s*[:\-]?\s*(\d+)\b",
        r"\bfacture\s*(n|num|num√©ro)?\.?\s*[:\-]?\s*(\d+)\b"
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            # last capturing group is always the number
            return match.groups()[-1]

    return None

In [96]:
invoice_number = extract_invoice_number(text)
print("Invoice Number:", invoice_number)

Invoice Number: 1000


In [97]:
def extract_invoice_date(text):
    text = text.lower()

    date_patterns = [
        r"\b\d{2}[-/\.]\d{2}[-/\.]\d{4}\b",   # 12-01-2024
        r"\b\d{4}[-/\.]\d{2}[-/\.]\d{2}\b",   # 2024-01-12
        r"\b\d{2}[-/\.]\d{2}\b",              # 12-01 or 12/01
        r"\b\d{1,2}\s?(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s?\d{4}\b",
        r"\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)\s?\d{1,2},?\s?\d{4}\b"
    ]

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group()

    return None

In [98]:
invoice_date = extract_invoice_date(text)
print("Invoice Date:", invoice_date)

Invoice Date: 05/05/2025


In [99]:
subtotal_line = fuzzy_extract(
    text,
    keywords=["subtotal", "net amount", "amount before tax"]
)
subtotal = extract_number(subtotal_line)
print("Subtotal:", subtotal)        

Subtotal: 100


In [100]:
tax_line = fuzzy_extract(
    text,
    keywords=["tax","taxes", "vat", "tva"]
)
tax = extract_number(tax_line)
print("Tax:", tax)

Tax: 0


In [101]:
def extract_total(text):
    text = text.lower()

    patterns = [
        r"\btotal\b\s*[:\-]?\s*\$?\s*(\d+[.,]?\d*)",
        r"\bgrand\s+total\b\s*[:\-]?\s*\$?\s*(\d+[.,]?\d*)",
        r"\bamount\s+due\b\s*[:\-]?\s*\$?\s*(\d+[.,]?\d*)"
    ]

    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1).replace(",", ".")

    return None

In [102]:
total = extract_total(text)
print("Total Amount:", total)

Total Amount: 100.00


In [103]:
import json

In [109]:
def extract_invoice_json(text):
    return {
        "supplier": fuzzy_extract(text, ["company", "vendor"]),
        "invoice_number": extract_invoice_number(text),
        "invoice_date": extract_invoice_date(text),
        "subtotal": extract_amount(text, ["subtotal", "net amount", "amount before tax"]),
        "tax": extract_amount(text, ["tax", "vat", "tva"]),
        "total": extract_total(text)

    }

In [110]:
invoice_json = extract_invoice_json(text)
print(invoice_json)

{'supplier': '35 hill street name of company', 'invoice_number': '1000', 'invoice_date': '05/05/2025', 'subtotal': '100.00', 'tax': None, 'total': '100.00'}
