<a href="https://colab.research.google.com/github/ChiroDeniro/ai-ml-projecten/blob/main/Factuurcheckerv2_ocr.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install -y poppler-utils
!pip install PyPDF2 pdfplumber pdf2image pytesseract


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 1s (189 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 121703 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.8-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━

Mount Drive

In [6]:
from google.colab import drive
import os
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

# Folder on the mounted Drive that holds the invoice PDFs to check.
PDF_FOLDER = '/content/drive/MyDrive/AI_Data/Facturen'
print("PDF folder:", PDF_FOLDER)


Mounted at /content/drive
PDF folder: /content/drive/MyDrive/AI_Data/Facturen


Text extractors laden

In [2]:
import re
import os
from PyPDF2 import PdfReader
import pdfplumber
from pdf2image import convert_from_path
import pytesseract

# Tesseract config: tell pytesseract which tesseract executable to invoke.
pytesseract.pytesseract.tesseract_cmd = "/usr/bin/tesseract"

def extract_text_pypdf2(path):
    """Extract the text layer of a PDF with PyPDF2.

    Basic extractor: weak on multi-column layouts, fine for simple invoices.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The text of all pages, each page followed by a newline. An empty
        string is returned when the PDF cannot be opened or parsed.
    """
    try:
        reader = PdfReader(path)
        # extract_text() can yield None/"" for pages without a text layer.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    except Exception as e:
        # Best-effort extractor: report the failure instead of hiding it, and
        # (unlike a bare `except:`) let KeyboardInterrupt/SystemExit propagate.
        print("PyPDF2 error:", e)
        return ""


def extract_text_pdfplumber(path):
    """Extract the text layer of a PDF with pdfplumber.

    Better suited than PyPDF2 for column and table style invoice layouts.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The text of all pages, each page followed by a newline. An empty
        string is returned when the PDF cannot be opened or parsed.
    """
    try:
        with pdfplumber.open(path) as pdf:
            # extract_text() can yield None for pages without a text layer.
            return "".join((page.extract_text() or "") + "\n" for page in pdf.pages)
    except Exception as e:
        # Best-effort extractor: report the failure instead of hiding it, and
        # (unlike a bare `except:`) let KeyboardInterrupt/SystemExit propagate.
        print("pdfplumber error:", e)
        return ""


def extract_text_ocr(path):
    """Fallback extractor: render every page to an image and OCR it.

    Args:
        path: Path of the PDF file to read.

    Returns:
        The OCR text of all pages, each page followed by a newline. On any
        error the message is printed and an empty string is returned.
    """
    try:
        page_images = convert_from_path(path)
        return "".join(
            pytesseract.image_to_string(page_image) + "\n"
            for page_image in page_images
        )
    except Exception as e:
        print("OCR error:", e)
        return ""


def extract_text_full(path):
    """Combine the output of all extraction methods for one PDF.

    PyPDF2 and pdfplumber are always run. OCR is only run when both of them
    return fewer than 50 non-whitespace-trimmed characters, i.e. the PDF has
    practically no text layer.

    Args:
        path: Path of the PDF file to read.

    Returns:
        PyPDF2 text, pdfplumber text and OCR text (empty when OCR was
        skipped), joined with newlines in that order.
    """
    text_layer_results = [extract_text_pypdf2(path), extract_text_pdfplumber(path)]

    # Only fall back to OCR when every text-layer extractor came back nearly empty.
    nearly_empty = all(len(chunk.strip()) < 50 for chunk in text_layer_results)
    ocr_result = extract_text_ocr(path) if nearly_empty else ""

    return "\n".join([*text_layer_results, ocr_result])


Benodigde velden en synoniemen

In [3]:
# Field name -> regex. check_invoice applies each pattern with re.IGNORECASE
# and reads the value from capture group 2 (group 1 is the matched label), so
# every pattern must expose its value as group 2.
#
# _LABEL_TO_VALUE skips optional extra label text and separators between the
# label and the value. The skip is lazy so the value itself is never consumed
# when there is no colon (e.g. "Factuurnummer 5147798").
_LABEL_TO_VALUE = r"[^\n:]*?[:\s.#]*"
_FIELD_DATE = r"([0-9]{1,2}[/-][0-9]{1,2}[/-][0-9]{2,4})"
# Identifier that contains at least one digit, so that under IGNORECASE a
# plain word following the label is not mistaken for the value.
_FIELD_ID = r"((?=[A-Z0-9\-/.]*[0-9])[A-Z0-9][A-Z0-9\-/.]*)"

REQUIRED_FIELDS = {
    "factuurnummer": r"\b(factuurnummer|invoice nr|invoice number|factuur nr)" + _LABEL_TO_VALUE + _FIELD_ID,
    # Leading \b keeps "datum" from matching inside "leveringsdatum"/"vervaldatum".
    "factuurdatum": r"\b(factuurdatum|invoice date|datum)" + _LABEL_TO_VALUE + _FIELD_DATE,
    "leveringsdatum": r"\b(leveringsdatum|delivery date)" + _LABEL_TO_VALUE + _FIELD_DATE,
    # Free-text fields: require "<label>...:" and capture the rest of the line
    # as group 2. Whole-word labels so "to" does not match inside "totaal" and
    # "klant" does not match inside "klantenservice".
    "verkoper": r"\b(bedrijf|seller|leverancier|verkoper|from)\b[^\n:]*:\s*([^\n]*\S)",
    "klant": r"\b(klant|customer|to)\b[^\n:]*:\s*([^\n]*\S)",
    # Two letters followed by alphanumerics containing at least one digit.
    "btw_nummer": r"\b(btw|vat)" + _LABEL_TO_VALUE + r"([A-Z]{2}(?=[A-Z0-9]*[0-9])[A-Z0-9]+)",
    # Leading \b skips "subtotaal"; the amount may contain thousands separators.
    "totaal_bedrag": r"\b(totaal|total|amount due)[^\n]*?([0-9][0-9.,]*[.,][0-9]{2})(?![0-9])",
}


Checker met drie methodes

In [4]:
def check_invoice(pdf_path):
    """Check one invoice PDF for every field listed in REQUIRED_FIELDS.

    Args:
        pdf_path: Path of the invoice PDF.

    Returns:
        A dict with:
          "bestand":   the file name of the PDF,
          "gevonden":  field name -> captured value (regex group 2), or None
                       when the pattern does not match or has no second group,
          "ontbreekt": list of field names without a value.
    """
    text = extract_text_full(pdf_path)

    found = {}
    for field_name, field_regex in REQUIRED_FIELDS.items():
        hit = re.search(field_regex, text, re.IGNORECASE)
        has_value_group = hit is not None and len(hit.groups()) > 1
        found[field_name] = hit.group(2) if has_value_group else None

    missing = [field_name for field_name, value in found.items() if not value]

    return {
        "bestand": os.path.basename(pdf_path),
        "gevonden": found,
        "ontbreekt": missing,
    }


PDF's inlezen uit Drive

In [7]:
def check_all_pdfs_in_folder(folder):
    """Run check_invoice on every PDF file directly inside a folder.

    Files are processed in sorted name order so repeated runs give the same
    result order (os.listdir order is arbitrary). Entries whose name ends in
    ".pdf" (case-insensitive) but that are not regular files are skipped.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list with one check_invoice result per PDF, in file name order.
    """
    results = []
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder, filename)
        if not filename.lower().endswith(".pdf") or not os.path.isfile(path):
            continue
        print("Verwerken:", filename)
        results.append(check_invoice(path))
    return results

# Check every PDF in PDF_FOLDER; the bare name on the last line shows the
# list of results as the cell output.
all_results = check_all_pdfs_in_folder(PDF_FOLDER)
all_results


Verwerken: boekstra_factuur_5147798.pdf


[{'bestand': 'boekstra_factuur_5147798.pdf',
  'gevonden': {'factuurnummer': '5147798',
   'factuurdatum': '06-11-2025',
   'leveringsdatum': None,
   'verkoper': None,
   'klant': None,
   'btw_nummer': None,
   'totaal_bedrag': '29,70'},
  'ontbreekt': ['leveringsdatum', 'verkoper', 'klant', 'btw_nummer']}]

OCR Output

In [8]:
# Show the raw OCR text of one sample invoice. The path is built from
# PDF_FOLDER so this cell follows the configured folder instead of repeating
# the absolute Drive path.
text_ocr = extract_text_ocr(os.path.join(PDF_FOLDER, "boekstra_factuur_5147798.pdf"))
print(text_ocr)


Boekstra

Postbus 7

7440 AA NIJVERDAL
The Netherlands

B O E KST RA BTW: NL815500294B01

KvK: 04061579
IBAN: NL29BUNQ2032275643
klantenservice@boekstra.nl

Factuur

Christiaan van Rhijn

Sint-jacobstraat 109

3011 DK Rotterdam
Datum: 06-11-2025
Factuurnummer: 5147798

Aantal Schrijver Titel Stukprijs Prijs
. Cybernetics, Second Edition: or Control
1 Wiener, Norbert and €29,70 €29,70
Communication in the Animal and the
Machine

EAN 9781684931156

Verzendkosten: € 0,00
Aantal orderregels: 1 Totaalprijs: € 29,70
NL-BTW 9%: € 2,45

Deze factuur is reeds door u betaald op 06-11-2025.

Pagina 1/1 www.boekstra.nl




Smarter Parser op OCR

In [11]:
import re

# dd-mm-yyyy or dd/mm/yyyy
_DATE_PATTERN = r"([0-9]{2}[-/][0-9]{2}[-/][0-9]{4})"
# Amount with two decimals; may contain thousands separators ("1.234,56").
# The negative lookahead avoids cutting a longer number short ("1.234" -> "1.23").
_AMOUNT_PATTERN = r"([0-9][0-9.,]*[.,][0-9]{2})(?![0-9])"


def _first_capture(patterns, text):
    """Return the last capture group of the first pattern that matches, else None."""
    for pat in patterns:
        m = re.search(pat, text, re.I)
        if m:
            return m.group(m.lastindex)
    return None


def _find_address_blocks(text):
    """Return one text block per line that contains a Dutch-style postcode."""
    lines = [l.strip() for l in text.split("\n") if l.strip()]
    postcode_regex = r"\b[0-9]{4}\s?[A-Z]{2}\b"

    # Drop lines that clearly are not part of an address.
    filtered = [
        l for l in lines
        if not re.search(r"(totaal|prijs|€|datum|factuur|btw|kvk|iban)", l, re.I)
    ]

    blocks = []
    for i, line in enumerate(filtered):
        if re.search(postcode_regex, line):
            # Up to 3 remaining lines above the postcode line, the postcode
            # line itself, and up to 2 lines below it.
            blocks.append("\n".join(filtered[max(0, i - 3):i + 3]))
    return blocks


def smarter_extract(text):
    """Heuristically extract invoice fields from (OCR) invoice text.

    Args:
        text: Full invoice text. None is treated as empty text.

    Returns:
        A dict with the keys "factuurnummer", "factuurdatum",
        "leveringsdatum", "verkoper", "klant", "btw_nummer" and
        "totaal_bedrag". A value is None when the field was not found.
    """
    text = text or ""

    info = {
        "factuurnummer": None,
        "factuurdatum": None,
        "leveringsdatum": None,
        "verkoper": None,
        "klant": None,
        "btw_nummer": None,
        "totaal_bedrag": None
    }

    # ------------------------------
    # INVOICE NUMBER: labelled forms first, then any 5+ digit number on a
    # line that contains the word "factuur".
    # ------------------------------
    info["factuurnummer"] = _first_capture([
        r"factuurnummer[:\s]+([A-Z0-9\-\.]+)",
        r"factuur\s*nr[:\s]+([A-Z0-9\-\.]+)",
        r"invoice (no|number)[:\s]+([A-Z0-9\-\.]+)",
        r"\bfactuur\b[^\n]*?([0-9]{5,})"
    ], text)

    # ------------------------------
    # INVOICE DATE: "Datum:" / "Factuurdatum:" label, otherwise the first
    # date in the text. The \b keeps "Leveringsdatum: ..." from being taken
    # as the invoice date by the labelled pattern.
    # ------------------------------
    info["factuurdatum"] = _first_capture([
        r"\b(?:factuur)?datum[:\s]+" + _DATE_PATTERN,
        _DATE_PATTERN
    ], text)

    # ------------------------------
    # DELIVERY DATE: only when explicitly labelled.
    # ------------------------------
    info["leveringsdatum"] = _first_capture([
        r"\b(?:leveringsdatum|leverdatum|delivery date)[:\s]+" + _DATE_PATTERN
    ], text)

    # ------------------------------
    # VAT NUMBER: country prefix + 6-14 alphanumerics. The lookahead requires
    # at least one digit, otherwise (with re.I) ordinary words such as
    # "Bestelling" or "Betaling" would match.
    # ------------------------------
    vat_pattern = r"\b(?:NL|BE|DE|FR|ES|IT|PT|GB|DK|SE|FI|NO|PL)(?=[A-Z0-9]*[0-9])[A-Z0-9]{6,14}\b"
    m = re.search(vat_pattern, text, re.I)
    if m:
        info["btw_nummer"] = m.group(0)

    # ------------------------------
    # TOTAL AMOUNT: labelled totals first (\b skips "Subtotaal"), then the
    # first amount preceded by a euro sign.
    # ------------------------------
    info["totaal_bedrag"] = _first_capture([
        r"\btotaalprijs[:\s€]*" + _AMOUNT_PATTERN,
        r"\btotaal[:\s€]*" + _AMOUNT_PATTERN,
        r"\bamount due[:\s€]*" + _AMOUNT_PATTERN,
        r"€\s*" + _AMOUNT_PATTERN
    ], text)

    # ------------------------------
    # ADDRESS BLOCKS: the first block is taken as the seller, the second as
    # the customer.
    # ------------------------------
    address_blocks = _find_address_blocks(text)
    if len(address_blocks) >= 1:
        info["verkoper"] = address_blocks[0]
    if len(address_blocks) >= 2:
        info["klant"] = address_blocks[1]

    return info


In [12]:
# Sample OCR text of one invoice, pasted inline so the parser can be tried
# without running OCR again.
ocr_text = """
oekstra

Postbus 7

7440 AA NIJVERDAL
The Netherlands

B O E KST RA BTW: NL815500294B01

KvK: 04061579
IBAN: NL29BUNQ2032275643
klantenservice@boekstra.nl

Factuur

Christiaan van Rhijn

Sint-jacobstraat 109

3011 DK Rotterdam
Datum: 06-11-2025
Factuurnummer: 5147798

Aantal Schrijver Titel Stukprijs Prijs
. Cybernetics, Second Edition: or Control
1 Wiener, Norbert and €29,70 €29,70
Communication in the Animal and the
Machine

EAN 9781684931156

Verzendkosten: € 0,00
Aantal orderregels: 1 Totaalprijs: € 29,70
NL-BTW 9%: € 2,45

Deze factuur is reeds door u betaald op 06-11-2025.

 """
# Print the dict of fields that smarter_extract finds in the sample text.
print(smarter_extract(ocr_text))


{'factuurnummer': '5147798', 'factuurdatum': '06-11-2025', 'leveringsdatum': None, 'verkoper': 'oekstra\nPostbus 7\n7440 AA NIJVERDAL\nThe Netherlands\nklantenservice@boekstra.nl', 'klant': 'klantenservice@boekstra.nl\nChristiaan van Rhijn\nSint-jacobstraat 109\n3011 DK Rotterdam\n. Cybernetics, Second Edition: or Control\nCommunication in the Animal and the', 'btw_nummer': 'NL815500294B01', 'totaal_bedrag': '29,70'}
