In [3]:
import zipfile
import xml.etree.ElementTree as ET

# --- CONFIG ---
# Paths of the two Word documents that are passed to get_styles() further
# down in this cell; both are relative paths.
file1 = "Zoekwoordenonderzoek.docx"
file2 = "Zoekwoordenonderzoek2.docx"

def get_styles(docx_file):
    """Collect the text and directly-applied font settings of every run.

    Opens ``docx_file`` as a zip archive, parses ``word/document.xml`` and
    visits each ``w:r`` element below the root.  Runs whose combined ``w:t``
    text is empty or whitespace-only are skipped.

    Returns:
        A list of ``(text, font, size)`` tuples.  ``font`` is the ``w:ascii``
        attribute of the run's own ``w:rFonts`` and ``size`` the ``w:val``
        attribute of its own ``w:sz`` (both strings); each is ``None`` when
        the run does not carry that element or attribute itself.
    """
    # Clark-notation prefix for the WordprocessingML namespace.
    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    def attribute_of(parent, child_tag, attr_name):
        # Value of parent/child@attr, or None if any link in the chain is absent.
        if parent is None:
            return None
        child = parent.find(w + child_tag)
        return None if child is None else child.get(w + attr_name)

    with zipfile.ZipFile(docx_file) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    collected = []
    for run_el in root.findall(".//" + w + "r"):
        # Only direct <w:t> children contribute to the run's text.
        pieces = [node.text for node in run_el.findall(w + "t") if node.text]
        text = "".join(pieces)
        if text.strip() == "":
            continue
        props = run_el.find(w + "rPr")
        collected.append((
            text,
            attribute_of(props, "rFonts", "ascii"),
            attribute_of(props, "sz", "val"),
        ))
    return collected

# Extract (text, font, size) tuples for every non-blank run of each document.
styles1 = get_styles(file1)
styles2 = get_styles(file2)

# --- Compare ---
# Runs are paired purely by position. zip() stops at the shorter list, so any
# surplus runs are only reported through the length warning further down.
print("Comparing font type/size differences...\n")
for i, (s1, s2) in enumerate(zip(styles1, styles2), start=1):
    text1, font1, size1 = s1
    text2, font2, size2 = s2
    # "Line {i}" is the 1-based index of the run pair, not a line of text.
    if text1 != text2:
        print(f"Line {i}: Different text -> '{text1}' vs '{text2}'")
    if font1 != font2:
        print(f"Line {i}: Different font -> {font1} vs {font2}")
    if size1 != size2:
        print(f"Line {i}: Different size -> {size1} vs {size2}")

# Handle length mismatch
if len(styles1) != len(styles2):
    print(f"Warning: file1 has {len(styles1)} text runs, file2 has {len(styles2)}")


Comparing font type/size differences...

Line 5: Different font -> Montserrat vs None
Line 6: Different font -> Montserrat vs Arial
Line 7: Different font -> Montserrat vs Arial
Line 8: Different font -> Montserrat vs Arial
Line 9: Different font -> Montserrat vs Arial
Line 10: Different font -> Montserrat vs Arial
Line 11: Different font -> Montserrat vs Arial
Line 12: Different font -> Montserrat vs Arial
Line 13: Different font -> Montserrat vs Arial
Line 14: Different font -> Montserrat vs Arial
Line 15: Different font -> Montserrat vs Arial
Line 16: Different font -> Montserrat vs Arial
Line 17: Different font -> Montserrat vs Arial
Line 18: Different font -> Montserrat vs Arial
Line 19: Different font -> Montserrat vs Arial
Line 20: Different font -> Montserrat vs Arial
Line 21: Different font -> Montserrat vs Arial
Line 22: Different font -> Montserrat vs Arial
Line 23: Different font -> Montserrat vs Arial
Line 24: Different font -> Montserrat vs Arial
Line 25: Different font -

In [8]:
import zipfile
import xml.etree.ElementTree as ET
from docx import Document
from docx.enum.text import WD_COLOR_INDEX

# --- CONFIG ---
# file1 / file2 are the documents compared by highlight_differences();
# output_file is where the highlighted copy of file2 is saved.
file1 = "Zoekwoordenonderzoek.docx"
file2 = "Zoekwoordenonderzoek2.docx"
output_file = "Zoekwoordenonderzoek2_highlighted.docx"

# --- Extract text + font info ---
def get_styles(docx_file):
    """Return ``(text, font, size)`` for every non-blank run in a .docx file.

    The archive member ``word/document.xml`` is parsed and every ``w:r``
    element below the root is inspected.  A run's text is the concatenation
    of its direct ``w:t`` children; runs whose text is empty or only
    whitespace are left out.

    ``font`` is the ``w:ascii`` attribute of the run's own ``w:rFonts`` and
    ``size`` the ``w:val`` attribute of its own ``w:sz``.  Either value is
    ``None`` when the run itself does not specify it.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    prefixes = {"w": w_ns}
    ascii_key = "{%s}ascii" % w_ns
    val_key = "{%s}val" % w_ns

    with zipfile.ZipFile(docx_file) as package:
        document_root = ET.fromstring(package.read("word/document.xml"))

    result = []
    for run_node in document_root.findall(".//w:r", prefixes):
        run_text = "".join(
            t.text for t in run_node.findall("w:t", prefixes) if t.text
        )
        if not run_text.strip():
            continue

        font_name = font_size = None
        run_props = run_node.find("w:rPr", prefixes)
        if run_props is not None:
            fonts_node = run_props.find("w:rFonts", prefixes)
            size_node = run_props.find("w:sz", prefixes)
            if fonts_node is not None:
                font_name = fonts_node.get(ascii_key)
            if size_node is not None:
                font_size = size_node.get(val_key)

        result.append((run_text, font_name, font_size))
    return result

# --- Extract all runs, including inside tables ---
def get_all_runs(doc):
    """Return every run with non-blank text from the body and from tables.

    The paragraphs in ``doc.paragraphs`` are visited first, then
    ``doc.tables`` row by row and cell by cell, descending into tables nested
    inside cells.  The returned list therefore follows that visiting order.

    python-docx reports a merged cell once for every grid position it
    covers, so ``row.cells`` can yield the same underlying cell several
    times.  Each underlying ``w:tc`` element is processed only once so the
    runs of merged cells are not collected repeatedly.

    Args:
        doc: a python-docx ``Document`` (or any object exposing the same
            ``paragraphs`` / ``tables`` / ``rows`` / ``cells`` / ``runs``
            attributes).

    Returns:
        list of run objects whose ``text`` is not empty or whitespace-only.
    """
    runs = []
    # Underlying <w:tc> elements already handled; holding them in the set also
    # keeps the element objects alive so identity comparison stays valid.
    seen_cells = set()

    def collect_runs_from_paragraphs(paragraphs):
        for para in paragraphs:
            runs.extend(run for run in para.runs if run.text.strip())

    def collect_runs_from_tables(tables):
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if tc in seen_cells:
                        continue  # same merged cell seen at another grid position
                    seen_cells.add(tc)
                    collect_runs_from_paragraphs(cell.paragraphs)
                    collect_runs_from_tables(cell.tables)  # handle nested tables

    collect_runs_from_paragraphs(doc.paragraphs)
    collect_runs_from_tables(doc.tables)
    return runs

# --- Compare and highlight differences ---
def highlight_differences(file1, file2, output_file):
    """Highlight in yellow every run of ``file2`` that differs from ``file1``.

    Runs are paired by position using the ``(text, font, size)`` tuples from
    ``get_styles()``.  A run of ``file2`` is highlighted when its tuple
    differs from the tuple at the same position in ``file1``.  The result is
    saved to ``output_file``; ``file2`` itself is not modified.

    The run elements to highlight are collected from the loaded document with
    the same traversal and the same "has non-blank text" filter that
    ``get_styles()`` applies to ``word/document.xml``.  This keeps
    ``styles2[i]`` and the i-th run element referring to the same run.
    Collecting runs as "all body paragraphs, then all tables" instead would
    give a different order as soon as a table sits between paragraphs, and
    would miss runs nested in elements such as hyperlinks, so highlights
    would land on the wrong text.

    Args:
        file1: path of the reference .docx.
        file2: path of the .docx to check and highlight.
        output_file: path the highlighted copy of ``file2`` is written to.
    """
    # Local import: only needed here, and the docx package is already a
    # dependency of this cell.
    from docx.text.font import Font

    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    styles1 = get_styles(file1)
    styles2 = get_styles(file2)

    doc2 = Document(file2)
    # Same selection rule as get_styles(): every <w:r> below the document
    # root whose direct <w:t> children contain non-blank text.
    run_elements = [
        r_el
        for r_el in doc2.element.iter(w + "r")
        if "".join(t.text for t in r_el.findall(w + "t") if t.text).strip()
    ]

    print(f"Comparing {len(styles1)} vs {len(styles2)} text runs...")

    # zip() stops at the shortest sequence, so surplus runs on either side
    # are left untouched and only reported by the warning below.
    for s1, s2, r_el in zip(styles1, styles2, run_elements):
        if s1 != s2:  # text, font or size differs
            Font(r_el).highlight_color = WD_COLOR_INDEX.YELLOW

    if len(styles1) != len(styles2):
        print(f"Warning: file1 has {len(styles1)} runs, file2 has {len(styles2)}")

    doc2.save(output_file)
    print(f"Saved highlighted file: {output_file}")

# --- Run ---
# Compare file1 with file2 and save a highlighted copy of file2 as output_file.
highlight_differences(file1, file2, output_file)


Comparing 439 vs 439 text runs...
Saved highlighted file: Zoekwoordenonderzoek2_highlighted.docx


In [7]:
import json
from docx import Document

# --- CONFIG ---
# target_font: the font name every run is compared against.
# file_path:   the document that is checked.
# output_json: file the list of mismatches is written to.
target_font = "Montserrat"
file_path = "Zoekwoordenonderzoek2.docx"
output_json = "font_mismatches.json"

# --- Extract all runs (including inside tables) ---
def get_all_runs(doc):
    """Return ``(paragraph_number, run)`` for every run with non-blank text.

    Paragraphs are numbered with a single running counter in visiting order:
    first everything in ``doc.paragraphs``, then the paragraphs of each table
    cell (row by row, descending into nested tables).  The counter also
    advances for paragraphs that contribute no runs.

    python-docx reports a merged cell once for every grid position it
    covers, so ``row.cells`` can yield the same underlying cell several
    times.  Each underlying ``w:tc`` element is processed only once;
    otherwise the runs of merged cells would be reported repeatedly and the
    paragraph counter would be inflated.

    Args:
        doc: a python-docx ``Document`` (or an object with the same
            ``paragraphs`` / ``tables`` / ``rows`` / ``cells`` / ``runs``
            attributes).

    Returns:
        list of ``(line_counter, run)`` tuples.
    """
    runs = []
    line_counter = 0
    # Underlying <w:tc> elements already handled; holding them in the set also
    # keeps the element objects alive so identity comparison stays valid.
    seen_cells = set()

    def collect_runs_from_paragraphs(paragraphs):
        nonlocal line_counter
        for para in paragraphs:
            line_counter += 1
            for run in para.runs:
                if run.text.strip():
                    runs.append((line_counter, run))

    def collect_runs_from_tables(tables):
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if tc in seen_cells:
                        continue  # same merged cell seen at another grid position
                    seen_cells.add(tc)
                    collect_runs_from_paragraphs(cell.paragraphs)
                    collect_runs_from_tables(cell.tables)

    collect_runs_from_paragraphs(doc.paragraphs)
    collect_runs_from_tables(doc.tables)
    return runs

# --- Extract font name from run ---
def get_font_from_run(run):
    """Return the font name applied directly to ``run``, or ``None``.

    ``run.font.name`` is used when it is set.  python-docx derives that value
    from the ``w:ascii`` attribute of the run's ``w:rFonts`` element, so a
    fallback that reads ``w:ascii`` again can never find anything new.  The
    fallback therefore looks at the other font slots of the run's own
    ``w:rFonts`` (``w:hAnsi``, ``w:cs``, ``w:eastAsia``), which can be set
    while ``w:ascii`` is not.

    Only the run's direct ``w:rPr`` child is consulted; run properties that
    belong to content nested inside the run are ignored.  Fonts inherited
    from a style or the document defaults are not resolved.

    Args:
        run: an object with ``font.name`` and ``_element`` (the ``w:r`` XML
            element), such as a python-docx ``Run``.

    Returns:
        The font name as a string, or ``None`` when the run does not specify
        a font itself.
    """
    direct_name = run.font.name
    if direct_name:
        return direct_name

    # Fallback: inspect the run's own XML properties.
    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    rPr = run._element.find(w + "rPr")
    if rPr is None:
        return None
    rFonts = rPr.find(w + "rFonts")
    if rFonts is None:
        return None

    # ascii is re-checked first for objects whose font.name is not backed by
    # the XML; the remaining slots are the actual extension.
    for slot in ("ascii", "hAnsi", "cs", "eastAsia"):
        name = rFonts.get(w + slot)
        if name:
            return name
    return None

# --- Check font consistency ---
def check_font_consistency(docx_file, target_font):
    """List every run of ``docx_file`` whose detected font is not ``target_font``.

    All non-blank runs are gathered with ``get_all_runs()`` and their font is
    looked up with ``get_font_from_run()``.  Runs for which no font is
    detected are skipped rather than reported.  Two summary lines are printed.

    Args:
        docx_file: path of the document to check.
        target_font: the font name every run is expected to use.

    Returns:
        list of dicts with the keys ``line`` (paragraph counter from
        ``get_all_runs()``), ``text`` (stripped run text) and ``reason``.
    """
    runs = get_all_runs(Document(docx_file))

    mismatches = []
    for line_num, run in runs:
        font_name = get_font_from_run(run)
        # Report only runs that have a detected font and it is the wrong one.
        if font_name and font_name != target_font:
            mismatches.append({
                "line": line_num,
                "text": run.text.strip(),
                "reason": f"Font is '{font_name}' instead of '{target_font}'"
            })

    print(f"Checked {len(runs)} text runs.")
    print(f"Found {len(mismatches)} font mismatches (not '{target_font}').")

    return mismatches

# --- Run check and export to JSON ---
mismatches = check_font_consistency(file_path, target_font)

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(mismatches, f, ensure_ascii=False, indent=4)

print(f"Results saved to {output_json}")


Checked 439 text runs.
Found 434 font mismatches (not 'Montserrat').
Results saved to font_mismatches.json


In [17]:
import json
from docx import Document

def get_all_runs(doc):
    """Return every non-blank run together with its paragraph number and section.

    Paragraphs are numbered with one running counter in visiting order:
    first everything in ``doc.paragraphs``, then the paragraphs of each table
    cell (row by row, descending into nested tables).

    The "section" of a run is the text of the most recently visited paragraph
    whose style name starts with ``"Heading"``.  Until such a heading has
    been seen, the first paragraph with non-blank text serves as the section.

    python-docx reports a merged cell once for every grid position it
    covers, so ``row.cells`` can yield the same underlying cell several
    times.  Each underlying ``w:tc`` element is processed only once;
    otherwise the runs of merged cells would be reported repeatedly and the
    paragraph counter would be inflated.

    Args:
        doc: a python-docx ``Document`` (or an object exposing the same
            attributes).

    Returns:
        list of dicts with the keys ``paragraph``, ``section`` and ``run``.
    """
    runs = []
    para_counter = 0
    current_section = None  # no default section yet
    # Underlying <w:tc> elements already handled; holding them in the set also
    # keeps the element objects alive so identity comparison stays valid.
    seen_cells = set()

    def collect_runs_from_paragraphs(paragraphs):
        nonlocal para_counter, current_section
        for para in paragraphs:
            para_counter += 1
            style_name = para.style.name if para.style else ""
            para_text = para.text.strip()

            if style_name.startswith("Heading"):
                # A heading always starts a new section, even if it is empty.
                current_section = para_text
            elif current_section is None and para_text:
                # No heading seen yet: the first paragraph with text names the section.
                current_section = para_text

            for run in para.runs:
                if run.text.strip():
                    runs.append({
                        "paragraph": para_counter,
                        "section": current_section,
                        "run": run
                    })

    def collect_runs_from_tables(tables):
        for table in tables:
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if tc in seen_cells:
                        continue  # same merged cell seen at another grid position
                    seen_cells.add(tc)
                    collect_runs_from_paragraphs(cell.paragraphs)
                    collect_runs_from_tables(cell.tables)

    collect_runs_from_paragraphs(doc.paragraphs)
    collect_runs_from_tables(doc.tables)
    return runs



# --- Extract font name from run ---
def get_font_from_run(run):
    """Return the font name applied directly to ``run``, or ``None``.

    ``run.font.name`` is used when it is set.  python-docx derives that value
    from the ``w:ascii`` attribute of the run's ``w:rFonts`` element, so a
    fallback that reads ``w:ascii`` again can never find anything new.  The
    fallback therefore looks at the other font slots of the run's own
    ``w:rFonts`` (``w:hAnsi``, ``w:cs``, ``w:eastAsia``), which can be set
    while ``w:ascii`` is not.

    Only the run's direct ``w:rPr`` child is consulted; run properties that
    belong to content nested inside the run are ignored.  Fonts inherited
    from a style or the document defaults are not resolved.

    Args:
        run: an object with ``font.name`` and ``_element`` (the ``w:r`` XML
            element), such as a python-docx ``Run``.

    Returns:
        The font name as a string, or ``None`` when the run does not specify
        a font itself.
    """
    direct_name = run.font.name
    if direct_name:
        return direct_name

    # Fallback: inspect the run's own XML properties.
    w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    rPr = run._element.find(w + "rPr")
    if rPr is None:
        return None
    rFonts = rPr.find(w + "rFonts")
    if rFonts is None:
        return None

    # ascii is re-checked first for objects whose font.name is not backed by
    # the XML; the remaining slots are the actual extension.
    for slot in ("ascii", "hAnsi", "cs", "eastAsia"):
        name = rFonts.get(w + slot)
        if name:
            return name
    return None

# --- Check font consistency ---
def check_font_consistency(docx_file, target_font):
    """List every run of ``docx_file`` whose detected font is not ``target_font``.

    Run entries come from ``get_all_runs()``; the font of each run is looked
    up with ``get_font_from_run()``.  Runs for which no font is detected are
    skipped rather than reported.  Two summary lines are printed.

    Args:
        docx_file: path of the document to check.
        target_font: the font name every run is expected to use.

    Returns:
        list of dicts with the keys ``paragraph``, ``section``, ``text``
        (stripped run text) and ``reason``.
    """
    run_entries = get_all_runs(Document(docx_file))

    mismatches = []
    for entry in run_entries:
        run = entry["run"]
        font_name = get_font_from_run(run)
        # Nothing to report when no font was detected or it already matches.
        if not font_name or font_name == target_font:
            continue
        mismatches.append({
            "paragraph": entry["paragraph"],
            "section": entry["section"],
            "text": run.text.strip(),
            "reason": f"Font is '{font_name}' instead of '{target_font}'"
        })

    print(f"Checked {len(run_entries)} text runs.")
    print(f"Found {len(mismatches)} font mismatches (not '{target_font}').")

    return mismatches

# --- Run check and export to JSON ---
# NOTE: file_path, target_font and output_json are not defined in this cell;
# they must already exist in the kernel (an earlier cell's CONFIG block
# defines names like these), so this cell fails on a fresh kernel if run alone.
mismatches = check_font_consistency(file_path, target_font)

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes.
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(mismatches, f, ensure_ascii=False, indent=4)

print(f"Results saved to {output_json}")


Checked 439 text runs.
Found 434 font mismatches (not 'Montserrat').
Results saved to font_mismatches.json


In [10]:
import json
from reportlab.lib.pagesizes import A4
from reportlab.lib import colors
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet

# ---------- 1. Load JSON mismatches (only first mismatch per section) ----------
def load_mismatches_from_json(json_file):
    """Load font mismatches and convert the first one per section to report rows.

    ``json_file`` must contain a JSON list of objects.  For each distinct
    ``section`` value (``"Geen sectie"`` when the key is missing) only the
    first entry is kept.  The found and expected font are parsed out of the
    ``reason`` text; when the text does not contain ``"instead of"`` they
    default to ``"Onbekend"`` and ``"Montserrat"``.

    Each entry must have the keys ``paragraph`` and ``text``.

    Returns:
        list of dicts with the keys ``element``, ``type``, ``verwacht``,
        ``gevonden``, ``pagina``, ``ernst``, ``text`` and ``section``.
    """
    fallback_font = "Montserrat"

    def parse_reason(reason):
        # Returns (found_font, expected_font) extracted from the reason text.
        if "instead of" not in reason:
            return "Onbekend", fallback_font
        cleaned = reason.replace("Font is '", "").replace("'", "")
        pieces = cleaned.split(" instead of ")
        expected = pieces[1].strip() if len(pieces) > 1 else fallback_font
        return pieces[0].strip(), expected

    with open(json_file, "r", encoding="utf-8") as handle:
        records = json.load(handle)

    handled_sections = set()
    report_rows = []
    for record in records:
        section = record.get("section", "Geen sectie")
        if section in handled_sections:
            continue  # only the first mismatch of a section is reported
        handled_sections.add(section)

        gevonden, verwacht = parse_reason(record.get("reason", ""))
        report_rows.append({
            "element": f"Paragraaf {record['paragraph']}",
            "type": "lettertype",
            "verwacht": verwacht,
            "gevonden": gevonden,
            "pagina": 1,  # fixed placeholder; the page is not derived from the data
            "ernst": "laag",
            "text": record["text"],
            "section": section
        })

    return report_rows


# ---------- 2. Console rapport ----------
def generate_console_report(data):
    """Print the report rows as an aligned, colour-coded table on stdout.

    Each row is wrapped in an ANSI colour escape chosen by its ``ernst``
    (severity) value: ``laag`` green, ``gemiddeld`` yellow, ``hoog`` red.
    Rows with any other severity are printed without a colour instead of
    raising ``KeyError``.

    The "Pagina" column is 8 characters wide.  With a width of 6 the
    six-letter header filled the column completely and ran into the next
    one ("PaginaErnst").

    Args:
        data: iterable of dicts with the keys ``element``, ``type``,
            ``verwacht``, ``gevonden``, ``pagina`` and ``ernst``.
    """
    print("Rapport van afwijkingen:")
    kleurcodes = {"laag": "\033[92m", "gemiddeld": "\033[93m", "hoog": "\033[91m"}
    reset = "\033[0m"

    # One shared layout for header and rows so the columns cannot drift apart.
    row_format = "{:<15}{:<12}{:<12}{:<12}{:<8}{}"
    print(row_format.format("Element", "Type", "Verwacht", "Gevonden", "Pagina", "Ernst"))

    for item in data:
        kleur = kleurcodes.get(item['ernst'], "")
        row = row_format.format(
            item['element'], item['type'], item['verwacht'],
            item['gevonden'], item['pagina'], item['ernst'],
        )
        print(f"{kleur}{row}{reset}")

# ---------- 3. PDF rapport ----------
def generate_pdf_report(data, pdf_file="rapport_fontcheck.pdf", logo_path=None):
    """Write the report rows to an A4 PDF with a colour-coded table.

    The document contains, in order: an optional right-aligned logo, a title,
    one table row per item (header row repeated on every page), and a legend.
    Row background follows the ``ernst`` (severity) value: ``hoog`` salmon,
    ``gemiddeld`` yellow, anything else light green.

    Args:
        data: list of dicts with the keys ``element``, ``type``,
            ``verwacht``, ``gevonden``, ``pagina`` and ``ernst``.
        pdf_file: path of the PDF to create.
        logo_path: optional image to place above the title.  If it cannot be
            loaded a message is printed and the report is built without it.
    """
    doc = SimpleDocTemplate(pdf_file, pagesize=A4)
    elements = []
    styles = getSampleStyleSheet()

    # Optional logo
    if logo_path:
        try:
            # lazy=0 makes ReportLab read the image here.  With the default
            # lazy loading a missing or unreadable file would only fail later
            # inside doc.build(), outside this try block.
            img = Image(logo_path, width=100, height=50, lazy=0)
            img.hAlign = 'RIGHT'
            elements.append(img)
            elements.append(Spacer(1, 12))
        except Exception as e:
            print(f"Kon logo niet laden: {e}")

    # Title
    title = Paragraph("Laméco Documentstijl Controle Rapport", styles['Title'])
    elements.append(title)
    elements.append(Spacer(1, 12))

    # Table data: header row followed by one row per item
    table_data = [["Element", "Type", "Verwacht", "Gevonden", "Pagina", "Ernst"]]
    for item in data:
        table_data.append([item['element'], item['type'], item['verwacht'], item['gevonden'], str(item['pagina']), item['ernst']])

    table = Table(table_data, repeatRows=1)
    style = TableStyle([
        ('BACKGROUND', (0,0), (-1,0), colors.lightgrey),
        ('GRID', (0,0), (-1,-1), 1, colors.black),
        ('ALIGN', (4,1), (4,-1), 'CENTER')
    ])

    # Colour coding per severity; row 0 is the header, so data rows start at 1.
    severity_colors = {'hoog': colors.salmon, 'gemiddeld': colors.yellow}
    for i, item in enumerate(data, start=1):
        background = severity_colors.get(item['ernst'], colors.lightgreen)
        style.add('BACKGROUND', (0,i), (-1,i), background)

    table.setStyle(style)
    elements.append(table)
    elements.append(Spacer(1, 12))

    # Legend
    legend_text = "Legenda: Hoog = rood, Gemiddeld = geel, Laag = groen"
    elements.append(Paragraph(legend_text, styles['Normal']))

    doc.build(elements)
    print(f"\nPDF-rapport gegenereerd: {pdf_file}")

# ---------- 4. Main ----------
if __name__ == "__main__":
    # Input: the JSON list of mismatches; load_mismatches_from_json() reads the
    # "paragraph", "section", "text" and "reason" keys of each entry.
    json_file = "font_mismatches.json"  # Output from your font check script
    data = load_mismatches_from_json(json_file)

    # Same rows go to the console and to the PDF.
    generate_console_report(data)
    generate_pdf_report(data, pdf_file="rapport_fontcheck.pdf", logo_path="laméco_logo.png")


Rapport van afwijkingen:
Element        Type        Verwacht    Gevonden    PaginaErnst
[92mParagraaf 13   lettertype  Montserrat  Arial       1     laag[0m

PDF-rapport gegenereerd: rapport_fontcheck.pdf
