In [3]:
import zipfile
import xml.etree.ElementTree as ET

# --- CONFIG ---
# The two Word documents whose run-level text/font/size are compared below.
# Relative paths: resolved against the notebook's working directory.
file1 = "Zoekwoordenonderzoek.docx"
file2 = "Zoekwoordenonderzoek2.docx"

def get_styles(docx_file):
    """Collect ``(text, font, size)`` for each non-blank run of a .docx file.

    Reads ``word/document.xml`` from the archive and visits every ``w:r``
    element in document order. Runs whose combined ``w:t`` text is empty or
    whitespace-only are skipped.

    Args:
        docx_file: Path or file-like object accepted by ``zipfile.ZipFile``.

    Returns:
        A list of ``(text, font, size)`` tuples. ``font`` is the ``w:ascii``
        attribute of the run's own ``w:rFonts`` and ``size`` is the ``w:val``
        attribute of its own ``w:sz``; each is ``None`` when the run does not
        set it directly.
    """
    # Word XML uses namespaces
    w_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns = {"w": w_uri}

    def direct_attr(run_props, child_tag, attr_name):
        # Namespaced attribute of a child of the run properties, else None.
        if run_props is None:
            return None
        child = run_props.find(child_tag, ns)
        if child is None:
            return None
        return child.attrib.get(f"{{{w_uri}}}{attr_name}")

    with zipfile.ZipFile(docx_file) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    collected = []
    for run_el in root.findall(".//w:r", ns):
        pieces = [t_el.text for t_el in run_el.findall("w:t", ns) if t_el.text]
        run_text = "".join(pieces)
        if run_text.strip() == "":
            continue
        run_props = run_el.find("w:rPr", ns)
        collected.append((
            run_text,
            direct_attr(run_props, "w:rFonts", "ascii"),
            direct_attr(run_props, "w:sz", "val"),
        ))
    return collected

# Extract the (text, font, size) triple of every non-blank run in each file.
styles1 = get_styles(file1)
styles2 = get_styles(file2)

# --- Compare ---
# Runs are paired purely by position; the "Line N" label is that 1-based position.
print("Comparing font type/size differences...\n")
position = 0
for left, right in zip(styles1, styles2):
    position += 1
    text_a, font_a, size_a = left
    text_b, font_b, size_b = right
    if not text_a == text_b:
        print(f"Line {position}: Different text -> '{text_a}' vs '{text_b}'")
    if not font_a == font_b:
        print(f"Line {position}: Different font -> {font_a} vs {font_b}")
    if not size_a == size_b:
        print(f"Line {position}: Different size -> {size_a} vs {size_b}")

# Handle length mismatch: zip() stops at the shorter list, so report the gap.
count1, count2 = len(styles1), len(styles2)
if count1 != count2:
    print(f"Warning: file1 has {count1} text runs, file2 has {count2}")


Comparing font type/size differences...

Line 5: Different font -> Montserrat vs None
Line 6: Different font -> Montserrat vs Arial
Line 7: Different font -> Montserrat vs Arial
Line 8: Different font -> Montserrat vs Arial
Line 9: Different font -> Montserrat vs Arial
Line 10: Different font -> Montserrat vs Arial
Line 11: Different font -> Montserrat vs Arial
Line 12: Different font -> Montserrat vs Arial
Line 13: Different font -> Montserrat vs Arial
Line 14: Different font -> Montserrat vs Arial
Line 15: Different font -> Montserrat vs Arial
Line 16: Different font -> Montserrat vs Arial
Line 17: Different font -> Montserrat vs Arial
Line 18: Different font -> Montserrat vs Arial
Line 19: Different font -> Montserrat vs Arial
Line 20: Different font -> Montserrat vs Arial
Line 21: Different font -> Montserrat vs Arial
Line 22: Different font -> Montserrat vs Arial
Line 23: Different font -> Montserrat vs Arial
Line 24: Different font -> Montserrat vs Arial
Line 25: Different font -

In [6]:
import zipfile
import xml.etree.ElementTree as ET
from docx import Document
from docx.enum.text import WD_COLOR_INDEX

# --- CONFIG ---
# file1 and file2 are the two documents being compared; a copy of file2 with
# the differing runs highlighted is saved to output_file.
# Relative paths: resolved against the notebook's working directory.
file1 = "Zoekwoordenonderzoek.docx"
file2 = "Zoekwoordenonderzoek2.docx"
output_file = "Zoekwoordenonderzoek2_highlighted.docx"

# --- Extract text + font info ---
def get_styles(docx_file):
    """Collect ``(text, font, size)`` for each non-blank run of a .docx file.

    Reads ``word/document.xml`` from the archive and visits every ``w:r``
    element in document order. Runs whose combined ``w:t`` text is empty or
    whitespace-only are skipped.

    Args:
        docx_file: Path or file-like object accepted by ``zipfile.ZipFile``.

    Returns:
        A list of ``(text, font, size)`` tuples. ``font`` is the ``w:ascii``
        attribute of the run's own ``w:rFonts`` and ``size`` is the ``w:val``
        attribute of its own ``w:sz``; each is ``None`` when the run does not
        set it directly.
    """
    w_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns = {"w": w_uri}

    def direct_attr(run_props, child_tag, attr_name):
        # Namespaced attribute of a child of the run properties, else None.
        if run_props is None:
            return None
        child = run_props.find(child_tag, ns)
        if child is None:
            return None
        return child.attrib.get(f"{{{w_uri}}}{attr_name}")

    with zipfile.ZipFile(docx_file) as archive:
        root = ET.fromstring(archive.read("word/document.xml"))

    collected = []
    for run_el in root.findall(".//w:r", ns):
        pieces = [t_el.text for t_el in run_el.findall("w:t", ns) if t_el.text]
        run_text = "".join(pieces)
        if run_text.strip() == "":
            continue
        run_props = run_el.find("w:rPr", ns)
        collected.append((
            run_text,
            direct_attr(run_props, "w:rFonts", "ascii"),
            direct_attr(run_props, "w:sz", "val"),
        ))
    return collected

# --- Extract all runs, including inside tables ---
def get_all_runs(doc):
    """Return a python-docx ``Run`` for every non-blank run, in document order.

    The result is meant to be paired index-by-index with the list produced by
    ``get_styles`` for the same file, so it must enumerate exactly the same
    ``w:r`` elements in exactly the same order. The previous implementation
    listed all body-paragraph runs first and all table runs afterwards, and
    relied on ``paragraph.runs``; whenever a table sits between paragraphs
    that ordering no longer matches the XML scan and the wrong run gets
    highlighted. Walking the XML tree directly, with the same text filter,
    keeps the two lists aligned and still covers tables (including nested
    ones).

    Args:
        doc: A ``docx.Document`` instance.

    Returns:
        list of ``docx.text.run.Run`` proxies, one per ``w:r`` element whose
        combined ``w:t`` text is not empty/whitespace-only. The proxies are
        parented to ``doc`` rather than to their paragraph; formatting
        properties such as ``run.font`` operate on the underlying element.
    """
    # Local import: only this helper needs the Run proxy class.
    from docx.text.run import Run

    w_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    run_tag = f"{{{w_uri}}}r"
    text_tag = f"{{{w_uri}}}t"

    runs = []
    # iter() is a depth-first, document-order walk over all descendants.
    for r_el in doc.element.iter(run_tag):
        # Same filter as get_styles: only direct <w:t> children count as text.
        text = "".join(t.text for t in r_el.findall(text_tag) if t.text)
        if text.strip():
            runs.append(Run(r_el, doc))
    return runs

# --- Compare and highlight differences ---
def highlight_differences(file1, file2, output_file):
    """Save a copy of ``file2`` with every run that differs from ``file1`` highlighted.

    Runs are paired by position: the i-th non-blank run of ``file1`` is
    compared with the i-th non-blank run of ``file2``. A run of ``file2`` is
    highlighted in yellow when its text, font or size differs from its
    counterpart.

    Args:
        file1: Path of the reference .docx file.
        file2: Path of the .docx file that is compared and highlighted.
        output_file: Path the highlighted copy of ``file2`` is written to.

    Returns:
        None. Progress and warnings are printed; the document is saved to
        ``output_file``.
    """
    styles1 = get_styles(file1)
    styles2 = get_styles(file2)

    doc2 = Document(file2)
    runs2 = get_all_runs(doc2)

    print(f"Comparing {len(styles1)} vs {len(styles2)} text runs...")

    # styles2 and runs2 describe the same file through two different readers.
    # If their counts disagree, positional pairing is unreliable; say so
    # instead of silently stopping early.
    if len(runs2) != len(styles2):
        print(
            f"Warning: file2 has {len(styles2)} runs in its XML but "
            f"{len(runs2)} python-docx runs; highlights may be misaligned"
        )

    # zip() stops at the shortest of the three sequences.
    for style1, style2, run in zip(styles1, styles2, runs2):
        # Each style is a (text, font, size) tuple, so one comparison covers all three.
        if style1 != style2:
            run.font.highlight_color = WD_COLOR_INDEX.YELLOW

    if len(styles1) != len(styles2):
        print(f"Warning: file1 has {len(styles1)} runs, file2 has {len(styles2)}")

    doc2.save(output_file)
    print(f"Saved highlighted file: {output_file}")

# --- Run ---
# Compare the two configured documents and write the highlighted copy of
# file2 to output_file.
highlight_differences(file1, file2, output_file)


Comparing 439 vs 439 text runs...
Saved highlighted file: Zoekwoordenonderzoek2_highlighted.docx
