In [1]:
from docx import Document

# Characters to clean
INVISIBLE_UNICODE = [
    '\u200b', '\u200c', '\u200d', '\ufeff', '\u2060', 
    '\u00a0', '\u202c', '\u202d', '\u202e'
]

def clean_text(text):
    for char in INVISIBLE_UNICODE:
        text = text.replace(char, '')
    return text

def show_unicode_codes(text):
    return ''.join(
        f"[{ord(c):04x}]" if c in INVISIBLE_UNICODE else c
        for c in text
    )

def clean_document(input_path, output_path):
    doc = Document(input_path)

    # Clean paragraph text
    for para in doc.paragraphs:
        if any(c in para.text for c in INVISIBLE_UNICODE):
            print("Before:", show_unicode_codes(para.text))
        cleaned = clean_text(para.text)
        para.text = cleaned
        if cleaned != para.text:
            print("After :", cleaned)
            print("-" * 30)

    # Clean tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                cell.text = clean_text(cell.text)

    # Attempt to clean headers/footers (for watermark removal)
    for section in doc.sections:
        header = section.header
        for para in header.paragraphs:
            para.text = clean_text(para.text)
        footer = section.footer
        for para in footer.paragraphs:
            para.text = clean_text(para.text)

    doc.save(output_path)

# Run on your file
clean_document('test_special_characters.docx', 'r_clean.docx')

Before: Zero-width space: before[200b]after
Before: Zero-width non-joiner: before[200c]after
Before: Zero-width joiner: before[200d]after
Before: BOM (Byte Order Mark): before[feff]after
Before: Word joiner: before[2060]after
Before: Non-breaking space: before[00a0]after
Before: Left-to-right override: before[202d]after
Before: Right-to-left override: before[202e]after
