# Init

In [1]:
import fitz
import lorem
import json
import random
import re
import os

# Global PyMuPDF settings, applied once before any document is processed.
# NOTE(review): presumably makes character bboxes use tighter glyph heights,
# which affects the bboxes that get redacted later — confirm in PyMuPDF docs.
fitz.TOOLS.set_small_glyph_heights(True)
# NOTE(review): a bool is passed here; confirm against the PyMuPDF Tools docs
# whether this setter expects a numeric line width instead.
fitz.TOOLS.set_graphics_min_line_width(True)

# Characters substituted for whitespace in generated text and later detected
# (and labelled "ILLUSIVE") when scanning the document.
UNUSUAL_CHARS = ["-","~","*"]
# Font shared by the original text and the overlay text.
# NOTE(review): ordering=0 presumably selects a built-in CJK font — confirm.
FONT_TYPE = fitz.Font(ordering=0)
# Font size shared by the original text and the overlay text.
FONT_SIZE = 12
# Color of the originally written document text.
COLOR_BLACK = (0, 0, 0)
# Color of the text written in place of redacted unusual characters.
COLOR_WHITE = (1, 1, 1)
# Output directory for the generated PDF/JSON pairs.
TEST_DOCUMENT_DIR = "../test-document-v3"

# Illusificate PDF

In [2]:
from functools import reduce


def illusificate_entity(bbox, origin, text, page: fitz.Page, tw: fitz.TextWriter, curr_page: int):
    """
        Prepare one character for replacement on `page`.

        The character `text` is queued on the text writer `tw` at `origin`
        (using the shared FONT_SIZE / FONT_TYPE), its `bbox` is marked with a
        redaction annotation, and the same `bbox` is outlined on the page.

        Nothing is redacted or written here: the caller is expected to apply
        the redactions and flush `tw` afterwards. `curr_page` is accepted but
        not used by this function.
    """
    # Color (1, 0, 1) is magenta if interpreted as RGB.
    outline_color = (1, 0, 1)

    # Queue the replacement glyph, then flag the original glyph area.
    tw.append(origin, text, fontsize=FONT_SIZE, font=FONT_TYPE)
    page.add_redact_annot(bbox)

    # Outline the flagged area so it is visible on the page.
    outline = page.new_shape()
    outline.draw_rect(bbox)
    outline.finish(color=outline_color)
    outline.commit()

def _get_chars(text_blocks):
    lines = [b['lines'] for b in text_blocks]
    lines = reduce(lambda a,b: a+b, lines)
    spans = [l['spans'] for l in lines]
    spans = reduce(lambda a,b: a+b, spans)
    chars = [s['chars'] for s in spans]
    chars = reduce(lambda a,b: a+b, chars)
    return chars

def illusificate_doc(doc: fitz.Document):
    """
        Replace every unusual character in `doc` and label all characters.

        For each page, every extracted character is recorded in the result.
        Characters found in UNUSUAL_CHARS are additionally handed to
        `illusificate_entity` (queued on a white text writer, marked for
        redaction) and labelled "ILLUSIVE"; all others are labelled
        "NON-ILLUSIVE". After a page has been scanned, its redactions are
        applied and the queued white text is written onto it.

        All pages of the document are processed; an empty document yields
        an empty list.

        res schema:
        [
            {
                "page": 0,
                "origin": (4.2, 4.2),
                "text": "~",
                "label": "ILLUSIVE"
            }
        ]
    """
    res = []
    for curr_page, page in enumerate(doc):
        tw = fitz.TextWriter(page.rect, color=COLOR_WHITE)
        textpage = page.get_textpage()
        chars = _get_chars(textpage.extractRAWDICT()["blocks"])
        for char in chars:
            bbox = char["bbox"]
            origin = char["origin"]
            text = char["c"]
            label = "NON-ILLUSIVE"

            if text in UNUSUAL_CHARS:
                illusificate_entity(bbox, origin, text, page, tw, curr_page)
                label = "ILLUSIVE"

            res.append({
                "page": curr_page,
                "origin": (origin[0], origin[1]),
                "text": text,
                "label": label,
            })

        # Remove the flagged glyphs first, then put the white text in place.
        page.apply_redactions(images=0)
        tw.write_text(page)

    # Return only after every page was handled (not from inside the loop).
    return res

In [3]:
def generate_illusive_document(filename: str, doc_text: str):
    """
        Create one illusive test document and its label file.

        A new single-page PDF is created, `doc_text` is written in black into
        a text box inset by 72 units from every page edge, and the document is
        then passed through `illusificate_doc`. The resulting character labels
        are stored in "<filename>.json" and the PDF in "<filename>.pdf".

        `filename` is the output path without extension. The document is
        always closed, even if writing or saving fails.
    """
    doc = fitz.open()
    try:
        page = doc.new_page()

        page_padding = 72
        page_rect = page.rect
        writer = fitz.TextWriter(page_rect, color=COLOR_BLACK)

        fill_rect = fitz.Rect(
            page_padding,
            page_padding,
            page_rect.width - page_padding,
            page_rect.height - page_padding,
        )

        writer.fill_textbox(
            fill_rect,
            doc_text,
            align=fitz.TEXT_ALIGN_LEFT,
            warn=True,
            fontsize=FONT_SIZE,
            font=FONT_TYPE,
        )
        writer.write_text(page, overlay=0)

        results = illusificate_doc(doc)

        # Serialize first so a serialization error leaves no partial file.
        json_object = json.dumps(results, indent=2)
        with open(f"{filename}.json", "w", encoding="utf-8") as outfile:
            outfile.write(json_object)

        doc.save(f"{filename}.pdf")
    finally:
        # Release the document's resources; this function is called in a loop.
        doc.close()

# Random Text Generation

In [4]:
def generate_document_text(num_paragraph: int = 4):
    """
        Build random document text from `num_paragraph` paragraphs produced
        by `lorem.paragraph()`, separated by a blank line.
    """
    paragraph_separator = '\n\n'
    return paragraph_separator.join(
        lorem.paragraph() for _ in range(num_paragraph)
    )

# Generate Illusive Document

In [5]:
def random_unusual_char(match):
    """
        Replacement callback for `re.sub`: ignores `match` and returns a
        randomly chosen entry of UNUSUAL_CHARS.
    """
    replacement = random.choice(UNUSUAL_CHARS)
    return replacement

def generate_illusive_docs(doc_amount: int):
    """
        Generate `doc_amount` illusive documents named TEST1..TEST<doc_amount>
        inside TEST_DOCUMENT_DIR (created if missing).

        For each document, random text is generated and every whitespace
        character other than carriage return / line feed is replaced by a
        random unusual character before the document is written.

        `doc_amount` must be at least 1 (checked with an assertion).
    """
    assert doc_amount >= 1
    os.makedirs(TEST_DOCUMENT_DIR, exist_ok=True)

    last_number = doc_amount + 1
    for doc_number in range(1, last_number):
        plain_text = generate_document_text()
        # Matches whitespace except \r and \n, so line breaks survive.
        illusive_text = re.sub(r"[^\S\r\n]", random_unusual_char, plain_text)
        target = f"{TEST_DOCUMENT_DIR}/TEST{doc_number}"
        generate_illusive_document(target, illusive_text)

# Main Process

In [6]:
# Generate 100 illusive test documents (one PDF and one JSON label file each)
# under TEST_DOCUMENT_DIR.
generate_illusive_docs(100)