# Init

In [119]:
import fitz
import lorem
import json
import random
import re
import os

# Global PyMuPDF switch, set once when this cell runs.
# NOTE(review): presumably makes reported text bounding boxes hug the glyphs
# more tightly - confirm against the PyMuPDF documentation.
fitz.TOOLS.set_small_glyph_heights(True)

# Color tuple handed to the TextWriter that renders the visible document body.
COLOR_BLACK = (0, 0, 0)
# Font used for the document body.
# NOTE(review): ordering=0 presumably selects one of PyMuPDF's built-in fallback/CJK fonts - confirm.
FONT_TYPE = fitz.Font(ordering=0)
# Font size used for the document body.
FONT_SIZE = 12
# Root output directory (relative to the working directory) for generated test files.
TEST_DOCUMENT_DIR = "../test-document"

# Illusificate PDF

In [120]:
def illusificate_entity_full(page: fitz.Page, clip: fitz.Rect, tw: fitz.TextWriter, curr_page: int):
    """Re-queue a whole text span on ``tw`` and mark its area for redaction.

    The first span of the first line of the first text block found inside
    ``clip`` is taken as the target. Its text is appended to ``tw`` at the
    span's origin (font size 12, font built from the span's font name), a
    redaction annotation is added over ``clip``, and a filled rectangle is
    drawn over ``clip``.

    Args:
        page: Page the span lives on.
        clip: Rectangle used both to extract the span and as the redaction area.
        tw: Text writer that collects the replacement text; the caller writes it.
        curr_page: Page index stored in the returned record.

    Returns:
        A dict with the span's ``bbox`` and ``text`` and the given ``page``.
    """
    extracted = page.get_text("dict", clip=clip)
    target_span = extracted["blocks"][0]["lines"][0]["spans"][0]

    replacement_font = fitz.Font(target_span["font"], ordering=1)
    tw.append(
        target_span["origin"],
        target_span["text"],
        fontsize=12,
        font=replacement_font,
    )

    # The redaction is only registered here; the caller applies it later.
    page.add_redact_annot(clip)

    overlay = page.new_shape()
    overlay.draw_rect(clip)
    overlay.finish(color=(1, 0, 1), fill=(1, 1, 0))
    overlay.commit()

    record = {}
    record["bbox"] = target_span["bbox"]
    record["text"] = target_span["text"]
    record["page"] = curr_page
    return record

def illusificate_entity_char_rand(page: fitz.Page, clip: fitz.Rect, tw: fitz.TextWriter, curr_page: int):
    """Re-queue one randomly chosen character on ``tw`` and mark it for redaction.

    The first span of the first line of the first text block found inside
    ``clip`` is read with per-character detail. One of its characters is
    picked at random, appended to ``tw`` at that character's origin (font
    size 12, font built from the span's font name), and the character's
    bounding box receives a redaction annotation and a filled rectangle.

    Args:
        page: Page the span lives on.
        clip: Rectangle used to extract the span.
        tw: Text writer that collects the replacement text; the caller writes it.
        curr_page: Page index stored in the returned record.

    Returns:
        A dict with the chosen character's ``bbox`` and ``text`` and the given ``page``.
    """
    raw = page.get_text("rawdict", clip=clip)
    target_span = raw["blocks"][0]["lines"][0]["spans"][0]

    # Single-element sample, unpacked directly.
    (picked,) = random.sample(target_span["chars"], 1)
    char_box = picked["bbox"]
    char_text = picked["c"]

    replacement_font = fitz.Font(target_span["font"], ordering=1)
    tw.append(picked["origin"], char_text, fontsize=12, font=replacement_font)

    # The redaction is only registered here; the caller applies it later.
    page.add_redact_annot(char_box)

    overlay = page.new_shape()
    overlay.draw_rect(char_box)
    overlay.finish(color=(1, 0, 1), fill=(1, 1, 0))
    overlay.commit()

    return dict(bbox=char_box, text=char_text, page=curr_page)

def illusificate_doc(doc: fitz.Document, texts: list[str], target_char=False):
    """Illusificate every entry of ``texts`` once somewhere in ``doc``.

    Pages are visited in order. On each page every still-pending text is
    searched; if there is at least one hit, one hit rectangle is chosen at
    random and passed to ``illusificate_entity_char_rand`` (when
    ``target_char`` is truthy) or ``illusificate_entity_full``. Texts with no
    hit on the current page stay pending and are retried on the next page.
    After the searches, the page's redactions are applied, the collected
    replacement text is written with color (1, 1, 1), and the page contents
    are cleaned. Processing stops as soon as nothing is pending, so later
    pages are left untouched.

    Args:
        doc: Document to modify in place.
        texts: Strings to search for; each is handled on the first page
            where it is found.
        target_char: If truthy, hide one random character of the hit
            instead of the whole span.

    Returns:
        List of the record dicts returned by the entity helpers, in
        processing order. Empty when ``texts`` is empty.

    Raises:
        AssertionError: If at least one text was not found on any page.
    """
    if target_char:
        illusificate_entity = illusificate_entity_char_rand
    else:
        illusificate_entity = illusificate_entity_full

    # Copy so the caller's list is never mutated.
    pending = list(texts)
    res = []

    for curr_page, page in enumerate(doc):
        tw = fitz.TextWriter(page.rect, color=(1, 1, 1))

        # Build a fresh list instead of removing from `pending` while
        # iterating it, which would skip elements.
        still_pending = []
        for text in pending:
            rl = page.search_for(text)
            if len(rl) == 0:
                still_pending.append(text)
                continue
            clip = random.sample(rl, 1)[0]
            res.append(illusificate_entity(page, clip, tw, curr_page))
        pending = still_pending

        page.apply_redactions(images=0)
        tw.write_text(page)
        page.clean_contents()

        if not pending:
            break

    # Checked once after all pages: a text missing from one page may still
    # appear on a later one.
    assert len(pending) == 0, f"texts not found on any page: {pending}"

    return res

In [121]:
def generate_illusive_document(filename: str, doc_text: str, illusives: list[str], target_char=False):
    """Create a one-page PDF from ``doc_text``, illusificate it, and save it.

    The text is laid out justified inside the page rectangle shrunk by a
    72-unit margin on every side, using ``FONT_TYPE`` / ``FONT_SIZE`` and
    ``COLOR_BLACK``. The document is then passed to ``illusificate_doc``;
    the returned records are written to ``<filename>.json`` and the
    document is saved as ``<filename>.pdf``.

    Args:
        filename: Output path without extension.
        doc_text: Body text of the document.
        illusives: Strings to illusificate inside the document.
        target_char: Forwarded to ``illusificate_doc``.
    """
    doc = fitz.open()
    try:
        page = doc.new_page()

        page_padding = 72
        page_rect = page.rect
        writer = fitz.TextWriter(page_rect, color=COLOR_BLACK)

        fill_rect = fitz.Rect(
            page_padding,
            page_padding,
            page_rect.width - page_padding,
            page_rect.height - page_padding,
        )

        writer.fill_textbox(
            fill_rect,
            doc_text,
            align=fitz.TEXT_ALIGN_JUSTIFY,
            warn=True,
            fontsize=FONT_SIZE,
            font=FONT_TYPE,
        )
        writer.write_text(page)

        results = illusificate_doc(doc, illusives, target_char)

        json_object = json.dumps(results, indent=2)
        with open(f"{filename}.json", "w") as outfile:
            outfile.write(json_object)

        doc.save(f"{filename}.pdf")
    finally:
        # Release the document even if layout, illusification, or saving fails.
        doc.close()

# Random Text Generation

In [122]:
def generate_document_text(num_paragraph: int = 7):
    """Return ``num_paragraph`` generated lorem paragraphs separated by blank lines.

    Args:
        num_paragraph: Number of paragraphs to generate; values of zero or
            less give an empty string.

    Returns:
        The paragraphs joined with a double newline.
    """
    paragraphs = []
    for _ in range(num_paragraph):
        paragraphs.append(lorem.paragraph())
    return "\n\n".join(paragraphs)

In [132]:
def get_random_sentence(doc_text: str, amount: int):
    """Pick ``amount`` distinct sentences at random from ``doc_text``.

    Runs of newlines are replaced by a single space, then the text is split
    at whitespace that follows ``.`` or ``?`` (except where the lookbehinds
    rule the position out). Duplicate sentences are collapsed before
    sampling. The chosen sentences are also printed.

    Args:
        doc_text: Text to take sentences from.
        amount: Number of sentences to return; must be strictly smaller
            than the number of distinct sentences.

    Returns:
        List of ``amount`` distinct sentences.

    Raises:
        AssertionError: If there are not more distinct sentences than ``amount``.
    """
    sentence_boundary = re.compile(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s")
    single_line = re.sub(r"\n+", " ", doc_text)

    unique_sentences = set(sentence_boundary.split(single_line))
    assert amount < len(unique_sentences)

    picked = random.sample(list(unique_sentences), amount)
    print(picked)
    return picked

In [124]:
def get_random_words(doc_text: str, amount: int):
    """Pick ``amount`` distinct lower-cased words at random from ``doc_text``.

    The lower-cased text is split on ``-``, ``;``, ``,``, ``.`` or any
    whitespace character (each followed by optional whitespace); duplicates
    and the empty token are dropped before sampling.

    Args:
        doc_text: Text to take words from.
        amount: Number of words to return; must be strictly smaller than
            the number of distinct words.

    Returns:
        List of ``amount`` distinct words.

    Raises:
        AssertionError: If there are not more distinct words than ``amount``.
    """
    text_set = set(re.split(r"[-;,.\s]\s*", doc_text.lower()))
    # discard, not remove: the empty token only appears when the text starts
    # or ends with a separator (or has adjacent ones), and remove() would
    # raise KeyError for text without it.
    text_set.discard('')
    assert len(text_set) > amount

    random_choice = random.sample(list(text_set), amount)
    return random_choice

In [125]:
# def get_random_chars(doc_text: str, amount: int):
#     doc_text = re.sub(r"\s|\n+", "", doc_text)
#     text_set = set(doc_text)
    
#     assert(len(text_set) > amount)
    
#     random_choice = random.sample(list(text_set), amount)
#     return random_choice

# Generate Illusive Document By Sentence, Word, and Char

In [126]:
def generate_illusive_docs_sentence(doc_amount: int):
    """Generate test documents whose illusive entities are whole sentences.

    For each document, a fresh random text is generated, 5 random sentences
    are picked from it, and the document is written as ``TEST<n>`` (numbered
    from 1) inside the ``sentence`` sub-directory of ``TEST_DOCUMENT_DIR``,
    which is created if missing.

    Args:
        doc_amount: Number of documents to generate; must not exceed 1.

    Raises:
        AssertionError: If ``doc_amount`` is greater than 1.
    """
    assert doc_amount <= 1

    output_dir = f"{TEST_DOCUMENT_DIR}/sentence"
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(doc_amount):
        doc_number = offset + 1
        body_text = generate_document_text()
        hidden_sentences = get_random_sentence(body_text, 5)
        generate_illusive_document(f"{output_dir}/TEST{doc_number}", body_text, hidden_sentences)

In [127]:
def generate_illusive_docs_word(doc_amount: int):
    """Generate test documents whose illusive entities are whole words.

    For each document, a fresh random text is generated, 20 random words
    are picked from it, and the document is written as ``TEST<n>`` (numbered
    from 1) inside the ``word`` sub-directory of ``TEST_DOCUMENT_DIR``,
    which is created if missing.

    Args:
        doc_amount: Number of documents to generate; must not exceed 1.

    Raises:
        AssertionError: If ``doc_amount`` is greater than 1.
    """
    assert doc_amount <= 1

    output_dir = f"{TEST_DOCUMENT_DIR}/word"
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(doc_amount):
        doc_number = offset + 1
        body_text = generate_document_text()
        hidden_words = get_random_words(body_text, 20)
        generate_illusive_document(f"{output_dir}/TEST{doc_number}", body_text, hidden_words)

In [128]:
def generate_illusive_docs_char(doc_amount: int):
    """Generate test documents whose illusive entities are single characters.

    For each document, a fresh random text is generated and 20 random words
    are picked from it as search targets; the document is then produced with
    character targeting enabled and written as ``TEST<n>`` (numbered from 1)
    inside the ``char`` sub-directory of ``TEST_DOCUMENT_DIR``, which is
    created if missing.

    Args:
        doc_amount: Number of documents to generate; must not exceed 1.

    Raises:
        AssertionError: If ``doc_amount`` is greater than 1.
    """
    assert doc_amount <= 1

    output_dir = f"{TEST_DOCUMENT_DIR}/char"
    os.makedirs(output_dir, exist_ok=True)

    for offset in range(doc_amount):
        doc_number = offset + 1
        body_text = generate_document_text()
        # Words serve as search anchors; the final True switches on
        # character targeting for the document.
        anchor_words = get_random_words(body_text, 20)
        generate_illusive_document(f"{output_dir}/TEST{doc_number}", body_text, anchor_words, True)

# Main Process

In [133]:
# Generate one test document per entity kind. Each call writes TEST1.pdf and
# TEST1.json into its own sub-directory (sentence/, word/, char/) of
# TEST_DOCUMENT_DIR. The sentence run also prints the sentences it picked.
generate_illusive_docs_sentence(1)
generate_illusive_docs_word(1)
generate_illusive_docs_char(1)

['Magnam magnam est quisquam.', 'Consectetur quisquam aliquam est.', 'Ut voluptatem neque voluptatem dolor dolor adipisci.', 'Etincidunt non velit porro.', 'Velit magnam quaerat etincidunt magnam.']
