In [11]:
import lorem
from docx import Document

In [12]:
def generate_text(num_paragraph=4):
    """Collect ``num_paragraph`` paragraphs, each produced by ``lorem.paragraph()``.

    Args:
        num_paragraph: How many paragraphs to generate. Zero or a negative
            value yields an empty list.

    Returns:
        A list holding one ``lorem.paragraph()`` result per requested paragraph.
    """
    paragraphs = []
    for _ in range(num_paragraph):
        paragraphs.append(lorem.paragraph())
    return paragraphs

In [13]:
import os
from random import randint
from docx.shared import RGBColor
from docx2pdf import convert

# Maximum number of paragraphs generate_docx groups into one page chunk;
# a page break is inserted between consecutive chunks.
MAX_PARAGRAPH_IN_PAGE = 7
# Font colors applied per character run by generate_docx:
# white for characters labelled illusive, black for all others.
COLOR_WHITE = RGBColor(0xFF, 0xFF, 0xFF)
COLOR_BLACK = RGBColor(0x00, 0x00, 0x00)

# Label strings stored with each character in the "chars_data" list built by generate_docx.
LABEL_ILLUSIVE = "ILLUSIVE"
LABEL_NON_ILLUSIVE = "NON-ILLUSIVE"

def generate_docx():
    """Create a random Word document with hidden characters plus per-character labels.

    Between 1 and 20 paragraphs are requested from ``generate_text`` and grouped
    into pages of at most ``MAX_PARAGRAPH_IN_PAGE`` paragraphs. Every character
    becomes its own run. A space is always coloured ``COLOR_BLACK``; any other
    character is coloured ``COLOR_WHITE`` (labelled ``LABEL_ILLUSIVE``) or
    ``COLOR_BLACK`` (labelled ``LABEL_NON_ILLUSIVE``) according to
    ``randint(0, 1)``.

    Besides the real characters, the label list receives one extra
    non-illusive ``" "`` entry after each paragraph and two more after each
    page break (no page break follows the final page). These extra entries
    are included in the totals.

    Returns:
        A ``(doc, res)`` tuple: ``doc`` is the object created by ``Document()``
        and ``res`` is a dict shaped like::

            {
                "total_illusive": 100,
                "total_non_illusive": 100,
                "total_characters": 200,
                "chars_data": [
                    {"char": "a", "label": "ILLUSIVE"},
                    {"char": "b", "label": "NON-ILLUSIVE"},
                    ...
                ]
            }
    """

    def spacer():
        # Build a fresh dict on every call so list entries never alias each other.
        return {"char": " ", "label": LABEL_NON_ILLUSIVE}

    paragraphs = generate_text(randint(1, 20))
    pages = [
        paragraphs[start:start + MAX_PARAGRAPH_IN_PAGE]
        for start in range(0, len(paragraphs), MAX_PARAGRAPH_IN_PAGE)
    ]
    last_page = len(pages) - 1

    document = Document()
    records = []
    hidden_count = 0

    for page_no, page in enumerate(pages):
        for text in page:
            para = document.add_paragraph()

            for ch in text:
                # Spaces are never hidden (and consume no random draw);
                # every other character is decided by a coin flip.
                hidden = ch != " " and randint(0, 1)
                if hidden:
                    hidden_count += 1
                    shade, tag = COLOR_WHITE, LABEL_ILLUSIVE
                else:
                    shade, tag = COLOR_BLACK, LABEL_NON_ILLUSIVE

                run = para.add_run(ch)
                run.font.color.rgb = shade
                records.append({"char": ch, "label": tag})

            # One placeholder entry for the line break that ends the paragraph.
            records.append(spacer())

        if page_no < last_page:
            document.add_page_break()
            # Two placeholder entries accompany every page break.
            records.extend((spacer(), spacer()))

    # Every record is either illusive or non-illusive, so the second count
    # follows from the first.
    summary = {
        "total_illusive": hidden_count,
        "total_non_illusive": len(records) - hidden_count,
        "total_characters": len(records),
        "chars_data": records,
    }

    return document, summary

In [14]:
import json


# Directory that receives the generated .docx files (see generate_test_data).
DOCX_DIR = "../generated-docx"
# Directory that receives the converted .pdf files and the .json label files.
TEST_DIR = "../test-data"

def generate_test_data(amount=1):
    """Write ``amount`` numbered test cases to disk.

    For each number ``i`` from 1 to ``amount`` a document from
    ``generate_docx`` is saved as ``DOCX_DIR/TEST_i.docx``, converted to
    ``TEST_DIR/TEST_i.pdf`` via ``convert``, and its label dict is written as
    indented JSON to ``TEST_DIR/TEST_i.json``. Both directories are created
    first if they do not exist.

    Args:
        amount: Number of test cases to generate; values below 1 create only
            the directories.
    """
    for folder in (DOCX_DIR, TEST_DIR):
        os.makedirs(folder, exist_ok=True)

    for number in range(1, amount + 1):
        stem = f"TEST_{number}"

        docx_path = f"{DOCX_DIR}/{stem}.docx"
        json_path = f"{TEST_DIR}/{stem}.json"
        pdf_path = f"{TEST_DIR}/{stem}.pdf"

        document, summary = generate_docx()
        document.save(docx_path)
        convert(docx_path, pdf_path)

        # Serialize before opening the file so the target is only touched
        # once the JSON text is ready.
        serialized = json.dumps(summary, indent=2)
        with open(json_path, "w") as sink:
            sink.write(serialized)


In [15]:
# Produce TEST_1 through TEST_10: a .docx, a converted .pdf and a .json label file for each.
generate_test_data(10)

100%|██████████| 1/1 [00:02<00:00,  2.81s/it]
100%|██████████| 1/1 [00:02<00:00,  2.99s/it]
100%|██████████| 1/1 [00:03<00:00,  3.46s/it]
100%|██████████| 1/1 [00:03<00:00,  3.02s/it]
100%|██████████| 1/1 [00:02<00:00,  2.34s/it]
100%|██████████| 1/1 [00:03<00:00,  3.27s/it]
100%|██████████| 1/1 [00:02<00:00,  2.66s/it]
100%|██████████| 1/1 [00:02<00:00,  2.91s/it]
100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
100%|██████████| 1/1 [00:03<00:00,  3.41s/it]
