# Init

In [1]:
import fitz
import json
import os
import re

from functools import reduce

# Span colour value treated as "white": 0xFFFFFF == 16777215. It is compared
# against each extracted span's "color" field.
COLOR_WHITE = 0xFFFFFF

# Root folder that is walked for test PDFs and their expected JSON results.
TEST_DIR = "../test-document"

# Section names; presumably the sub-folders of TEST_DIR -- not referenced by
# the cells visible here. TODO confirm whether this is still needed.
TEST_SECTION_DIR = ["char", "sentence", "word"]

# Enable PyMuPDF's global "small glyph heights" option before any extraction.
fitz.TOOLS.set_small_glyph_heights(True)

In [2]:
def get_spans(text_blocks):
    """
    Flatten block dictionaries into a single list of span dictionaries.

    Parameters
    ----------
    text_blocks : iterable of dict
        Block dictionaries. A block is expected to hold a "lines" list whose
        items each hold a "spans" list. Blocks without a "lines" key (for
        example image blocks) contribute no spans.

    Returns
    -------
    list of dict
        Every span, in block -> line -> span order. The result is always a
        new list and is empty when there are no blocks, lines or spans.
    """
    # A nested comprehension avoids reduce(), which raises TypeError on an
    # empty sequence and re-copies the accumulated list on every step.
    return [
        span
        for block in text_blocks
        for line in block.get("lines", ())
        for span in line["spans"]
    ]

In [3]:
def predict_illusive_text(doc: fitz.Document):
    """
    Collect every text span in `doc` whose colour equals COLOR_WHITE.

    Parameters
    ----------
    doc : fitz.Document
        Document whose pages are scanned in order.

    Returns
    -------
    list of dict
        One record per matching span with the keys "origin" (the span origin
        as a list), "text" (the span text) and "page" (0-based index of the
        page within `doc`). The span bbox is not part of the record.
    """
    hits = []
    for page_no, page in enumerate(doc):
        page_blocks = list(page.get_textpage().extractDICT()["blocks"])
        for span in get_spans(page_blocks):
            # Only spans rendered in the sentinel colour are reported.
            if span["color"] != COLOR_WHITE:
                continue
            hits.append({
                "origin": list(span["origin"]),
                "text": span["text"],
                "page": page_no,
            })
    return hits


In [4]:
def generate_test_schema(test_dir=None):
    """
    Walk a fixture tree and pair each PDF with its expected-result JSON.

    Parameters
    ----------
    test_dir : str, optional
        Root folder to walk. Defaults to the module-level TEST_DIR.

    Returns
    -------
    list of dict
        One entry per folder that contains at least one file, shaped like:

        [
            {
                'location': '../test-document/word',
                'test_subject': [
                    {
                        'document': 'TEST1.pdf',
                        'actual': 'TEST1.json'
                    }
                ]
            }
        ]

        'location' always uses forward slashes.

    Raises
    ------
    AssertionError
        If a folder holds a different number of .pdf and .json files.
    """
    if test_dir is None:
        test_dir = TEST_DIR

    def _stem(name):
        # File name without its final extension, used as the pairing key.
        return os.path.splitext(name)[0]

    test_schema = []
    for root, dirs, files in os.walk(test_dir):
        # os.walk yields entries in arbitrary order; sorting in place makes
        # the traversal (and therefore the schema order) deterministic.
        dirs.sort()
        if not files:
            continue

        # Sort both lists by stem so that position i in one list refers to the
        # same test case as position i in the other (TEST1.pdf <-> TEST1.json).
        document_files = sorted((f for f in files if f.endswith('.pdf')), key=_stem)
        actual_files = sorted((f for f in files if f.endswith('.json')), key=_stem)
        assert len(document_files) == len(actual_files), (
            f"{root}: {len(document_files)} .pdf file(s) but "
            f"{len(actual_files)} .json file(s)"
        )

        test_schema.append({
            # Collapse Windows path separators into forward slashes.
            "location": re.sub(r"\\+", "/", root),
            "test_subject": [
                {"document": document, "actual": actual}
                for document, actual in zip(document_files, actual_files)
            ],
        })
    return test_schema


In [5]:
def dump_predicted(filename, doc, predicted):
    """
    Write a failed prediction to disk for inspection.

    Two files are produced in the "test_dumps" folder (created if missing):
    "<filename>.json" with the predictions, and "<filename>.pdf", a copy of
    `doc` with every prediction marked on the page it belongs to.

    Parameters
    ----------
    filename : str or int
        Base name (without extension) of the two output files.
    doc : fitz.Document
        Document the predictions were extracted from; it is annotated in
        place before being saved.
    predicted : list of dict
        Prediction records. A record is drawn as a rectangle when it has a
        "bbox" key, otherwise as a small circle centred on its "origin".
        A record without a "page" key is drawn on every page.
    """
    marker_radius = 2  # radius of the fallback marker drawn at "origin"

    os.makedirs("test_dumps", exist_ok=True)
    json_object = json.dumps(predicted, indent=2)
    with open(f"test_dumps/{filename}.json", "w") as outfile:
        outfile.write(json_object)

    for page_no, page in enumerate(doc):
        # Only mark the predictions that were found on this page.
        on_page = [pr for pr in predicted if pr.get("page", page_no) == page_no]
        if not on_page:
            continue
        shape = page.new_shape()
        for pr in on_page:
            if "bbox" in pr:
                shape.draw_rect(pr["bbox"])
            else:
                # Records may carry only an origin point, so there is no
                # rectangle to draw; mark the origin instead.
                shape.draw_circle(pr["origin"], marker_radius)
            shape.finish(color=(1, 0, 1), fill=(1, 0, 0))
        shape.commit()
    doc.save(f"test_dumps/{filename}.pdf")

In [6]:
# Run the predictor over every fixture and compare with the expected JSON.
# Prints True/False per document; mismatches are dumped for inspection.
test_schema = generate_test_schema()
for schema in test_schema:
    loc = schema["location"]
    section = os.path.basename(loc)
    for subject in schema["test_subject"]:
        # os.path.join keeps the paths valid on every platform.
        document_path = os.path.join(loc, subject["document"])
        actual_path = os.path.join(loc, subject["actual"])

        with open(actual_path, encoding="utf-8") as actual_file:
            actual = json.load(actual_file)

        # The context manager closes the document once the comparison (and
        # the optional dump) is finished.
        with fitz.open(document_path) as doc:
            predicted = predict_illusive_text(doc)
            equal = actual == predicted
            print(equal)
            if not equal:
                # Name the dump after section and document so failures from
                # different folders do not overwrite each other.
                document_stem = os.path.splitext(subject["document"])[0]
                dump_predicted(f"{section}_{document_stem}", doc, predicted)
        

True
True
True
