# Init

In [29]:
import fitz
import json
import os
import re

from functools import reduce

# Directory scanned by generate_test_schema for PDFs and their expected JSON.
TEST_DIR = "./doc"
# 0xFFFFFF == 16777215; detect_chars labels spans with this color "ILLUSIVE".
COLOR_WHITE = 0xFFFFFF

# NOTE(review): presumably makes PyMuPDF report reduced (font-size based)
# glyph heights during text extraction - confirm against the Tools docs.
fitz.TOOLS.set_small_glyph_heights(True)

In [30]:
def detect_chars(text_blocks, curr_page):
    """Flatten raw-dict text blocks into one labelled record per character.

    Args:
        text_blocks: The ``"blocks"`` list of a raw text dictionary. Text
            blocks are expected to nest ``"lines"`` -> ``"spans"`` ->
            ``"chars"``; blocks without a ``"lines"`` entry are skipped.
        curr_page: Page index copied into every record's ``"page"`` field.

    Returns:
        A list of dicts with the keys ``"page"``, ``"origin"`` (``[x, y]``
        rounded to whole numbers), ``"text"`` and ``"label"``. The label is
        ``"ILLUSIVE"`` when the span color equals ``COLOR_WHITE`` and
        ``"NON-ILLUSIVE"`` otherwise. Space characters are not reported.
        An empty ``text_blocks`` list yields an empty result.
    """
    detected_chars = []
    for block in text_blocks:
        # Plain nested iteration instead of reduce(): works for an empty
        # page and avoids rebuilding the flattened list on every step.
        for line in block.get("lines", ()):
            for span in line["spans"]:
                # The color is a span property, so the label is shared by
                # every character of the span.
                if span["color"] == COLOR_WHITE:
                    label = "ILLUSIVE"
                else:
                    label = "NON-ILLUSIVE"

                for char in span["chars"]:
                    text = char["c"]
                    if text == " ":
                        continue
                    origin = char["origin"]
                    detected_chars.append({
                        "page": curr_page,
                        "origin": [round(origin[0], 0), round(origin[1], 0)],
                        "text": text,
                        "label": label,
                    })

    return detected_chars

In [31]:
def predict_illusive_text(doc: fitz.Document):
    """Collect the labelled character records of every page in ``doc``.

    Each page's raw text dictionary is passed to ``detect_chars`` together
    with the zero-based page index, and the per-page results are appended
    in page order.

    Args:
        doc: The opened document whose pages are scanned.

    Returns:
        One flat list holding the ``detect_chars`` records of all pages.
    """
    all_chars = []
    for page_index, page in enumerate(doc):
        raw_blocks = page.get_textpage().extractRAWDICT()["blocks"]
        all_chars.extend(detect_chars(raw_blocks, page_index))
    return all_chars


In [32]:
def generate_test_schema():
    """Pair every PDF in ``TEST_DIR`` with its expected-result JSON file.

    Only files directly inside ``TEST_DIR`` are considered, because the
    schema stores bare file names that are later resolved against
    ``TEST_DIR``. Both file lists are sorted before pairing so the result
    does not depend on the order in which the file system lists entries.

    Returns:
        A list shaped like::

            [
                {
                    'document': 'TEST1.pdf',
                    'actual': 'TEST1.json'
                }
            ]

        The list is empty when ``TEST_DIR`` does not exist or holds no PDFs.

    Raises:
        AssertionError: If the number of ``.pdf`` and ``.json`` files differ.
    """
    # Take only the first os.walk entry (the top-level listing). Looping
    # over the whole walk would let each sub-directory overwrite the result.
    # The default keeps "missing directory -> empty schema".
    _, _, files = next(os.walk(TEST_DIR), (TEST_DIR, [], []))

    document_files = sorted(f for f in files if f.endswith(".pdf"))
    actual_files = sorted(f for f in files if f.endswith(".json"))
    if len(document_files) != len(actual_files):
        # Raised explicitly so the check survives ``python -O``.
        raise AssertionError(
            f"{len(document_files)} PDF file(s) but {len(actual_files)} "
            f"JSON file(s) found in {TEST_DIR!r}"
        )

    return [
        {"document": document, "actual": actual}
        for document, actual in zip(document_files, actual_files)
    ]

In [33]:
# Build the schema once and echo it as the cell result to check the pairing.
a = generate_test_schema()
a

[{'document': 'TEST1.pdf', 'actual': 'TEST1.json'},
 {'document': 'TEST2.pdf', 'actual': 'TEST2.json'},
 {'document': 'TEST3.pdf', 'actual': 'TEST3.json'}]

In [34]:
def dump_predicted(filename, doc, predicted):
    """Write ``predicted`` as indented JSON to ``test_dumps/<filename>``.

    Args:
        filename: Name of the file to create below ``test_dumps``.
        doc: Currently unused; kept so existing callers keep working.
        predicted: JSON-serialisable data to dump.

    The target directory is created when it does not exist yet.
    """
    # Serialise before opening the file so that a serialisation error does
    # not truncate an existing dump.
    json_object = json.dumps(predicted, indent=2)

    target = f"test_dumps/{filename}"
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)

In [42]:
import math


def comparator(a, b):
    """Tell whether two character records describe the same character.

    Two records match when both ``"origin"`` coordinates lie within an
    absolute tolerance of 2 of each other and their ``"text"`` and
    ``"label"`` values are equal.
    """
    # Unpacking evaluates the x check first, then the y check.
    x_close, y_close = (
        math.isclose(a["origin"][axis], b["origin"][axis], abs_tol=2)
        for axis in (0, 1)
    )
    same_text = a["text"] == b["text"]
    same_label = a["label"] == b["label"]

    return x_close and y_close and same_text and same_label

# Regression run: for every document/expected pair, predict the character
# labels, sort them top-to-bottom then left-to-right, and compare them
# record by record with the stored expectation.
test_schema = generate_test_schema()
for schema in test_schema:
    print(schema)
    # os.path.join keeps the paths portable; a literal backslash separator
    # only works on Windows and "\{" is an invalid escape sequence.
    document_path = os.path.join(TEST_DIR, schema["document"])
    actual_path = os.path.join(TEST_DIR, schema["actual"])

    with open(actual_path, encoding="utf-8") as actual_file:
        actual = json.load(actual_file)

    # The context manager closes each document before the next one opens.
    with fitz.open(document_path) as doc:
        predicted = predict_illusive_text(doc)
        predicted.sort(key=lambda p: (p["origin"][1], p["origin"][0]))

        # Without this check, surplus predictions would pass unnoticed and
        # missing ones would surface as an unrelated IndexError.
        assert len(actual) == len(predicted), (
            f"{schema['document']}: expected {len(actual)} characters, "
            f"predicted {len(predicted)}"
        )
        for index, (expected, found) in enumerate(zip(actual, predicted)):
            assert comparator(expected, found), (
                f"{schema['document']}: mismatch at index {index}: "
                f"expected {expected}, predicted {found}"
            )

        # Reached only when every comparison above has passed.
        dump_predicted(schema["actual"], doc, predicted)
        

{'document': 'TEST1.pdf', 'actual': 'TEST1.json'}
{'document': 'TEST2.pdf', 'actual': 'TEST2.json'}
{'document': 'TEST3.pdf', 'actual': 'TEST3.json'}
