In [1]:
import os
# Root directory that is walked for test PDFs and their JSON files.
# The path is relative, so it resolves against the current working directory.
TEST_DIR = "../test-data"

In [2]:
def generate_test_schema(test_dir=None):
    """Pair every test PDF with a JSON file found in the same directory.

    Walks ``test_dir`` recursively. Inside each directory the ``.pdf`` and
    ``.json`` files are sorted by file stem and paired positionally, so
    ``TEST1.pdf`` is matched with ``TEST1.json`` regardless of the order in
    which the operating system lists them.

    Args:
        test_dir: Directory to scan. Defaults to the module-level
            ``TEST_DIR`` when omitted.

    Returns:
        A list of dicts, one per PDF/JSON pair. Paths are relative to
        ``test_dir`` (plain file names for files directly inside it):

        [
            {
                'document_file': 'TEST1.pdf',
                'json_file': 'TEST1.json'
            }
        ]

    Raises:
        AssertionError: If a directory holds a different number of ``.pdf``
            and ``.json`` files.
    """
    if test_dir is None:
        test_dir = TEST_DIR

    def _stem(file_name):
        # Sort on the name without its extension so both lists line up.
        return os.path.splitext(file_name)[0]

    test_schema = []
    for root, dirs, files in os.walk(test_dir):
        # os.walk honours in-place edits of ``dirs``; sorting makes the
        # traversal (and therefore the result order) deterministic.
        dirs.sort()

        document_files = sorted(
            (f for f in files if f.endswith('.pdf')), key=_stem
        )
        actual_files = sorted(
            (f for f in files if f.endswith('.json')), key=_stem
        )
        assert len(document_files) == len(actual_files), (
            f"{root}: found {len(document_files)} .pdf file(s) but "
            f"{len(actual_files)} .json file(s)"
        )

        # Accumulate instead of overwriting, so a later (possibly empty)
        # directory cannot discard pairs collected from an earlier one.
        for document_file, json_file in zip(document_files, actual_files):
            test_schema.append(
                {
                    "document_file": os.path.relpath(
                        os.path.join(root, document_file), test_dir
                    ),
                    "json_file": os.path.relpath(
                        os.path.join(root, json_file), test_dir
                    ),
                }
            )
    return test_schema

In [3]:
import json
from charset_normalizer import detect
from illusive_text_detector import IllusiveTextDetector
import sklearn.metrics as metrics

def test_detector():
    """Compare the detector's per-character labels with the stored ones.

    For every PDF/JSON pair returned by ``generate_test_schema()``:

    1. load the JSON file from ``TEST_DIR``,
    2. run ``IllusiveTextDetector.detect`` on the PDF,
    3. check that both report the same ``total_characters`` and the same
       ``char`` at every index of ``chars_data``,
    4. print the confusion matrix and the accuracy of the ``label`` values.

    Returns:
        None. Results are printed, one line group per document.

    Raises:
        AssertionError: If the character counts or any character differ
            between the stored data and the detector output.
    """
    detector = IllusiveTextDetector()

    for test in generate_test_schema():
        test_pdf_loc = f"{TEST_DIR}/{test['document_file']}"
        test_json_loc = f"{TEST_DIR}/{test['json_file']}"

        # Context manager closes the handle even if json.load raises.
        with open(test_json_loc) as test_json_file:
            actual = json.load(test_json_file)

        predicted = detector.detect(test_pdf_loc)
        assert actual["total_characters"] == predicted["total_characters"], (
            f"{test['document_file']}: expected "
            f"{actual['total_characters']} characters, detector returned "
            f"{predicted['total_characters']}"
        )

        actual_labels = []
        pred_labels = []
        for i in range(actual["total_characters"]):
            actual_data = actual["chars_data"][i]
            pred_data = predicted["chars_data"][i]
            # Labels are only comparable when both sides refer to the same
            # character at the same position.
            assert actual_data["char"] == pred_data["char"], (
                f"{test['document_file']}: character mismatch at index {i}: "
                f"expected {actual_data['char']!r}, "
                f"got {pred_data['char']!r}"
            )

            actual_labels.append(actual_data["label"])
            pred_labels.append(pred_data["label"])

        m = metrics.confusion_matrix(actual_labels, pred_labels)
        acc = metrics.accuracy_score(actual_labels, pred_labels)
        print(m, acc)

# Run the evaluation; prints one confusion matrix and accuracy per test document.
test_detector()

[[1604    0]
 [   0 2158]] 1.0
[[2058    0]
 [   0 2852]] 1.0
[[1714    0]
 [   0 2262]] 1.0
[[2172    0]
 [   0 2984]] 1.0
[[1241    0]
 [   0 1673]] 1.0
[[135   0]
 [  0 181]] 1.0
[[2798    0]
 [   0 3765]] 1.0
[[2323    0]
 [   0 3104]] 1.0
[[ 969    0]
 [   0 1300]] 1.0
[[2044    0]
 [   0 2752]] 1.0
