In [1]:
import json
from tqdm import tqdm
from PIL import Image
from src.process import create_image_view

import warnings
warnings.filterwarnings("ignore")

from src.AnonymizationInference import AnonymizationInference
import os
from src.evaluate import count_all_layoutlm_metrics, count_std
from src.config import funsd_label_list


def get_layoutlm_predictions(
    inference,
    images,
    path_to_image,
    path_to_gt_labeled_images,
    labels_saving_path,
    image_views_path,
    path_to_gt_labels=None,
):
    """Run ``inference`` on every PNG in ``images`` and save predictions and comparison views.

    For each file name ending in ".png" the function:

    1. calls ``inference.predict`` (with ground-truth tokens and boxes when
       ``path_to_gt_labels`` is given, otherwise with the image path only);
    2. calls ``inference.draw_bboxes`` with the predictions;
    3. passes the ground-truth labeled image, the drawn prediction image and an
       output path under ``image_views_path`` to ``create_image_view``;
    4. dumps the predictions as JSON to ``labels_saving_path/<stem>.json``.

    Args:
        inference: Object exposing ``predict(image_path[, words, boxes])`` and
            ``draw_bboxes(image_path, predictions)``. The value returned by
            ``predict`` must be JSON-serializable.
        images: Iterable of image file names (not full paths). Names that do not
            end in ".png" are skipped.
        path_to_image: Directory containing the input images.
        path_to_gt_labeled_images: Directory containing, under the same file
            names, the images opened and handed to ``create_image_view`` as the
            ground-truth side.
        labels_saving_path: Existing directory where prediction JSON files are written.
        image_views_path: Directory whose ``<image name>`` path is passed to
            ``create_image_view`` (presumably where the view is saved -- confirm
            against ``src.process``).
        path_to_gt_labels: Optional directory of ``<stem>.json`` files holding
            ``"tokens"`` and ``"bboxes"`` keys. When falsy, ``predict`` is called
            with the image path only.

    Returns:
        None. Results are written to disk.
    """
    for image in tqdm(images):
        # Only PNG files are processed; everything else in the listing is ignored.
        if not image.endswith(".png"):
            continue

        image_path = os.path.join(path_to_image, image)
        # Swap only the trailing extension. ``str.replace`` would also rewrite a
        # ".png" occurring in the middle of the name (e.g. "a.png.v2.png").
        label_name = os.path.splitext(image)[0] + ".json"

        if path_to_gt_labels:
            # Explicit UTF-8 so that reading does not depend on the platform locale.
            with open(os.path.join(path_to_gt_labels, label_name), encoding="utf-8") as f:
                gt_labels = json.load(f)
            words = gt_labels["tokens"]
            boxes = gt_labels["bboxes"]
            predictions = inference.predict(image_path, words, boxes)
        else:
            predictions = inference.predict(image_path)

        img_with_pred_bboxes = inference.draw_bboxes(image_path, predictions)

        # Open the ground-truth image in a context manager so its file handle is
        # released after each document instead of accumulating across the loop.
        with Image.open(os.path.join(path_to_gt_labeled_images, image)) as img_with_gt_bboxes:
            create_image_view(
                img_with_gt_bboxes,
                img_with_pred_bboxes,
                os.path.join(image_views_path, image),
            )

        with open(os.path.join(labels_saving_path, label_name), "w", encoding="utf-8") as f:
            json.dump(predictions, f, indent=4)

## Getting predictions

In [12]:
# ---------------------------------------------------------------------------
# Checkpoint names
# ---------------------------------------------------------------------------
# LayoutLM weights fine-tuned on SAND
layoutlm_model_name = "layoutlm_best"
# LayoutLM weights fine-tuned on FATURA-PII-Train (80 docs)
fatura_raw_model_name = "fatura_raw"

# ---------------------------------------------------------------------------
# Models and weight locations
# ---------------------------------------------------------------------------
detection_model = "fast_base"
recognition_model = "master"
ocr_model = f"{detection_model}_{recognition_model}"
layoutlm_model = "best_finetuned"
lm_model_weights = f"weights/{layoutlm_model_name}"
fatura_model_weights = f"weights/{fatura_raw_model_name}"
signature_model_weights = "weights/yolo_signatures.pt"

# ---------------------------------------------------------------------------
# Root folders
# ---------------------------------------------------------------------------
path_to_results = "results"
path_to_benchmark = "data/funsd_benchmark"
path_to_fatura = "data/fatura"

# ---------------------------------------------------------------------------
# Benchmark: inputs, then one output pair (views + labels) per checkpoint
# ---------------------------------------------------------------------------
path_to_benchmark_images = os.path.join(path_to_benchmark, "images")
path_to_gt_benchmark_labeled_images = os.path.join(path_to_benchmark, "labeled_images")
path_to_gt_benchmark_labels = os.path.join(path_to_benchmark, "layoutlm_labels")

predicted_benchmark_image_view_path = os.path.join(path_to_results, f"benchmark_image_views_{layoutlm_model_name}")
predicted_benchmark_labels_folder = os.path.join(path_to_results, f"benchmark_layoutlm_labels_{layoutlm_model_name}")

predicted_benchmark_image_view_path_fatura_raw = os.path.join(
    path_to_results, f"benchmark_image_views_{fatura_raw_model_name}"
)
predicted_benchmark_labels_folder_fatura_raw = os.path.join(
    path_to_results, f"benchmark_layoutlm_labels_{fatura_raw_model_name}"
)

# File names (not full paths) found in the benchmark image folder.
benchmark_images = os.listdir(path_to_benchmark_images)

# ---------------------------------------------------------------------------
# FATURA test split: inputs and outputs
# ---------------------------------------------------------------------------
path_to_fatura_images = os.path.join(path_to_fatura, "test_images")
path_to_fatura_benchmark_labeled_images = os.path.join(path_to_fatura, "test_labeled_images")
path_to_gt_fatura_labels = os.path.join(path_to_fatura, "test_layoutlm_labels")

predicted_fatura_image_view_path = os.path.join(path_to_results, f"fatura_image_views_{fatura_raw_model_name}")
predicted_fatura_labels_folder = os.path.join(path_to_results, f"fatura_layoutlm_labels_{fatura_raw_model_name}")

# File names (not full paths) found in the FATURA test image folder.
fatura_images = os.listdir(path_to_fatura_images)

# ---------------------------------------------------------------------------
# Make sure every output folder exists before any prediction is written
# ---------------------------------------------------------------------------
for output_dir in (
    path_to_results,
    predicted_benchmark_image_view_path,
    predicted_benchmark_labels_folder,
    predicted_benchmark_image_view_path_fatura_raw,
    predicted_benchmark_labels_folder_fatura_raw,
    predicted_fatura_image_view_path,
    predicted_fatura_labels_folder,
):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Inference pipeline backed by the LayoutLM checkpoint fine-tuned on SAND.
sand_inference = AnonymizationInference(
    label_list=funsd_label_list,
    detection_model=detection_model,
    recognition_model=recognition_model,
    path_to_layoutlm_weights=lm_model_weights,
    path_to_signature_weights=signature_model_weights,
)

# Benchmark run: ground-truth labels are supplied, and predictions plus
# comparison views are written to the SAND-specific result folders.
sand_benchmark_kwargs = dict(
    images=benchmark_images,
    path_to_image=path_to_benchmark_images,
    path_to_gt_labeled_images=path_to_gt_benchmark_labeled_images,
    path_to_gt_labels=path_to_gt_benchmark_labels,
    labels_saving_path=predicted_benchmark_labels_folder,
    image_views_path=predicted_benchmark_image_view_path,
)
get_layoutlm_predictions(inference=sand_inference, **sand_benchmark_kwargs)

100%|██████████| 255/255 [01:42<00:00,  2.49it/s]


In [4]:
# Inference pipeline backed by the LayoutLM checkpoint fine-tuned solely on FATURA-PII.
fatura_raw_inference = AnonymizationInference(
    label_list=funsd_label_list,
    detection_model=detection_model,
    recognition_model=recognition_model,
    path_to_layoutlm_weights=fatura_model_weights,
    path_to_signature_weights=signature_model_weights,
)

# The same checkpoint is evaluated on two datasets, in this order.
fatura_raw_runs = [
    # Benchmark
    dict(
        images=benchmark_images,
        path_to_image=path_to_benchmark_images,
        path_to_gt_labeled_images=path_to_gt_benchmark_labeled_images,
        labels_saving_path=predicted_benchmark_labels_folder_fatura_raw,
        image_views_path=predicted_benchmark_image_view_path_fatura_raw,
        path_to_gt_labels=path_to_gt_benchmark_labels,
    ),
    # FATURA-PII test
    dict(
        images=fatura_images,
        path_to_image=path_to_fatura_images,
        path_to_gt_labeled_images=path_to_fatura_benchmark_labeled_images,
        labels_saving_path=predicted_fatura_labels_folder,
        image_views_path=predicted_fatura_image_view_path,
        path_to_gt_labels=path_to_gt_fatura_labels,
    ),
]

for run_kwargs in fatura_raw_runs:
    get_layoutlm_predictions(inference=fatura_raw_inference, **run_kwargs)

100%|██████████| 255/255 [01:51<00:00,  2.29it/s]
100%|██████████| 19/19 [00:05<00:00,  3.50it/s]


### LayoutLM Fine-tuned on SAND Benchmark Metrics

In [13]:
# Classes for which metrics are computed; reused by the metric cells below.
class_names = [
    "full_name",
    "phone_number",
    "address",
    "company_name",
    "email_address",
]

# Three values are unpacked from count_all_layoutlm_metrics; only the first is kept.
(sand_metrics_per_documents, _, _) = count_all_layoutlm_metrics(
    path_to_gt_benchmark_labels,
    predicted_benchmark_labels_folder,
    class_names,
)
sand_metrics_per_documents

{'full_name': defaultdict(float,
             {'recall': 0.9664051624813856, 'precision': 0.8909125372042562}),
 'phone_number': defaultdict(float,
             {'recall': 0.9515424515424513, 'precision': 0.8473034246691538}),
 'address': defaultdict(float,
             {'recall': 0.9074883776238016, 'precision': 0.897116295060735}),
 'company_name': defaultdict(float,
             {'recall': 0.7661820906490023, 'precision': 0.7224063803897763}),
 'email_address': defaultdict(float,
             {'recall': 0.9031746031746032, 'precision': 0.8958333333333334})}

### LayoutLM fine-tuned solely on FATURA-PII-Train (80 docs): benchmark and FATURA-PII-Test metrics

In [8]:
# FATURA-only checkpoint scored against the benchmark ground truth.
# Three values are unpacked from count_all_layoutlm_metrics; only the first is kept.
(fatura_metrics_per_documents_benchmark, _, _) = count_all_layoutlm_metrics(
    path_to_gt_benchmark_labels,
    predicted_benchmark_labels_folder_fatura_raw,
    class_names,
)
fatura_metrics_per_documents_benchmark

{'full_name': defaultdict(float,
             {'recall': 0.03384278260834329, 'precision': 0.1935358255451713}),
 'phone_number': defaultdict(float,
             {'recall': 0.286548849880576, 'precision': 0.4482274482274482}),
 'address': defaultdict(float,
             {'recall': 0.8520869887298631, 'precision': 0.8778289473027707}),
 'company_name': defaultdict(float,
             {'recall': 0.3960862396289525, 'precision': 0.3453234217050008}),
 'email_address': defaultdict(float,
             {'recall': 0.5854166666666667, 'precision': 0.468923611111111})}

In [10]:
# FATURA-only checkpoint scored against the FATURA-PII test ground truth.
# Three values are unpacked from count_all_layoutlm_metrics; only the first is kept.
(fatura_metrics_per_documents_test, _, _) = count_all_layoutlm_metrics(
    path_to_gt_fatura_labels,
    predicted_fatura_labels_folder,
    class_names,
)
fatura_metrics_per_documents_test

{'full_name': defaultdict(float, {'recall': 0.9375, 'precision': 0.9375}),
 'phone_number': defaultdict(float, {'recall': 1.0, 'precision': 0.9875}),
 'address': defaultdict(float, {'recall': 1.0, 'precision': 1.0}),
 'company_name': defaultdict(float,
             {'recall': 1.0, 'precision': 0.9897959183673469}),
 'email_address': defaultdict(float,
             {'recall': 1.0, 'precision': 0.9583333333333334})}

### Standard deviation of metrics (Experiment 1 and Experiment 2)

In [4]:
# Five metric CSVs (numbered 1..5) exist per training setup.
sand_std_files = [f"data/std_metrics/sand_metrics{i}.csv" for i in range(1, 6)]
fatura_std_files = [f"data/std_metrics/fatura_metrics{i}.csv" for i in range(1, 6)]

# Trained on SAND
print("STD metrics for SAND")
count_std(sand_std_files)

print()

# Trained on FATURA-PII
print("STD metrics for FATURA-PII")
count_std(fatura_std_files)

STD metrics for SAND
           class  precision_std  recall_std
0      full_name       3.880410    1.339171
1   phone_number       2.247515    1.148931
2        address       2.804857    4.603748
3   company_name       3.672424    5.042016
4  email_address       7.797683    3.452163

STD metrics for FATURA-PII
           class  precision_std  recall_std
0      full_name      25.661592   11.381131
1   phone_number      17.248104   21.166772
2        address       4.465766    6.958208
3   company_name       8.069297   22.456856
4  email_address       8.845300    7.807570
