In [27]:
import os
from tqdm import tqdm
from PIL import Image
import json
from src.config import pii_entities_colors_rgba
from PIL import ImageDraw, ImageFont
from presidio_image_redactor import ImageRedactorEngine


def convert_presidio_to_layoutlm(presidio_bboxes):
    """Turn Presidio bounding boxes into parallel LayoutLM-style label lists.

    Args:
        presidio_bboxes: Iterable of objects exposing ``left``, ``top``,
            ``width``, ``height`` and ``entity_type`` attributes.

    Returns:
        A ``(tokens, ner_tags, bboxes)`` tuple of equally long lists:
        ``tokens`` holds the placeholder ``"-"`` for every kept box,
        ``ner_tags`` holds ``"B-<label>"`` strings, and ``bboxes`` holds
        ``[x0, y0, x1, y1]`` corner coordinates. Boxes whose entity type
        has no entry in the mapping below are dropped.
    """
    # Presidio entity type -> project label name.
    label_by_entity = {
        "PERSON": "full_name",
        "LOCATION": "address",
        "EMAIL_ADDRESS": "email_address",
        "CREDIT_CARD": "payment_information",
        "IBAN_CODE": "payment_information",
        "PHONE_NUMBER": "phone_number",
    }

    rows = []
    for region in presidio_bboxes:
        left, top = region.left, region.top
        # Presidio gives origin + size; convert to opposite-corner form.
        right, bottom = left + region.width, top + region.height
        label = label_by_entity.get(region.entity_type)
        if label is None:
            continue
        rows.append(("-", f"B-{label}", [left, top, right, bottom]))

    tokens = [row[0] for row in rows]
    ner_tags = [row[1] for row in rows]
    bboxes = [row[2] for row in rows]
    return tokens, ner_tags, bboxes

def get_predictions_for_image(image_path):
    """Run the shared Presidio redactor on one image and return its labels.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        Dict with keys ``"tokens"``, ``"bboxes"`` and ``"ner_tags"`` as
        produced by ``convert_presidio_to_layoutlm``.
    """
    rgb_image = Image.open(image_path).convert("RGB")
    # NOTE(review): this unpacking requires ``redact`` to return a
    # (redacted_image, bboxes) pair -- confirm against the installed
    # presidio_image_redactor build. The redacted image itself is not used.
    _, detected_regions = image_redactor.redact(rgb_image)
    tokens, ner_tags, bboxes = convert_presidio_to_layoutlm(detected_regions)
    return dict(tokens=tokens, bboxes=bboxes, ner_tags=ner_tags)

def draw_bboxes(image_path, predictions, add_text=False):
    """Overlay filled, colour-coded boxes for every predicted entity.

    Args:
        image_path: Path of the image to annotate.
        predictions: Dict with parallel ``"tokens"``, ``"bboxes"`` and
            ``"ner_tags"`` lists; entries tagged ``"O"`` are skipped.
        add_text: When true, also write the entity label 10 pixels above
            the top-left corner of each box.

    Returns:
        A new RGBA image: the source image alpha-composited with the overlay.
    """
    image = Image.open(image_path).convert("RGBA")
    # Draw on a fully transparent layer so box colours with an alpha channel
    # blend with the page instead of replacing it.
    overlay = Image.new("RGBA", image.size, (255, 255, 255, 0))
    draw = ImageDraw.Draw(overlay)
    font = ImageFont.load_default()
    # Tokens stay in the zip so a shorter token list still truncates the loop.
    for _, box, pred in zip(predictions["tokens"], predictions["bboxes"], predictions["ner_tags"]):
        if pred == "O":
            continue
        # Strip only the leading BIO prefix ("B-"/"I-"); splitting on every
        # hyphen would truncate labels that themselves contain "-".
        label = pred.split("-", 1)[-1]
        color = pii_entities_colors_rgba.get(label, "black")
        draw.rectangle(box, fill=color)
        if add_text:
            draw.text((box[0], box[1] - 10), f"{label}", font=font, fill=color)
    return Image.alpha_composite(image, overlay)

def anonymize_images(images_path, labels_path, labeled_images_path):
    """Label every ``.png`` in a directory and save labels plus previews.

    For each file in ``images_path`` whose name ends with ``.png``, runs
    ``get_predictions_for_image``, writes the result as
    ``<labels_path>/<stem>.json`` and saves the ``draw_bboxes`` rendering as
    ``<labeled_images_path>/<original file name>``.

    Args:
        images_path: Directory containing the input images.
        labels_path: Output directory for JSON label files (created if missing).
        labeled_images_path: Output directory for annotated images
            (created if missing).
    """
    os.makedirs(labels_path, exist_ok=True)
    os.makedirs(labeled_images_path, exist_ok=True)
    for image in tqdm(os.listdir(images_path)):
        if not image.endswith(".png"):
            continue
        source_path = os.path.join(images_path, image)
        labels = get_predictions_for_image(source_path)
        labeled_image = draw_bboxes(source_path, labels)
        # Swap only the extension; str.replace would also rewrite any ".png"
        # appearing earlier in the file name.
        stem, _ = os.path.splitext(image)
        with open(os.path.join(labels_path, f"{stem}.json"), "w") as f:
            json.dump(labels, f, indent=4)
        labeled_image.save(os.path.join(labeled_images_path, image))


# Module-level Presidio engine; get_predictions_for_image reads this global,
# so it must be defined before that function is called.
image_redactor = ImageRedactorEngine()

In [28]:
# Input image directory and the two output directories (JSON labels and
# annotated preview images) for the benchmark run.
benchmark_images_path = "data/funsd_benchmark/images"
benchmark_labels_path = "data/presidio_funsd_results/layoutlm_labels"
benchmark_labeled_images_path = "data/presidio_funsd_results/labeled_images"

# Writes one JSON label file and one annotated image per .png in the input directory.
anonymize_images(benchmark_images_path, benchmark_labels_path, benchmark_labeled_images_path)

100%|██████████| 255/255 [01:55<00:00,  2.20it/s]


In [30]:
# Evaluation configuration passed to count_and_log_all_metrics: one entry per
# test set, pairing ground-truth labels with the predictions written above.
# NOTE(review): convert_presidio_to_layoutlm can also emit
# "payment_information", which is not listed in class_names -- presumably
# intentional for this benchmark; confirm.
test_samples = [
        {   
            "test_name": "benchmark",
            "gt_labels": "evaluation/funsd_benchmark/layoutlm_labels",
            "predicted_labels": benchmark_labels_path,
            "image_views": benchmark_labeled_images_path,
            "class_names": [
                "full_name", "phone_number", "address", "email_address"
            ]
        },
]

In [31]:
from src.w_b import count_and_log_all_metrics

# Evaluate the Presidio predictions for every entry in test_samples.
# NOTE(review): presumably logs to Weights & Biases (imported from src.w_b);
# confirm before running without credentials.
count_and_log_all_metrics(
    samples=test_samples,
    lm_model_name="Presidio",
    ocr_model_name="",
    run_specification="benchmark"
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33malexandraroze2000[0m ([33malexandraroze[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
