# Data Minimisation Experiment

Code for the data minimisation experiment, which compares different blurring scenarios.

## Preparation

To prepare the data for the experiment, download the detections and images from the landing zone and store them in the following structure:

- `input_folder /`
  - `images`
  - `detections`

Run this notebook to generate the images for the different scenarios, which will be saved in `output_folder`.

In [None]:
import os
import pandas as pd

from objectherkenning_openbare_ruimte.data_minimisation.data_minimisation import DataMinimisation

# Input data: images and detections live in sub-folders of input_folder
input_folder = "../datasets/oor/data-minimisation/prep_data"
images_folder, detections_folder = (
    os.path.join(input_folder, sub_dir) for sub_dir in ("images", "detections")
)

# Destination folder for the generated output
output_folder = "../datasets/oor/data-minimisation/blur_test"

# The detections are stored here in YOLO format
annotations_folder = os.path.join(input_folder, "labels")

In [None]:
# Rebuild YOLO annotation files from the detections in "landingzone format"

csv_paths = [
    os.path.join(detections_folder, name)
    for name in os.listdir(detections_folder)
    if name.endswith(".csv")
]
df = pd.concat([pd.read_csv(csv_path) for csv_path in csv_paths])

images = [name for name in os.listdir(images_folder) if name.endswith(".jpg")]
# Only keep detections that belong to an image present in images_folder
df = df[df["image_name"].isin(images)]

os.makedirs(annotations_folder, exist_ok=True)

for image in images:
    image_stem = os.path.splitext(image)[0]
    out_file = os.path.join(annotations_folder, f"{image_stem}.txt")
    # Moving image_name into the index and writing with index=False leaves the
    # image name out of the file; the remaining columns are written
    # space-separated without a header row.
    image_df = df[df["image_name"] == image].set_index("image_name")
    image_df.to_csv(out_file, sep=" ", index=False, header=False)

In [None]:
# Generate images for scenarios
# NOTE(review): presumably process_folder reads the images and the YOLO
# annotation files and writes the scenario images to output_folder — confirm
# against DataMinimisation.process_folder.

data_minimisation = DataMinimisation()

data_minimisation.process_folder(images_folder, annotations_folder, output_folder, image_format="jpg")

## Analysis

Analysis of the landing zone images based on the different scenarios and the annotated false negatives.

### Scenario A

In [None]:
import json
import os

import pandas as pd

from cvtoolkit.converters.azure_coco_to_coco_converter import AzureCocoToCocoConverter

from objectherkenning_openbare_ruimte.performance_evaluation_pipeline.metrics.metrics_utils import ObjectClass
from objectherkenning_openbare_ruimte.performance_evaluation_pipeline.metrics.per_pixel_stats import PerPixelEvaluator
from objectherkenning_openbare_ruimte.performance_evaluation_pipeline.source.oor_evaluation import tba_result_to_df
from objectherkenning_openbare_ruimte.performance_evaluation_pipeline.source.yolo_to_coco import convert_yolo_predictions_to_coco_json


# Experiment folder
exp_dir = "../datasets/oor/data-minimisation/exp_241118"

# Folder with annotations from on-edge blurring (YOLO format)
blurring_annotations_dir = os.path.join(exp_dir, "blurring_labels_241118")

# JSON file with false negatives from Azure Data Labelling project
fn_annotations_file = os.path.join(exp_dir, "dataminimisation_fn.json")

# Image size as (width, height): index 0 is used as width and index 1 as height
# in the rest of this notebook.
image_shape = (1280, 720)

# Image names (without file extension) that the analysis is restricted to when
# signals_only is True.
# NOTE(review): presumably the images for which a signal was sent — confirm.
signals = [
    "26-D18M11Y2024-H12M35S49-03303",
    "23-D18M11Y2024-H13M50S52-02875",
    "23-D18M11Y2024-H13M50S52-02904",
    "56-D18M11Y2024-H10M48S43-06929",
    "40-D18M11Y2024-H10M32S22-04948",
    "20-D18M11Y2024-H13M47S49-02501",
    "7-D18M11Y2024-H09M58S38-00978",
    "4-D18M11Y2024-H12M13S18-00601",
    "13-D18M11Y2024-H14M56S50-01636",
    "9-D18M11Y2024-H14M52S45-01129",
]

# Set to True to analyse only the images listed in signals
signals_only = False

# Number of images used to normalise the statistics.
# NOTE(review): 462 is hard-coded, presumably the total number of images in
# the experiment — confirm when the data changes.
n_images = len(signals) if signals_only else 462


# Convert YOLO blurring annotations to COCO json
coco_blurred_file = os.path.join(exp_dir, "coco_blurred.json")

# The converter returns a sequence of output paths; only the first one is used.
out_file = convert_yolo_predictions_to_coco_json(
    predictions_dir=blurring_annotations_dir,
    image_shape=image_shape,
    labels_rel_path="",
    splits=None,
    output_dir=exp_dir
)[0]
# os.replace overwrites an existing target on every platform, so this cell can
# be re-run; os.rename raises FileExistsError on Windows in that case.
os.replace(out_file, coco_blurred_file)

# Convert AML annotations to COCO json, using the same width and height as the
# blurring annotations
coco_fn_file = os.path.join(exp_dir, "coco_fn.json")
converter = AzureCocoToCocoConverter(
    azureml_file=fn_annotations_file,
    output_file=coco_fn_file,
    new_width=image_shape[0],
    new_height=image_shape[1]
)
converter.convert()

# Combine the blurring annotations with the false negative annotations; the
# union is used as a proxy for the ground truth
with open(coco_blurred_file, "r") as blurred_fp:
    coco_blurred_json_content = json.load(blurred_fp)
with open(coco_fn_file, "r") as fn_fp:
    coco_fn_json_content = json.load(fn_fp)

# Discard false negative annotations with category id 2
coco_fn_json_content["annotations"] = list(
    filter(lambda ann: ann["category_id"] != 2, coco_fn_json_content["annotations"])
)

if signals_only:
    # Restrict both annotation sets to the images listed in signals
    coco_blurred_json_content = [
        ann for ann in coco_blurred_json_content if ann["image_id"] in signals
    ]
    coco_fn_json_content["annotations"] = [
        ann for ann in coco_fn_json_content["annotations"] if ann["image_id"] in signals
    ]

    # The blurring file only needs rewriting when it has been filtered
    with open(coco_blurred_file, "w") as blurred_fp:
        json.dump(coco_blurred_json_content, blurred_fp)

with open(coco_fn_file, "w") as fn_fp:
    json.dump(coco_fn_json_content, fn_fp)

coco_all_json_content = [
    *coco_blurred_json_content,
    *coco_fn_json_content["annotations"],
]

# AML annotations carry no score, so strip it wherever it is present to
# prevent issues later on
for annotation in coco_all_json_content:
    annotation.pop("score", 0.0)

coco_all_file = os.path.join(exp_dir, "coco_all.json")

with open(coco_all_file, "w") as all_fp:
    json.dump(coco_all_json_content, all_fp)

# The merged content now lives on disk; drop the in-memory copies
del coco_all_json_content, coco_blurred_json_content

# Per-pixel evaluation of the blurring annotations against the proxy ground truth
evaluator = PerPixelEvaluator(
    ground_truth_path=coco_all_file,
    predictions_path=coco_blurred_file,
    image_shape=image_shape,
)

evaluated_classes = [ObjectClass.person, ObjectClass.license_plate]
result = {
    "OOR-v2.2_all": evaluator.collect_results_per_class_and_size(
        classes=evaluated_classes,
        single_size_only=False,
    ),
}

# Convert the results to a DataFrame and display it as the cell output
result_df = tba_result_to_df(results=result)
result_df

In [None]:
# Report, per class, which share of all pixels was missed by the blurring
total_pixels = image_shape[0] * image_shape[1] * n_images
print(f"Number of images: {n_images}")
print(f"Total number of pixels: {total_pixels}")

percent_missed = (result_df["False Negatives"] / total_pixels) * 100

# Rows 0 and 4 of result_df are reported as the person and license plate rows.
# NOTE(review): presumably the rows covering all object sizes per class —
# confirm against the output of tba_result_to_df.
# The per-image counts are computed outside the f-strings because reusing
# double quotes inside a double-quoted f-string is a SyntaxError before
# Python 3.12.
person_px_per_image = int(result_df.loc[0, "False Negatives"] / n_images)
license_plate_px_per_image = int(result_df.loc[4, "False Negatives"] / n_images)

print("PERSON - percentage of pixels unblurred: "
      f"{percent_missed.loc[0]:.3f}% ({person_px_per_image} pixels per image)")
print("LICENSE PLATE - percentage of pixels unblurred: "
      f"{percent_missed.loc[4]:.3f}% ({license_plate_px_per_image} pixels per image)")

### Scenario B & C

The statistics for scenarios B and C are the same: the only difference between them is whether the rest of the image is blurred or cropped, so the number of unblurred pixels is identical.

In [None]:
import numpy as np

def yolo_box_to_coco_box(yolo_box, img_shape):
    """Convert a normalised, centre-based box to an absolute corner-based box.

    Args:
        yolo_box: Sequence ``(x_center, y_center, width, height)`` with values
            relative to the image size.
        img_shape: Image size; index 0 scales the x values and index 1 scales
            the y values.

    Returns:
        List ``[x, y, w, h]`` in absolute units, where ``(x, y)`` is the box
        centre minus half the box width and height.
    """
    x_center_norm, y_center_norm, width_norm, height_norm = yolo_box
    img_w, img_h = img_shape[0], img_shape[1]

    width = width_norm * img_w
    height = height_norm * img_h
    left = x_center_norm * img_w - width / 2
    top = y_center_norm * img_h - height / 2

    return [left, top, width, height]

def box_to_mask(mask, bbox, box_padding=25):
    """Set the (padded) area of a box to 1 in a 2D mask.

    Args:
        mask: 2D array indexed as ``[row, column]``; it is modified in place.
        bbox: Box as ``(x_min, y_min, width, height)``; each value is
            truncated to an integer before use.
        box_padding: Number of pixels by which the box is enlarged on every
            side. The padded box is clipped to the mask boundaries.

    Returns:
        The same ``mask`` object that was passed in.
    """
    left, top, box_w, box_h = (int(value) for value in bbox)
    n_rows, n_cols = mask.shape[0], mask.shape[1]

    col_start = max(0, left - box_padding)
    col_stop = min(n_cols, left + box_w + box_padding)
    row_start = max(0, top - box_padding)
    row_stop = min(n_rows, top + box_h + box_padding)

    mask[row_start:row_stop, col_start:col_stop] = 1

    return mask

# Get container detections from detection_metadata in "landingzone format"
detections_folder = os.path.join(exp_dir, "detection_metadata_241118")
detection_df = pd.concat([pd.read_csv(os.path.join(detections_folder, file)) for file in os.listdir(detections_folder) if file.endswith(".csv")])
# Strip the file extension so that image names match the image ids of the false
# negative annotations. regex=False makes this a literal replacement on every
# pandas version (before pandas 2.0 the pattern was treated as a regular
# expression by default, in which "." matches any character).
detection_df["image_name"] = detection_df["image_name"].str.replace(".jpg", "", regex=False)

# Padding around containers when blurring / cropping. Default: 25
box_padding = 25

# Category ids of the false negatives that are analysed; reported below as
# PERSON (0) and LICENSE PLATE (1)
target_classes = [0, 1]

# Per image: the image name and, per target class, the number of false negative
# pixels that fall inside the padded detection boxes
scenario_results = {
    "image": [],
    "fn_visible": {
        0: [],
        1: [],
    }
}

if signals_only:
    image_list = signals
else:
    image_list = detection_df["image_name"].unique()

for image in image_list:
    scenario_results["image"].append(image)

    # Masks are indexed [row, column], hence the reversed (width, height) shape
    detection_mask = np.zeros(shape=image_shape[::-1], dtype="bool")

    # Mark the padded area of every detection in this image
    detections = detection_df[detection_df["image_name"] == image]
    for _idx, row in detections.iterrows():
        bbox = yolo_box_to_coco_box(row.loc[["x_center", "y_center", "width", "height"]].to_list(), image_shape)
        detection_mask = box_to_mask(detection_mask, bbox, box_padding=box_padding)

    fns = [ann for ann in coco_fn_json_content["annotations"] if ann["image_id"] == image]

    for target in target_classes:
        # Mask of all false negatives of this class, without padding
        fn_mask = np.zeros(shape=image_shape[::-1], dtype="bool")
        for fn in fns:
            if fn["category_id"] == target:
                fn_mask = box_to_mask(fn_mask, fn["bbox"], box_padding=0)
        # Count the false negative pixels that overlap with a detection area
        scenario_results["fn_visible"][target].append(np.count_nonzero(np.logical_and(detection_mask, fn_mask)))

px_vis_person = sum(scenario_results["fn_visible"][0])
px_vis_license = sum(scenario_results["fn_visible"][1])

print(f"Padding: {box_padding}px")
print(f"PERSON - pixels visible: {px_vis_person} ({int(px_vis_person / n_images)} per image)")
print(f"LICENSE PLATE - pixels visible: {px_vis_license} ({int(px_vis_license / n_images)} per image)")

In [None]:
# Display the per-image results collected for scenarios B & C
scenario_results

In [None]:
# Display the false negative annotations of a single image for inspection
[ann for ann in coco_fn_json_content["annotations"] if ann["image_id"] == "26-D18M11Y2024-H12M35S49-03303"]

## Backup

In [None]:
import pathlib

# Count the blurring annotations per class and collect their box sizes
annotation_files = list(pathlib.Path(blurring_annotations_dir).glob("*.txt"))

# Number of annotations per class id
counts = {
    "0": 0,
    "1": 0,
}

# Box sizes (width * height as stored in the annotation files) per class id
sizes = {
    "0": [],
    "1": [],
}

for file in annotation_files:
    with open(file, 'r') as f:
        # Iterate the file lazily instead of reading all lines up front
        for annotation in f:
            fields = annotation.split()
            if not fields:
                # Blank lines (e.g. at the end of a file) carry no annotation
                continue
            # The first five fields are class id, x, y, width, height; any
            # further fields are ignored
            cls_id, _x, _y, w, h = fields[0:5]
            counts[cls_id] += 1
            sizes[cls_id].append(float(w) * float(h))

In [None]:
# Display the number of blurring annotations per class id
counts

In [None]:
# Load the raw false negative annotations file
with open(fn_annotations_file, mode="r") as fn_file:
    json_content = json.load(fn_file)

In [None]:
# Number of false negative annotations per class key ("0", "1", "2")
counts_unblurred = dict.fromkeys(("0", "1", "2"), 0)

# Box sizes (bbox width * height) per class key; not collected for category 3
sizes_unblurred = {"0": [], "1": []}

for ann in json_content["annotations"]:
    # Category ids in this file are one higher than the class keys used here
    class_key = str(ann["category_id"] - 1)
    counts_unblurred[class_key] += 1

    if ann["category_id"] == 3:
        continue

    bbox = ann["bbox"]
    sizes_unblurred[class_key].append(bbox[2] * bbox[3])

In [None]:
# Display the number of false negative annotations per class key
counts_unblurred

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Overlay the box size distributions of blurred and unblurred annotations for
# class "1"
bins = np.linspace(0, 0.015, 150)

for values, label in ((sizes["1"], 'Blurred'), (sizes_unblurred["1"], 'Unblurred')):
    plt.hist(values, bins, alpha=0.5, label=label)
plt.legend(loc='upper right')
plt.show()

In [None]:
# Overlay the box size distributions of blurred and unblurred annotations for
# class "0"
bins = np.linspace(0, 0.05, 150)

for values, label in ((sizes["0"], 'Blurred'), (sizes_unblurred["0"], 'Unblurred')):
    plt.hist(values, bins, alpha=0.5, label=label)
plt.legend(loc='upper right')
plt.show()