# Put together dataset
This notebook merges the painting data with the annotations. Two versions of the dataset are created:
- one includes all objects, even those without descriptions or bounding boxes
- the other includes only objects that have both bounding boxes and descriptions

### 0. Import libraries and data

In [None]:
import re
import json

import polars as pl
from tqdm import tqdm
from collections import Counter


# Directory holding the annotation JSON that is read below.
# All three paths end with "/" because file names are appended to them directly.
ANNOTATIONS_PATH = "../../data/annotations/"
# Directory holding the filtered paintings JSON that is read below.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/filtered_paintings/"
# Directory the merged dataset JSON files are written to.
PROCESSED_DATA_PATH = "../../data/processed/"

In [None]:
# Load the refined annotations. The later cells read "painting_id", "objects"
# and "bounding_boxes" from every entry of this list.
# The encoding is explicit so the read does not depend on the platform's
# locale default (JSON text is UTF-8).
with open(
    f"{ANNOTATIONS_PATH}filtered_and_refined_annotations.json", encoding="utf-8"
) as f:
    annotations = json.load(f)

# Load the painting records and convert them to a list of plain dicts so they
# can be extended with an "objects" entry and dumped back to JSON.
paintings_data = pl.read_json(
    f"{INTERMEDIATE_DATA_PATH}filtered_paintings_enhanced_data.json"
).to_dicts()

### 1. Create the first version of the dataset with all objects and clean their descriptions

In [None]:
def replace_hex_with_escaped_unicode(text):
    r"""Rewrite hex code-point markers found in *text* as ``\uXXXX`` text.

    Recognised markers (X is a hex digit): ``U+XXXX``, ``U+XX``, ``\[xXX\]``,
    ``\[\[xXX\]\]``, ``00XX``, ``\[XXXX]`` and ``\[XXXX\]``. Every marker is
    replaced by a backslash, the letter ``u`` and the hex digits left-padded
    with zeros to four characters. The result is the escape sequence spelled
    out as text, not the decoded character.

    Args:
        text: String to scan.

    Returns:
        The string with every recognised marker replaced.
    """
    # One alternative per line; each one captures only the hex digits.
    pattern = (
        r"U\+([0-9a-fA-F]{4})"
        r"|U\+([0-9a-fA-F]{2})"
        r"|\\\[x([0-9a-fA-F]{2})\\\]"
        r"|\\\[\\\[x([0-9a-fA-F]{2})\\\]\\\]"
        r"|00([0-9a-fA-F]{2})"
        r"|\\\[([0-9a-fA-F]{4})\]"
        r"|\\\[([0-9a-fA-F]{4})\\\]"
    )

    def replacer(match):
        # Only the alternative that matched has a non-empty group.
        hex_digits = next(filter(None, match.groups()), None)
        if not hex_digits:
            return None
        return "\\u" + hex_digits.zfill(4)

    return re.sub(pattern, replacer, text)

In [None]:
def remove_explicit_errors(text):
    """Strip explicit error markers that were left inside a description.

    "Unsupported character" markers are replaced by the character they wrap;
    "Invalid JSON/Unicode escape sequence" markers are deleted outright.

    Args:
        text: Description string to clean.

    Returns:
        The string without the error markers.
    """
    # Applied in order: the "output" variants must run before the generic
    # ones, otherwise the word "output" would end up in the captured text.
    substitutions = (
        (r"\\\[\\\[Unsupported character: output ([^\]]+?)\\\]\\\]", r"\1"),
        (r"\\\[\\\[Unsupported character: output ([^\]]+?)\\\]\]", r"\1"),
        (r"\\\[\\\[Unsupported character: ([^\]]+?)\\\]\\\]", r"\1"),
        (r"\\\[\\\[Unsupported character: ([^\]]+?)\\\]\]", r"\1"),
        (r"\\\[\\\[Invalid JSON escape sequence\\\]\]", ""),
        (r"\\\[Invalid Unicode escape sequence\]", ""),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Leftover of a marker that lost its brackets.
    return text.replace('Unsupported character: ", output ', "")

In [None]:
def remove_brackets_and_backslash(text):
    """Unwrap content enclosed in backslash-escaped square brackets.

    Args:
        text: Description string to clean.

    Returns:
        The string with the escaped bracket wrappers removed and their
        content kept.
    """
    # Fully escaped pairs first, then pairs whose closing bracket is bare.
    for wrapper in (r"\\\[(.*?)\\\]", r"\\\[(.*?)\]"):
        text = re.sub(wrapper, r"\1", text)

    return text

In [None]:
def clean_and_format_text(text):
    """Normalise a raw object description into one tidy sentence.

    Removes leftover brackets, backslashes, line breaks and tabs, unifies
    apostrophe-like characters, restores a few accented letters, inserts a
    hyphen between two adjacent four-digit numbers, collapses repeated
    spaces, capitalises the first character and makes sure the text ends
    with sentence punctuation.

    Args:
        text: Raw description string.

    Returns:
        The cleaned description, or an empty string when nothing is left
        after cleaning (for example when the input was only brackets or
        whitespace).
    """
    # An escaped empty bracket pair stands for a gap between words.
    text = re.sub(r"\\\[\\\]", " ", text)
    # Drop doubled backslashes.
    text = re.sub(r"\\\\", "", text)
    text = text.replace("[]", "")
    # Line-break sequences are removed before the escaped-apostrophe fix so a
    # backslash and an apostrophe separated only by such a break still pair up.
    text = text.replace("\r\n\n", "")
    text = text.replace("\n\n", "")
    text = text.replace("\r\n", "")
    text = re.sub(r"\\'", "'", text)
    text = (
        text.replace("‘", "'")
        .replace("’", "'")
        .replace("`", "'")
        .replace("\t", "")
        .replace("\r", "")
        .replace("\n", "")
        .replace("\\[", "")
        .replace("\\]", "")
        .replace("[", "")
        .replace("]", "")
        .replace("\\u\\u", "\\u")
        # Accent names (presumably remnants of stripped HTML entities such as
        # "&eacute;" -- confirm against the raw data) become accented letters.
        .replace("agrave", "à")
        .replace("aacute", "á")
        .replace("egrave", "è")
        .replace("eacute", "é")
        .replace("oacute", "ó")
        .replace("ograve", "ò")
        .replace("uacute", "ú")
        .replace("ugrave", "ù")
    )

    # Two four-digit numbers glued together are separated by a hyphen.
    text = re.sub(r"(\d{4})(\d{4})", r"\1-\2", text)

    text = re.sub(r" +", " ", text).strip()

    # Nothing survived the cleaning: return early instead of indexing into
    # an empty string below.
    if not text:
        return ""

    # NOTE(review): this also shortens a trailing "..." to ".." -- confirm
    # that ellipses are not expected in the descriptions.
    if text.endswith(".."):
        text = text[:-1]

    text = text[0].upper() + text[1:]

    if not text.endswith((".", "!", "?")):
        text += "."

    return text

In [None]:
# Reshape every annotation in place: each object name ends up mapped to a dict
# holding its cleaned description and the bounding boxes that belong to it.
for annotation in annotations:
    objects = annotation["objects"]

    for name, raw_description in objects.items():
        description = raw_description
        if len(raw_description) > 0:
            # Cleaning pipeline, applied step by step in a fixed order.
            description = replace_hex_with_escaped_unicode(raw_description)
            description = remove_explicit_errors(description)
            description = remove_brackets_and_backslash(description)
            description = clean_and_format_text(description)

        objects[name] = {"description": description, "bounding_boxes": []}

    # Every raw box starts with the name of the object it belongs to; the
    # next two entries are stored under that object.
    for raw_box in annotation["bounding_boxes"]:
        owner = raw_box[0]
        objects[owner]["bounding_boxes"].append([raw_box[1], raw_box[2]])

    # The boxes now live inside the objects, so the flat list is dropped.
    del annotation["bounding_boxes"]

In [None]:
# Index the annotated objects by painting id once, so every painting is
# resolved with a single dictionary lookup instead of a scan over all
# annotations.
objects_by_painting_id = {}
for annotation in annotations:
    # setdefault keeps the first annotation when an id occurs more than once.
    objects_by_painting_id.setdefault(annotation["painting_id"], annotation["objects"])

for painting in tqdm(paintings_data):
    # Paintings without an annotation get an empty list; the later filtering
    # step only checks the length before iterating, so the list is never
    # treated as a dict.
    painting["objects"] = objects_by_painting_id.get(painting["id"], [])

In [None]:
# Save the first dataset version: every painting with all of its objects.
all_objects_path = f"{PROCESSED_DATA_PATH}paintings_with_all_objects.json"
with open(all_objects_path, "w") as output_file:
    json.dump(paintings_data, output_file, indent=4)

### 2. Create the second version of the dataset, keeping only objects that have both bounding boxes and descriptions

In [None]:
# Keep only the paintings that have at least one object with both a
# description and a bounding box; the kept paintings are updated in place.
filtered_paintings_data = []

for painting in paintings_data:
    objects = painting["objects"]
    if not objects:
        continue

    complete_objects = {
        name: data
        for name, data in objects.items()
        if data["description"] and data["bounding_boxes"]
    }

    if complete_objects:
        painting["objects"] = complete_objects
        filtered_paintings_data.append(painting)

#### 2.1. Remove duplicated object descriptions

In [None]:
# Gather all object descriptions and find the ones used more than once.
object_descriptions = [
    obj_data["description"]
    for painting_data in filtered_paintings_data
    for obj_data in painting_data["objects"].values()
]

duplicates = Counter(object_descriptions)
duplicated_object_descriptions = [
    description for description, count in duplicates.items() if count > 1
]

# The counter has one key per distinct description, so the difference is the
# number of surplus occurrences.
print(
    f"Number of duplicated descriptions that have to be removed: {len(object_descriptions) - len(duplicates)} "
)
print(
    f"Number of unique descriptions that appear more than once: {len(duplicated_object_descriptions)}"
)

In [None]:
# Among objects that share a description, remember the single occurrence
# whose smallest bbox[0] value is the largest. bbox[0] is presumably the
# detection probability of the box -- confirm against the annotation format.
# Each entry maps description -> [painting id, object name, min bbox value].
duplicated_descriptions_lookup = set(duplicated_object_descriptions)  # O(1) membership
duplicated_object_descriptions_data = {}

for painting_data in filtered_paintings_data:
    for obj, obj_data in painting_data["objects"].items():
        obj_description = obj_data["description"]
        if obj_description not in duplicated_descriptions_lookup:
            continue

        min_bbox_prob = min(bbox[0] for bbox in obj_data["bounding_boxes"])
        current_best = duplicated_object_descriptions_data.get(obj_description)

        # Strict comparison: on a tie the occurrence seen first is kept.
        if current_best is None or min_bbox_prob > current_best[-1]:
            duplicated_object_descriptions_data[obj_description] = [
                painting_data["id"],
                obj,
                min_bbox_prob,
            ]

In [None]:
# Drop every object whose description is duplicated elsewhere unless it is the
# occurrence selected as the best one; paintings left without objects are
# omitted from the result.
filtered_paintings_data_wo_duplicates = []

for painting in filtered_paintings_data:
    kept_objects = {}

    for name, data in painting["objects"].items():
        winner = duplicated_object_descriptions_data.get(data["description"])

        # No entry means the description is unique; otherwise the object must
        # match the stored painting id, object name and min bbox value.
        if winner is None or (
            winner[0] == painting["id"]
            and winner[1] == name
            and winner[2] == min(bbox[0] for bbox in data["bounding_boxes"])
        ):
            kept_objects[name] = data

    if kept_objects:
        painting["objects"] = kept_objects
        filtered_paintings_data_wo_duplicates.append(painting)

In [None]:
# Recount the descriptions after de-duplication as a sanity check.
object_descriptions = [
    obj_data["description"]
    for painting_data in filtered_paintings_data_wo_duplicates
    for obj_data in painting_data["objects"].values()
]

duplicates = Counter(object_descriptions)
duplicated_object_descriptions = [
    description for description, count in duplicates.items() if count > 1
]

# The counter has one key per distinct description, so the difference is the
# number of surplus occurrences.
print(
    f"Number of duplicated descriptions that have to be removed: {len(object_descriptions) - len(duplicates)} "
)
print(
    f"Number of unique descriptions that appear more than once: {len(duplicated_object_descriptions)}"
)
print(
    f"Number of paintings removed: {len(filtered_paintings_data) - len(filtered_paintings_data_wo_duplicates)}"
)

In [None]:
# Save the second dataset version: only complete, de-duplicated objects.
filtered_objects_path = f"{PROCESSED_DATA_PATH}paintings_with_filtered_objects.json"
with open(filtered_objects_path, "w") as output_file:
    json.dump(filtered_paintings_data_wo_duplicates, output_file, indent=4)