# Merge annotations
This notebook merges all annotation chunks into a single JSON file.

### 0. Import libraries and load data

In [None]:
import os
import re
import json
import numpy as np
import polars as pl
from tqdm import tqdm
from collections import Counter
from transformers import AutoTokenizer

# Directory with the per-chunk `annotations_*.json` files read by this notebook;
# the merged `unfiltered_annotations.json` is written to the same directory.
ANNOTATIONS_PATH = "../../data/annotations/"
# Directory with `filtered_paintings_enhanced_data.json`, used for the failure analysis.
INTERMEDIATE_DATA_PATH = "../../data/intermediate/filtered_paintings/"

In [None]:
# Load every first-trial annotation chunk (`annotations_*.json`) from ANNOTATIONS_PATH.
regex_pattern = r"^annotations_.*\.json$"
annotation_chunks = []

# `os.listdir` returns entries in arbitrary order; sorting makes the chunk order
# (and therefore the first-occurrence-wins de-duplication done later) reproducible.
for filename in sorted(os.listdir(ANNOTATIONS_PATH)):
    # The second-trial file is loaded separately further down, so skip exactly
    # that file. `==` is the intended comparison: `filename in "<literal>"`
    # is a substring test, not an equality test.
    if filename == "annotations_0_610.json":
        continue

    if re.match(regex_pattern, filename):
        with open(f"{ANNOTATIONS_PATH}{filename}", "r") as f:
            annotation_chunks.append(json.load(f))

### 1. Analyze results of the first trial

In [None]:
# Three independent accumulators for the painting ids reported by the first-trial chunks.
paintings_ids_unprocessed, paintings_ids_to_check, paintings_ids_wo_objects = [], [], []

In [None]:
# Map each chunk key to the module-level list that collects its ids.
id_accumulators = {
    "paintings_ids_unprocessed": paintings_ids_unprocessed,
    "paintings_ids_to_check": paintings_ids_to_check,
    "paintings_ids_wo_objects": paintings_ids_wo_objects,
}

for chunk in annotation_chunks:
    for chunk_key, collected_ids in id_accumulators.items():
        collected_ids.extend(chunk[chunk_key])

In [None]:
# Ids that failed in the annotator or the judge stage, without duplicates.
indices = list(set(paintings_ids_unprocessed + paintings_ids_to_check))

# Count distinct ids per category before reporting them.
num_annotator_failures = len(set(paintings_ids_unprocessed))
num_judge_failures = len(set(paintings_ids_to_check))
num_without_objects = len(set(paintings_ids_wo_objects))

print(f"The number of paintings for which the annotator did not work correctly: {num_annotator_failures}")
print(f"The number of paintings for which the judge did not work correctly: {num_judge_failures}")
print(f"The number of paintings without objects: {num_without_objects}")

### 2. Analyze results of the second trial
The paintings for which the annotator / judge initially failed were processed again.

In [None]:
# Read the results of the second annotation pass.
second_trial_path = f"{ANNOTATIONS_PATH}annotations_0_610.json"
with open(second_trial_path, "r") as second_trial_file:
    second_trial_annotations = json.load(second_trial_file)

# Ids still failing after the second pass.
paintings_ids_unprocessed2 = set(second_trial_annotations["paintings_ids_unprocessed"])
paintings_ids_to_check2 = set(second_trial_annotations["paintings_ids_to_check"])
# Paintings without objects accumulate over both trials.
paintings_ids_wo_objects2 = set(paintings_ids_wo_objects) | set(
    second_trial_annotations["paintings_ids_wo_objects"]
)

In [None]:
# Add the second-trial results to the pool and flatten all chunks into one list.
annotation_chunks.append(second_trial_annotations)
all_annotations = [
    annotation
    for chunk in annotation_chunks
    for annotation in chunk["annotations"]
]

# Every judgement criterion must have as many entries as there are non-empty
# descriptions (the last element of each value in `objects`).
judgement_criteria = ("factual_accuracy", "coherence", "completeness", "grounding_potential")
painting_ids_incorrect_judgements = set()

for annotation in all_annotations:
    descriptions = [details[-1] for details in annotation["objects"].values() if details[-1] != ""]
    judgement = annotation["description_judgement"]

    # Flag the painting as soon as one criterion has a mismatching length.
    if any(len(judgement[criterion]) != len(descriptions) for criterion in judgement_criteria):
        painting_ids_incorrect_judgements.add(annotation["painting_id"])

In [None]:
# Keep the first annotation seen for each painting id and drop paintings whose
# judgement lists were inconsistent.
seen_ids = set()
unique_annotations = []

for annotation in all_annotations:
    painting_id = annotation.get("painting_id")

    if painting_id in seen_ids or painting_id in painting_ids_incorrect_judgements:
        continue

    seen_ids.add(painting_id)
    unique_annotations.append(annotation)

In [None]:
# Denominator for every percentage below, named once instead of being repeated
# as a literal. NOTE(review): presumably the total number of paintings that
# went through annotation — confirm against the dataset.
TOTAL_PAINTINGS = 12078

print(
    f"The final number of paintings for which the annotator did not work correctly: {len(paintings_ids_unprocessed2)} ({len(paintings_ids_unprocessed2) / TOTAL_PAINTINGS * 100}%)"
)
print(
    f"The final number of paintings for which the judge did not work correctly: {len(painting_ids_incorrect_judgements)} ({len(painting_ids_incorrect_judgements) / TOTAL_PAINTINGS * 100}%)"
)
print(
    f"The final number of paintings without objects: {len(paintings_ids_wo_objects2)} ({len(paintings_ids_wo_objects2) / TOTAL_PAINTINGS * 100}%)"
)
# The original message read "number of of annotated paintings"; the doubled word is removed.
print(
    f"The final number of annotated paintings: {len(unique_annotations)} ({len(unique_annotations) / TOTAL_PAINTINGS * 100}%)"
)

#### 2.1. Analyze the cause of unprocessed paintings

In [None]:
# Load the filtered paintings metadata and keep only the rows whose id is still
# unprocessed after the second trial.
filtered_paintings = pl.read_json(f"{INTERMEDIATE_DATA_PATH}filtered_paintings_enhanced_data.json")
is_unprocessed = pl.col("id").is_in(paintings_ids_unprocessed2)
unprocessed_paintings = filtered_paintings.filter(is_unprocessed)
unprocessed_paintings

In [None]:
# Number of unprocessed paintings per source, most frequent first.
# Observation from the original run: the large majority are from MET.
source_counts = unprocessed_paintings["source"].value_counts()
source_counts.sort("count", descending=True)

In [None]:
# Length of each unprocessed description, measured in space-separated words.
desc_length = [
    len(description.split(" "))
    for description in unprocessed_paintings["description"].to_list()
]

# Median description length (50th percentile).
np.percentile(np.array(desc_length), 50)

In [None]:
def print_description(description, words_per_line=15):
    """Print `description` wrapped to at most `words_per_line` words per line.

    Words are split on any run of whitespace, so newlines, tabs and repeated
    spaces never produce empty "words" that would make a line shorter than
    requested. A description with no words prints nothing.

    Args:
        description: Text to print.
        words_per_line: Maximum number of words on each printed line.

    Raises:
        ValueError: If `words_per_line` is 0 (a `range` step cannot be zero).
    """
    description_words = description.split()

    for start in range(0, len(description_words), words_per_line):
        print(" ".join(description_words[start:start + words_per_line]))

In [None]:
# Observation from the original run: the paintings that failed to be annotated
# either have descriptions that do not describe the painting or have very long
# descriptions.
# Show at most 10 examples; capping at the frame length avoids indexing past
# the last row when fewer than 10 paintings are unprocessed.
for index in range(min(10, len(unprocessed_paintings))):
    print(unprocessed_paintings[index]["id"][0])
    print_description(unprocessed_paintings[index]["description"][0])
    print(128 * "-")

### 3. Store results

In [None]:
# Write the de-duplicated annotations to a single JSON file.
output_path = f"{ANNOTATIONS_PATH}unfiltered_annotations.json"
with open(output_path, "w") as output_file:
    json.dump(unique_annotations, output_file, indent=4)

In [None]:
# Total number of object entries across all kept annotations.
sum(len(annotation["objects"]) for annotation in unique_annotations)

In [None]:
# Collect the keys of every annotation's `objects` mapping and count how often
# each one occurs. The list is built directly rather than through a
# comprehension that was evaluated only for its side effects.
objects = [
    object_name
    for annotation in unique_annotations
    for object_name in annotation["objects"]
]
Counter(objects)

In [None]:
# Total number of bounding boxes across all kept annotations.
sum(len(annotation["bounding_boxes"]) for annotation in unique_annotations)

In [None]:
# Last element of every value in each annotation's `objects` mapping (used as
# the object's description in the cells below). Built directly instead of via
# a side-effect comprehension, and without reusing the name `objects` for the
# inner loop variable.
object_descriptions = [
    object_details[-1]
    for annotation in unique_annotations
    for object_details in annotation["objects"].values()
]

In [None]:
# Number of distinct object descriptions.
unique_object_descriptions = set(object_descriptions)
len(unique_object_descriptions)

In [None]:
# Tokenizer used below to measure description lengths in tokens.
tokenizer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

In [None]:
# Token count of every object description (tqdm shows progress).
descriptions_num_tokens = [
    len(tokenizer.tokenize(description))
    for description in tqdm(object_descriptions)
]

In [None]:
# Longest object description, in tokens.
max_description_tokens = max(descriptions_num_tokens)
max_description_tokens

In [None]:
import plotly.graph_objects as go

# Horizontal box plot: the token counts are passed as `x`, so the value axis
# is the x-axis and that is the axis that needs the label.
box_trace = go.Box(x=descriptions_num_tokens, name="Tokens per description")

fig = go.Figure(box_trace)

# Descriptive labels replace the template placeholders ("My Data", "Values").
fig.update_layout(
    title="Number of tokens per object description",
    xaxis_title="Number of tokens",
    showlegend=False,  # a single box needs no legend
)

fig.show()

In [None]:
import numpy as np

In [None]:
# 99.5th percentile of the per-description token counts.
token_counts = np.array(descriptions_num_tokens)
np.percentile(token_counts, 99.5)

In [None]:
# Number of descriptions longer than 256 tokens.
sum(1 for num_tokens in descriptions_num_tokens if num_tokens > 256)