# Filter annotations
Filter short descriptions, filter annotations based on the judge's scores and analyze the annotation quality at the painting level.

### 0. Import libraries and load data

In [None]:
%load_ext autoreload
%autoreload 2

import sys
import json 
import numpy as np
import polars as pl
from pprint import pprint
import plotly.express as px

COLORS = ["#cd968e", "#acb0e0", "#aecbdc", "#bcd5c3", "#bfbfbf"]
ANNOTATIONS_PATH = "../../data/annotations/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/filtered_paintings/"

sys.path.append("../annotate_dataset/")

from ground_objects import *
from annotate_paintings_utils import *

In [None]:
# Load the unfiltered annotations. JSON is UTF-8 by specification, so the encoding
# is set explicitly instead of relying on the platform-dependent default of open().
with open(f"{ANNOTATIONS_PATH}unfiltered_annotations.json", encoding="utf-8") as f:
    all_annotations = json.load(f)

# Per-painting data; the "id" and "description" columns are read later in this notebook.
paintings_data = pl.read_json(f"{INTERMEDIATE_DATA_PATH}filtered_paintings_enhanced_data.json")

### 1. Filter short descriptions, filter judge outputs and re-compute judge scores

#### 1.1. Filter short descriptions

In [None]:
def is_informative_text(obj, text):
    """Tell whether ``text`` says more about ``obj`` than its name alone.

    Both strings are normalized with ``clean_object_name`` and split on single
    spaces. The words of ``text`` that do not occur in ``obj`` (ignoring empty
    tokens) are the "informative" ones.

    Returns:
        False when ``text`` is non-empty and contributes at most one
        informative word; True otherwise (an empty ``text`` is always True).
    """
    text_words = set(clean_object_name(text).split(" "))
    object_words = set(clean_object_name(obj).split(" "))

    # Empty strings appear when the cleaned text contains consecutive spaces.
    extra_words = {word for word in text_words - object_words if word}

    return len(extra_words) > 1 or len(text) == 0

In [None]:
# Criteria whose per-description judge scores must stay aligned with the kept descriptions.
JUDGED_CRITERIA = ("factual_accuracy", "coherence", "grounding_potential", "completeness")

for annotation in all_annotations:
    surviving_scores = {criterion: [] for criterion in JUDGED_CRITERIA}

    # Only objects that carry a description have judge scores; the position of an
    # object in this list is its index in every per-criterion score list.
    described_objects = [
        (name, entry) for name, entry in annotation["objects"].items() if len(entry[1]) != 0
    ]

    for position, (name, entry) in enumerate(described_objects):
        if is_informative_text(name, entry[-1]):
            judgement = annotation["description_judgement"]
            for criterion, scores in surviving_scores.items():
                scores.append(judgement[criterion][position])
        else:
            # Blank out the uninformative description (and its spans); its scores are dropped.
            annotation["objects"][name] = [[""], ""]

    for criterion, scores in surviving_scores.items():
        annotation["description_judgement"][criterion] = scores

#### 1.2. Filter judge outputs and re-compute judge scores

In [None]:
def match_fuzzy(object_name, objects):
    """Return True if ``object_name`` loosely matches any entry of ``objects``.

    Names are normalized with ``clean_object_name`` and split on single spaces.
    Two names match when the word set of one is a subset of the word set of the
    other (in either direction).
    """
    target_words = set(clean_object_name(object_name).split(" "))

    for candidate in objects:
        candidate_words = set(clean_object_name(candidate).split(" "))

        if target_words <= candidate_words or candidate_words <= target_words:
            return True

    return False

In [None]:
def is_sublist_slice(object_words, description_words):
    """Return True if ``object_words`` occurs as a contiguous run in ``description_words``.

    An empty ``object_words`` always matches. A run longer than
    ``description_words`` never matches.
    """
    run_length = len(object_words)

    if run_length == 0:
        return True

    # Number of start positions at which a full-length window still fits;
    # zero or negative when the run is longer than the description.
    window_count = len(description_words) - run_length + 1

    return any(
        description_words[start : start + run_length] == object_words
        for start in range(window_count)
    )

In [None]:
# For every annotation: prune the judge's false-negative / false-positive objects and
# spans, then recompute object- and span-level precision and recall from what is left.
for annotation_index in range(len(all_annotations)):
    annotation = all_annotations[annotation_index]
    extraction_judgement = annotation["extraction_judgement"]

    # Description of the painting whose "id" equals this annotation's "painting_id";
    # the [0] assumes at least one matching row exists.
    description = paintings_data.filter(pl.col("id") == annotation["painting_id"])["description"][0]

    # Every extracted object name counts as a (tentative) true-positive object.
    tp_objects = list(annotation["objects"].keys())

    # Non-empty spans of all extracted objects (first element of each object's value).
    tp_spans = []

    for spans in annotation["objects"].values():

        tp_spans.extend([span for span in spans[0] if len(span) != 0])
    # Keep a judge-reported false-negative object only if it does not fuzzy-match an
    # extracted object and its words occur contiguously in the cleaned description.
    # NOTE(review): fn_object is split without clean_object_name while the description
    # is cleaned first - confirm this asymmetry is intended.
    fn_objects = []

    for fn_object in extraction_judgement["false_negative_objects"]:

        if not match_fuzzy(fn_object, tp_objects) and is_sublist_slice(
            fn_object.split(" "), clean_object_name(description).split(" ")
        ):

            fn_objects.append(fn_object)
    # Also accept objects that are only named in "false_negative_spans", without duplicates.
    for fn_object, _ in extraction_judgement["false_negative_spans"]:

        if (
            not match_fuzzy(fn_object, tp_objects)
            and is_sublist_slice(fn_object.split(" "), clean_object_name(description).split(" "))
            and fn_object not in fn_objects
        ):

            fn_objects.append(fn_object)
    # For the kept false-negative objects, keep only spans that are informative, are a
    # literal substring of the description, are non-empty and do not fuzzy-match a span
    # that was already extracted.
    fn_spans = []

    for object_name, spans in extraction_judgement["false_negative_spans"]:

        if object_name in fn_objects:

            kept_spans = []

            for span in spans:

                if (
                    is_informative_text(object_name, span)
                    and span in description
                    and len(span) != 0
                    and not match_fuzzy(span, tp_spans)
                ):

                    kept_spans.append(span)
            if len(kept_spans) != 0:

                fn_spans.append([object_name, kept_spans])
    # Keep a judge-reported false-positive object only if it does not fuzzy-match a
    # false-negative object and its words occur contiguously in the cleaned description.
    # The false-negative list read here is still the judge's original one: it is only
    # overwritten with fn_objects at the end of this iteration.
    fp_objects = []

    for fp_object in extraction_judgement["false_positive_objects"]:

        if not match_fuzzy(fp_object, extraction_judgement["false_negative_objects"]) and is_sublist_slice(
            fp_object.split(" "), clean_object_name(description).split(" ")
        ):

            fp_objects.append(fp_object)
    # Also accept objects that are only named in "false_positive_spans", without duplicates.
    for fp_object, _ in extraction_judgement["false_positive_spans"]:

        if (
            not match_fuzzy(fp_object, extraction_judgement["false_negative_objects"])
            and is_sublist_slice(fp_object.split(" "), clean_object_name(description).split(" "))
            and fp_object not in fp_objects
        ):

            fp_objects.append(fp_object)
    # For the kept false-positive objects, keep only spans that are informative, are a
    # literal substring of the description, are non-empty and were actually extracted
    # (exact membership in tp_spans).
    fp_spans = []

    for object_name, spans in extraction_judgement["false_positive_spans"]:

        if object_name in fp_objects:

            kept_spans = []

            for span in spans:

                if (
                    is_informative_text(object_name, span)
                    and span in description
                    and span in tp_spans
                    and len(span) != 0
                ):

                    kept_spans.append(span)
            if len(kept_spans) != 0:

                fp_spans.append([object_name, kept_spans])
    # Flatten the [object_name, spans] pairs into plain span lists for counting.
    flatten_fp_spans = []

    for fp_span in fp_spans:

        flatten_fp_spans.extend(fp_span[1])
    flatten_fn_spans = []

    for fn_span in fn_spans:

        flatten_fn_spans.extend(fn_span[1])
    # True positives = extracted items minus the ones confirmed as false positives.
    # NOTE(review): fp_objects are cleaned before the set difference but tp_objects are
    # not - presumably the extracted object names are already in cleaned form; verify.
    tpo_count = len(
        list(set(tp_objects).difference(set([clean_object_name(object_name) for object_name in fp_objects])))
    )

    # Set-based, so duplicate spans are counted once here, whereas fps_count / fns_count
    # below count list entries (duplicates included).
    tps_count = len(list(set(tp_spans).difference(set(flatten_fp_spans))))

    fpo_count = len(fp_objects)

    fps_count = len(flatten_fp_spans)

    fno_count = len(fn_objects)

    fns_count = len(flatten_fn_spans)

    # Object precision / recall fall back to 0.0 when the denominator is zero.
    if tpo_count + fpo_count != 0:

        objects_precision = tpo_count / (tpo_count + fpo_count)
    else:

        objects_precision = 0.0
    # Span precision is defined as 1.0 when there are no false-positive spans. The final
    # else cannot be reached: fps_count is non-zero there, so the sum is non-zero too.
    if fps_count == 0:

        spans_precision = 1.0
    elif tps_count + fps_count != 0:

        spans_precision = tps_count / (tps_count + fps_count)
    else:

        spans_precision = 0.0
    if tpo_count + fno_count != 0:

        objects_recall = tpo_count / (tpo_count + fno_count)
    else:

        objects_recall = 0.0
    # Span recall mirrors span precision: 1.0 when there are no false-negative spans,
    # and the final else is unreachable for the same reason.
    if fns_count == 0:

        spans_recall = 1.0
    elif tps_count + fns_count != 0:

        spans_recall = tps_count / (tps_count + fns_count)
    else:

        spans_recall = 0.0
    # Write the pruned lists and the recomputed scores back into the annotation.
    all_annotations[annotation_index]["extraction_judgement"]["false_negative_objects"] = fn_objects

    all_annotations[annotation_index]["extraction_judgement"]["false_negative_spans"] = fn_spans

    all_annotations[annotation_index]["extraction_judgement"]["false_positive_objects"] = fp_objects

    all_annotations[annotation_index]["extraction_judgement"]["false_positive_spans"] = fp_spans

    all_annotations[annotation_index]["extraction_judgement"]["objects_precision"] = objects_precision

    all_annotations[annotation_index]["extraction_judgement"]["objects_recall"] = objects_recall

    all_annotations[annotation_index]["extraction_judgement"]["spans_recall"] = spans_recall

    all_annotations[annotation_index]["extraction_judgement"]["spans_precision"] = spans_precision


### 2. Compute aggregated judge scores

In [None]:
def _f_beta(precision, recall, beta=0.5):
    """Return the F-beta score of ``precision`` and ``recall``.

    With ``beta`` < 1 precision weighs more than recall. Returns 0.0 when both
    inputs are zero, to avoid dividing by zero.
    """
    if recall + precision == 0:
        return 0.0

    beta_squared = beta**2
    return ((1 + beta_squared) * recall * precision) / (recall + beta_squared * precision)


def _mean_score(scores, default=5.0):
    """Return the mean of ``scores``, or ``default`` when ``scores`` is empty."""
    if len(scores) != 0:
        return np.array(scores).mean()

    return default


# One list per column; every painting appends exactly one value to each list.
judge_scores = {
    "painting_id": [],
    "objects_recall": [],
    "objects_precision": [],
    "objects_f0.5": [],
    "spans_recall": [],
    "spans_precision": [],
    "spans_f0.5": [],
    "extraction_score": [],
    "factual_accuracy": [],
    "coherence": [],
    "completeness": [],
    "description_score": [],
    "annotation_score": [],
}

for annotation in all_annotations:
    extraction_judgement = annotation["extraction_judgement"]
    description_judgement = annotation["description_judgement"]

    objects_recall = extraction_judgement["objects_recall"]
    objects_precision = extraction_judgement["objects_precision"]
    objects_f_score = _f_beta(objects_precision, objects_recall)

    spans_recall = extraction_judgement["spans_recall"]
    spans_precision = extraction_judgement["spans_precision"]
    spans_f_score = _f_beta(spans_precision, spans_recall)

    extraction_score = (objects_f_score + spans_f_score) / 2

    # Each criterion is averaged over its OWN per-description scores; a painting with
    # no judged description gets the default 5.0 (presumably the top of a 1-5 scale,
    # as the (score - 1) / 4 normalization below suggests).
    factual_accuracy_score = _mean_score(description_judgement["factual_accuracy"])
    coherence_score = _mean_score(description_judgement["coherence"])
    completeness_score = _mean_score(description_judgement["completeness"])

    description_score = (factual_accuracy_score + coherence_score + completeness_score) / 3

    # The description score is rescaled with (score - 1) / 4 before being averaged
    # with the two F0.5 scores.
    annotation_score = (objects_f_score + spans_f_score + ((description_score - 1) / 4)) / 3

    judge_scores["painting_id"].append(annotation["painting_id"])
    judge_scores["objects_recall"].append(objects_recall)
    judge_scores["objects_precision"].append(objects_precision)
    judge_scores["objects_f0.5"].append(objects_f_score)
    judge_scores["spans_recall"].append(spans_recall)
    judge_scores["spans_precision"].append(spans_precision)
    judge_scores["spans_f0.5"].append(spans_f_score)
    judge_scores["extraction_score"].append(extraction_score)
    judge_scores["factual_accuracy"].append(factual_accuracy_score)
    judge_scores["coherence"].append(coherence_score)
    judge_scores["completeness"].append(completeness_score)
    judge_scores["description_score"].append(description_score)
    judge_scores["annotation_score"].append(annotation_score)


judge_scores_df = pl.from_dict(judge_scores)
judge_scores_df

### 3. Analyze aggregated judge scores and filter the dataset

In [None]:
def get_iqr_bounds(scores):
    """Return the ``(lower, upper)`` outlier fences of ``scores``.

    The fences lie 1.5 interquartile ranges below the 25th percentile and
    above the 75th percentile, respectively.
    """
    first_quartile, third_quartile = np.percentile(scores, [25, 75])
    whisker = 1.5 * (third_quartile - first_quartile)

    return first_quartile - whisker, third_quartile + whisker

In [None]:
def print_description(description, words_per_line=25):
    """Print ``description`` wrapped to at most ``words_per_line`` words per line.

    Newlines in the input are treated as spaces, and words are whatever
    results from splitting on single space characters.
    """
    words = description.replace("\n", " ").split(" ")
    line_starts = range(0, len(words), words_per_line)

    for start in line_starts:
        print(" ".join(words[start : start + words_per_line]))

In [None]:
def plot_score_distribution(judge_scores, score_name):
    """Show a histogram (with a marginal box plot) of one judge score.

    Args:
        judge_scores: Mapping from score name to an iterable of values.
        score_name: Key of the score to plot; values equal to -1 are skipped.
    """
    plotted_values = [value for value in judge_scores[score_name] if value != -1]

    histogram = px.histogram(
        plotted_values,
        nbins=100,
        marginal="box",
        color_discrete_sequence=COLORS[1:2],
        labels={"value": f"{score_name} value"},
        title=f"Distribution of {score_name} scores",
    )

    histogram.update_traces(showlegend=False)
    histogram.show()

In [None]:
def plot_scores_correlation_matrix(judge_scores_df):
    """Show a heatmap of the pairwise correlations between the judge scores.

    Args:
        judge_scores_df: Polars DataFrame holding the score columns selected
            below; the correlation itself is computed by pandas ``corr()``.
    """
    score_columns = (
        "objects_f0.5",
        "spans_f0.5",
        "factual_accuracy",
        "coherence",
        "completeness",
        "description_score",
        "annotation_score",
    )
    correlations = judge_scores_df.select(*score_columns).to_pandas().corr()

    heatmap = px.imshow(
        correlations,
        text_auto=True,
        color_continuous_scale=[COLORS[0], COLORS[2]],
        range_color=[-1, 1],
        title="Correlation Matrix of Judge Evaluation Scores",
    )
    heatmap.update_layout(
        margin={"l": 10, "r": 10, "t": 40, "b": 10},
        height=500 * 1.5,
        width=625 * 1.5,
        title_x=0.47,
    )

    heatmap.show()

In [None]:
# Correlation heatmap of the aggregated judge scores, before any painting is filtered out.
plot_scores_correlation_matrix(judge_scores_df)

In [None]:
# Distribution of every aggregated judge score, one figure per score.
for score_name in (
    "spans_f0.5",
    "objects_f0.5",
    "extraction_score",
    "factual_accuracy",
    "coherence",
    "completeness",
    "description_score",
    "annotation_score",
):
    plot_score_distribution(judge_scores, score_name)

In [None]:
# IQR fences of the description score; computed for reference, the selection below
# uses a fixed 3.5 threshold instead.
lower_bound, upper_bound = get_iqr_bounds(judge_scores["description_score"])

# Up to three paintings with description_score < 3.5, drawn in a reproducible shuffled order.
# Alternative band to inspect: (description_score >= 3) & (description_score < 3.5).
low_description_scores_df = judge_scores_df.filter(pl.col("description_score") < 3.5)
shuffled_painting_ids = low_description_scores_df.sample(fraction=1.0, shuffle=True, seed=42)[
    "painting_id"
]
painting_ids = shuffled_painting_ids.to_list()[:3]
painting_ids

In [None]:
# Manual inspection: for each sampled painting print its id, description, extracted
# objects and both judge outputs, then show the image with its bounding boxes.
for painting_id in painting_ids:
    _, image = load_image(painting_id)

    matching_annotations = [
        candidate for candidate in all_annotations if candidate["painting_id"] == painting_id
    ]
    annotation = matching_annotations[0]
    description = paintings_data.filter(pl.col("id") == painting_id)["description"][0]

    print(annotation["painting_id"])
    print_description(description)
    for section in ("objects", "description_judgement", "extraction_judgement"):
        pprint(annotation[section], indent=2)
    display_annotated_image(image, annotation["bounding_boxes"], True)

#### 3.1. Filter paintings with low-quality objects and spans

In [None]:
# A painting is dropped when both F0.5 scores are zero or its extraction score is
# at most 0.215; everything else is kept.
both_f_scores_zero = (pl.col("objects_f0.5") == 0) & (pl.col("spans_f0.5") == 0)
extraction_score_too_low = pl.col("extraction_score") <= 0.215

judge_scores_df = judge_scores_df.filter(~(both_f_scores_zero | extraction_score_too_low))

# Keep the dict view in sync with the filtered frame.
judge_scores = judge_scores_df.to_dict()

In [None]:
# Correlation heatmap recomputed on the paintings that passed the extraction-score filter.
plot_scores_correlation_matrix(judge_scores_df)

In [None]:
# Score distributions of the paintings that passed the extraction-score filter.
for score_name in (
    "spans_f0.5",
    "objects_f0.5",
    "extraction_score",
    "factual_accuracy",
    "coherence",
    "completeness",
    "description_score",
    "annotation_score",
):
    plot_score_distribution(judge_scores, score_name)

In [None]:
# Pool the per-description judge scores of all kept paintings into flat lists.
coherence = []
completeness = []
factual_accuracy = []
kept_painting_ids = judge_scores["painting_id"].to_list()

for annotation in all_annotations:
    if annotation["painting_id"] in kept_painting_ids:
        judgement = annotation["description_judgement"]

        coherence.extend(judgement["coherence"])
        completeness.extend(judgement["completeness"])
        factual_accuracy.extend(judgement["factual_accuracy"])

In [None]:
# Histogram (with a marginal box plot) of the pooled per-description coherence scores.
coherence_histogram = px.histogram(
    coherence,
    nbins=100,
    marginal="box",
    color_discrete_sequence=COLORS[1:2],
)
coherence_histogram.update_traces(showlegend=False)
coherence_histogram.show()

#### 3.2. Filter descriptions with low quality

In [None]:
# Build the final annotations: keep only the paintings that passed the score filter and
# blank out every description scored below 3 on any of the criteria listed below.
filtered_annotations = []
removed_descriptions_counter = 0
affected_paintings_counter = 0
kept_painting_ids = judge_scores["painting_id"].to_list()
quality_criteria = ("factual_accuracy", "coherence", "completeness")

for annotation in all_annotations:
    if annotation["painting_id"] not in kept_painting_ids:
        continue

    updated_annotation = {
        "painting_id": annotation["painting_id"],
        "bounding_boxes": annotation["bounding_boxes"],
    }

    updated_objects = {}
    removed_in_painting = 0
    # Judge scores exist only for objects with a description, so this index advances
    # only when such an object is visited.
    score_position = 0

    for obj, desc in annotation["objects"].items():
        if len(desc[-1]) == 0:
            # No description, hence no scores to check.
            updated_objects[obj] = desc
            continue

        passes_all_criteria = all(
            annotation["description_judgement"][criterion][score_position] >= 3
            for criterion in quality_criteria
        )
        score_position += 1

        if passes_all_criteria:
            updated_objects[obj] = desc
        else:
            updated_objects[obj] = [[""], ""]
            removed_in_painting += 1

    removed_descriptions_counter += removed_in_painting
    if removed_in_painting > 0:
        affected_paintings_counter += 1

    updated_annotation["objects"] = updated_objects
    filtered_annotations.append(updated_annotation)

In [None]:
# Summary of the low-quality description filter.
print(
    f"Number of descriptions removed: {removed_descriptions_counter}",
    f"Number of affected paintings: {affected_paintings_counter}",
    sep="\n",
)

In [None]:
# Persist the filtered annotations next to the unfiltered ones.
filtered_annotations_path = f"{ANNOTATIONS_PATH}filtered_annotations.json"

with open(filtered_annotations_path, "w") as output_file:
    json.dump(filtered_annotations, output_file, indent=4)