# Annotate paintings
This notebook compares the annotation results on the mini test and validation sets with and without the judge, and before and after prompt engineering.

### 0. Import libraries

In [None]:
import os
import json
import polars as pl
import plotly.express as px

# Directory whose files are loaded as JSON metric dicts in section 1.
RESULTS_PATH = "../../experiments/prompting/"
# Color sequence handed to the strip plots via `color_discrete_sequence`.
COLORS = ["#acb0e0", "#bcd5c3", "#cd968e", "#d7d8d3"]

### 1. Load results

In [None]:
# Metric dicts loaded from RESULTS_PATH, one list per combination of prompt
# variant (baseline / enhanced), mini-set (test / val) and judge feedback
# (w_feedback / wo_feedback), as encoded in the result file names.
baseline_test_input_wo_judge = []
baseline_val_input_wo_judge = []
baseline_test_input_w_judge = []
baseline_val_input_w_judge = []
enhanced_test_input_w_judge = []
enhanced_val_input_w_judge = []
enhanced_test_input_wo_judge = []
enhanced_val_input_wo_judge = []

# (set/prompt tag, feedback tag, destination list). "w_feedback" is not a
# substring of "wo_feedback", so at most one entry matches a given file name.
buckets = [
    ("mini_test_set_baseline", "wo_feedback", baseline_test_input_wo_judge),
    ("mini_val_set_baseline", "wo_feedback", baseline_val_input_wo_judge),
    ("mini_test_set_baseline", "w_feedback", baseline_test_input_w_judge),
    ("mini_val_set_baseline", "w_feedback", baseline_val_input_w_judge),
    ("mini_val_set_enhanced", "w_feedback", enhanced_val_input_w_judge),
    ("mini_test_set_enhanced", "w_feedback", enhanced_test_input_w_judge),
    ("mini_val_set_enhanced", "wo_feedback", enhanced_val_input_wo_judge),
    ("mini_test_set_enhanced", "wo_feedback", enhanced_test_input_wo_judge),
]

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the resulting tables reproducible between runs and machines.
for filename in sorted(os.listdir(RESULTS_PATH)):
    path = os.path.join(RESULTS_PATH, filename)

    # Sub-directories (e.g. notebook checkpoints) cannot be opened as files.
    if not os.path.isfile(path):
        continue

    for set_tag, feedback_tag, bucket in buckets:
        if set_tag in filename and feedback_tag in filename:
            # Only files that belong to an experiment are parsed, so unrelated
            # files in the directory no longer break the cell.
            with open(path, encoding="utf-8") as f:
                metrics = json.load(f)
            bucket.append(metrics)
            break

In [None]:
def get_results(results, experiment_name, set_name):
    """Gather the tracked metrics of several runs into one labelled frame.

    A metric is taken from the top level of a run's result dict when it is
    present there, and from the nested ``"span_similarity_metrics"`` dict
    otherwise.

    Args:
        results: Iterable of result dicts, one per run.
        experiment_name: Value of the constant ``"experiment"`` column.
        set_name: Value of the constant ``"set"`` column.

    Returns:
        A polars frame with one row per run, one column per tracked metric
        and the two label columns ``"experiment"`` and ``"set"``.

    Raises:
        KeyError: If a metric is found neither at the top level of a result
            nor under ``"span_similarity_metrics"``.
    """
    metric_names = (
        "total_token_count_annotator",
        "total_token_count_judge",
        "micro_f1_objects",
        "micro_f1_spans",
        "cosine similarity",
        "Levenshtein distance",
        "delete percentage",
        "false positive percentage",
        "coverage percentage",
        "map_50",
        "map_50_95",
    )

    def lookup(run, name):
        # Top-level value wins; otherwise fall back to the nested span metrics.
        if name in run:
            return run[name]
        return run["span_similarity_metrics"][name]

    # One pass over the runs (row-major), then transpose into named columns.
    rows = [[lookup(run, name) for name in metric_names] for run in results]
    columns = {
        name: [row[position] for row in rows]
        for position, name in enumerate(metric_names)
    }

    label_columns = [
        pl.lit(experiment_name).alias("experiment"),
        pl.lit(set_name).alias("set"),
    ]
    return pl.from_dict(columns).with_columns(label_columns)

In [None]:
# (loaded runs, experiment label, set label) for every pipeline
# configuration, listed in the order the frames are concatenated.
configurations = [
    (baseline_test_input_w_judge, "baseline prompt & judge", "test"),
    (baseline_val_input_w_judge, "baseline prompt & judge", "val"),
    (baseline_test_input_wo_judge, "baseline prompt", "test"),
    (baseline_val_input_wo_judge, "baseline prompt", "val"),
    (enhanced_test_input_w_judge, "prompt engineering & judge", "test"),
    (enhanced_val_input_w_judge, "prompt engineering & judge", "val"),
    (enhanced_test_input_wo_judge, "prompt engineering", "test"),
    (enhanced_val_input_wo_judge, "prompt engineering", "val"),
]
frames = [get_results(runs, label, split) for runs, label, split in configurations]

# Keep each per-configuration frame reachable under its own name.
(
    baseline_test,
    baseline_val,
    baseline_test_wo_judge,
    baseline_val_wo_judge,
    enhanced_test,
    enhanced_val,
    enhanced_test_wo_judge,
    enhanced_val_wo_judge,
) = frames

# Stack all configurations into a single long table.
all_results = pl.concat(frames)
all_results

### 2. Preprocess results

In [None]:
# Replace each listed metric by its reciprocal, stored as "1/<metric>".
# Presumably this makes "higher is better" hold for every column -- confirm.
# NOTE(review): a metric value of 0 gives a non-finite reciprocal; verify
# that none occur in the loaded results.
inverted_metrics = ["Levenshtein distance", "delete percentage", "false positive percentage"]
all_results = all_results.with_columns(
    [(1 / pl.col(name)).alias(f"1/{name}") for name in inverted_metrics]
).drop(inverted_metrics)
all_results

In [None]:
# Scale every span-similarity column by the span-level micro F1 of the same
# run and keep only the scaled version, named "weighted <metric>".
weighted_metrics = [
    "cosine similarity",
    "coverage percentage",
    "1/Levenshtein distance",
    "1/delete percentage",
    "1/false positive percentage",
]
span_f1 = pl.col("micro_f1_spans")
all_results = all_results.with_columns(
    [(pl.col(name) * span_f1).alias(f"weighted {name}") for name in weighted_metrics]
).drop(weighted_metrics)
all_results

In [None]:
# Column means per (experiment, set) pair.
agg_results = all_results.group_by("experiment", "set").mean()

# One table per set, rows ordered by experiment name. To narrow a table to
# the headline metrics, append
# .select("experiment", "micro_f1_objects", "micro_f1_spans", "map_50",
#         "weighted 1/Levenshtein distance") to the expression below.
for split in ("test", "val"):
    display(agg_results.filter(pl.col("set") == split).sort("experiment"))

In [None]:
# Strip plot of the object-level micro F1: one marker per row of
# all_results, grouped by set on the x axis and colored by experiment.
axis_labels = {
    "set": "Dataset",
    "micro_f1_objects": "Metric Value",
    "experiment": "Pipeline Type",
}
fig = px.strip(
    all_results,
    x="set",
    y="micro_f1_objects",
    color="experiment",
    color_discrete_sequence=COLORS,
    labels=axis_labels,
    title="Comparison of Object-Level Micro F1 Scores",
)
# Enlarge the markers of every trace.
fig.update_traces(marker={"size": 18})
fig.show()

In [None]:
# Strip plot of the "map_50" column: one marker per row of all_results,
# grouped by set on the x axis and colored by experiment.
axis_labels = {
    "set": "Dataset",
    "map_50": "Metric Value",
    "experiment": "Pipeline Type",
}
fig = px.strip(
    all_results,
    x="set",
    y="map_50",
    color="experiment",
    color_discrete_sequence=COLORS,
    labels=axis_labels,
    title="Comparison of mAP@50 Scores",
)
# Enlarge the markers of every trace.
fig.update_traces(marker={"size": 18})
fig.show()

In [None]:
# Strip plot of the F1-weighted reciprocal Levenshtein distance: one marker
# per row of all_results, grouped by set on the x axis and colored by
# experiment.
axis_labels = {
    "set": "Dataset",
    "weighted 1/Levenshtein distance": "Metric Value",
    "experiment": "Pipeline Type",
}
fig = px.strip(
    all_results,
    x="set",
    y="weighted 1/Levenshtein distance",
    color="experiment",
    color_discrete_sequence=COLORS,
    labels=axis_labels,
    title="Comparison of weighted 1/Levenshtein distance Scores",
)
# Enlarge the markers of every trace.
fig.update_traces(marker={"size": 18})
fig.show()