# Analyze annotated dataset

### 0. Import libraries and data

In [None]:
import sys
import copy
import json
import random
import polars as pl
from tqdm import tqdm
import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Extend the import search path with the sibling annotation directory; the star
# imports at the end of this cell presumably resolve to modules located there
# (verify the relative path if they fail to import).
sys.path.append("../annotate_dataset/")

# Data locations, all relative to the notebook's working directory.
ANNOTATIONS_PATH = "../../data/annotations/"
INTERMEDIATE_DATA_PATH = "../../data/intermediate/filtered_paintings/"
PROCESSED_DATA_PATH = "../../data/processed/"

# Seed the module-level RNG that the later random.choice / random.shuffle calls use.
random.seed(42)
# Colour palette shared by the plots and the word cloud.
COLORS = ["#cd968e", "#acb0e0", "#aecbdc", "#bcd5c3", "#bfbfbf"]

# NOTE(review): star imports hide where names come from; load_image and
# display_annotated_image, used in the last cell, presumably come from these modules.
from ground_objects import *
from annotate_paintings_utils import *

In [None]:
# Load the raw annotator output. JSON files are UTF-8, so the encoding is
# stated explicitly instead of relying on the platform's default encoding.
with open(f"{ANNOTATIONS_PATH}unfiltered_annotations.json", encoding="utf-8") as f:
    unfiltered_annotations = json.load(f)

# Reshape every raw annotation in place into the layout compute_statistics reads:
# objects[label] = {"description": str, "bounding_boxes": [[..., ...], ...]}
for annotation in unfiltered_annotations:
    objects = annotation["objects"]

    for obj, desc in objects.items():
        # Only the last element of the raw description value is kept.
        # Re-assigning an existing key while iterating is safe (the dict size is unchanged).
        objects[obj] = {"description": desc[-1], "bounding_boxes": []}

    # Each raw bounding-box entry starts with the object label it belongs to,
    # followed by the two values stored as the box itself.
    for bbox in annotation["bounding_boxes"]:
        objects[bbox[0]]["bounding_boxes"].append([bbox[1], bbox[2]])

    # Drop the fields that the statistics below do not use.
    for obsolete_key in ("bounding_boxes", "extraction_judgement", "description_judgement"):
        del annotation[obsolete_key]

with open(f"{PROCESSED_DATA_PATH}paintings_with_all_objects.json", encoding="utf-8") as f:
    filtered_annotation_all_objects = json.load(f)

with open(f"{PROCESSED_DATA_PATH}paintings_with_filtered_objects.json", encoding="utf-8") as f:
    filtered_annotation_filtered_objects = json.load(f)

### 1. Statistics of the initial dataset

The initial dataset has 12078 paintings. \
More info can be found in the analyze_filtered_dataset.ipynb notebook.

In [None]:
def compute_statistics(annotations) -> None:
    """Print, tabulate and plot summary statistics of annotated paintings.

    Args:
        annotations: sequence of painting dicts. Each one has an "objects"
            mapping from object label to a dict holding a "description"
            string and a "bounding_boxes" list.

    Output (nothing is returned):
        * printed totals of paintings, objects, described objects, objects
          with bounding boxes, and bounding boxes;
        * a displayed table with descriptive statistics of every non-empty
          measurement (bounding boxes per object / per painting, objects per
          painting, words per non-empty description);
        * a box plot of the number of words per non-empty description.

    Empty input, or input without any object, no longer raises: percentages
    fall back to 0.0 and measurements without data are left out.
    """
    obj_no = 0
    obj_with_bbox = 0
    obj_with_desc = 0
    obj_with_bbox_desc = 0

    bbox_no_per_object = []
    obj_no_per_painting = []
    bbox_no_per_painting = []
    non_empty_description_words_no = []

    for annotation in annotations:
        objects = annotation["objects"]
        obj_no_per_painting.append(len(objects))

        # Paintings without objects count towards obj_no_per_painting only;
        # they are deliberately left out of bbox_no_per_painting.
        if not objects:
            continue

        painting_bbox_no = 0

        for desc in objects.values():
            obj_no += 1
            description = desc["description"]
            bounding_boxes = desc["bounding_boxes"]

            if description:
                obj_with_desc += 1
                # Words are delimited by single spaces, as in the stored descriptions.
                non_empty_description_words_no.append(len(description.split(" ")))

            if bounding_boxes:
                obj_with_bbox += 1
                bbox_no_per_object.append(len(bounding_boxes))
                painting_bbox_no += len(bounding_boxes)

            if description and bounding_boxes:
                obj_with_bbox_desc += 1

        bbox_no_per_painting.append(painting_bbox_no)

    def percentage_of_objects(count):
        # Guard against a dataset without any object (would divide by zero).
        return count / obj_no * 100 if obj_no else 0.0

    print(f"Number of paintings: {len(annotations)}")
    print(f"Number of objects: {obj_no}")
    print(f"Number of objects with descriptions: {obj_with_desc} ({percentage_of_objects(obj_with_desc)}%)")
    print(f"Number of objects with bounding boxes: {obj_with_bbox} ({percentage_of_objects(obj_with_bbox)}%)")
    print(f"Number of objects with bounding boxes and descriptions: {obj_with_bbox_desc} ({percentage_of_objects(obj_with_bbox_desc)}%)")
    print(f"Number of bounding boxes: {sum(bbox_no_per_object)}")

    measurements = [
        ["bbox_no_per_object", bbox_no_per_object],
        ["obj_no_per_painting", obj_no_per_painting],
        ["bbox_no_per_painting", bbox_no_per_painting],
        ["non_empty_description_words_no", non_empty_description_words_no],
    ]
    all_statistics = []

    for measurement_name, measurement in measurements:
        # An empty list produces a frame without columns, which describe() rejects.
        if not measurement:
            continue

        # describe() lists the statistics as rows; transposing turns them into
        # columns, with the statistic names ending up in the first row.
        statistics = pl.DataFrame(measurement).describe().transpose()
        statistics = statistics.rename(dict(zip(statistics.columns, statistics.row(0))))
        # Drop that header row, make the values numeric and tag the measurement.
        all_statistics.append(
            statistics.slice(1, statistics.height - 1)
            .with_columns(pl.all().cast(pl.Float32))
            .with_columns(pl.Series([measurement_name]).alias("name"))
        )

    if all_statistics:
        display(pl.concat(all_statistics).drop("count", "null_count"))

    if non_empty_description_words_no:
        fig = px.box(
            x=non_empty_description_words_no,
            color_discrete_sequence=COLORS[1:2],
            title="Number of words per non-empty description",
        )

        fig.show()

### 2. Statistics after running the annotation pipeline

- The final number of paintings for which the annotator did not work correctly: 405 (3.35%)
- The final number of paintings for which the judge did not work correctly: 68 (0.56%)
- The final number of paintings without objects: 484 (4.00%)
- The final number of annotated paintings: 11121 (92.07%)

In [None]:
compute_statistics(unfiltered_annotations)

### 2.1. Statistics after filtering based on judge scores, after refining bounding boxes

In [None]:
compute_statistics(filtered_annotation_all_objects)

### 3. Statistics of the final dataset where each object has a description and at least one bounding box

In [None]:
compute_statistics(filtered_annotation_filtered_objects)

The paintings that were kept have the same distributions of style, genre, source, etc. \
Even the distributions of the number of words per description from each source remained the same. \
Last but not least, the paintings with the longest descriptions were kept.

#### 3.1. Analyze the distribution of final object labels

In [None]:
# One list of object labels per painting, in dataset order.
object_labels = [
    list(annotation["objects"].keys())
    for annotation in filtered_annotation_filtered_objects
]

# Work on a deep copy so the loaded annotations keep their full object dicts.
paintings_objects = copy.deepcopy(filtered_annotation_filtered_objects)

# Replace each painting's object mapping by the plain list of its labels.
for painting, labels in zip(paintings_objects, object_labels):
    painting["objects"] = labels

# One row per (painting, object label) pair.
paintings_objects = pl.from_dicts(paintings_objects).explode("objects")
paintings_objects

In [None]:
# Most frequent object names: count the rows per label, highest count first.
unique_labels = (
    paintings_objects
    .group_by("objects")
    .len()
    .sort("len", descending=True)
)
print(f"The number of unique labels: {unique_labels.shape[0]}")

# Raise the table row limit so that all of the top 100 labels are rendered.
with pl.Config(tbl_rows=100):
    display(unique_labels[:100])

In [None]:
# number of object names that appear at least n times (in n paintings)
for threshold in range(10, 101, 10):
    # ">=" so that labels with exactly `threshold` appearances are included,
    # as the "at least" / "minimum" wording promises.
    labels_above_threshold = unique_labels.filter(pl.col("len") >= threshold).shape[0]
    # The count is computed outside the f-string: reusing double quotes inside
    # an f-string expression is a syntax error before Python 3.12.
    print(f"Object names with minimum {threshold} appearances: {labels_above_threshold}")

In [None]:
# Histogram of how many labels share each appearance count.
appearances_per_label = unique_labels["len"]

fig = px.histogram(
    x=appearances_per_label,
    color_discrete_sequence=COLORS[1:2],
    title="Distribution of labels by their number of appearances",
)
fig.update_layout(xaxis_title="number of appearances")
fig.show()

In [None]:
def analyze_object_distribution(paintings_objects, object_name):
    """Display how one object label is spread over several painting attributes.

    Args:
        paintings_objects: polars frame with one row per (painting, object
            label) pair; the label is read from its "objects" column.
        object_name: the label whose rows are analysed.

    For each of the artist, coarse type, first/second style and source
    columns, a table of value counts is displayed, most frequent value first.
    """
    rows_with_object = paintings_objects.filter(pl.col("objects") == object_name)

    for column in ("artist", "coarse_type", "first_style", "second_style", "source"):
        counts = rows_with_object[column].value_counts()
        display(counts.sort("count", descending=True))

In [None]:
# Observation from the tables displayed below: an object name seems to occur in
# paintings of all types; there is no tendency for an object name to appear in
# only one kind of painting.
analyze_object_distribution(paintings_objects, "diana")

In [None]:
def random_color_func(word, font_size, position, orientation, random_state, **kwargs):
    """Return a randomly chosen colour from COLORS.

    Passed to WordCloud as its ``color_func``. Every argument is ignored; the
    pick is made with the module-level ``random`` generator, not with the
    ``random_state`` argument.
    """
    return random.choice(COLORS)

# Join the words of multi-word labels with underscores so that each label
# remains a single token in the space-separated text.
text = " ".join(obj.replace(" ", "_") for obj in paintings_objects["objects"].to_list())

wordcloud_options = dict(
    width=1980,
    height=1080,
    background_color="white",
    min_font_size=20,
    max_words=158,
    collocations=False,
    color_func=random_color_func,
)
wordcloud = WordCloud(**wordcloud_options).generate(text)

# Render the generated cloud without axes.
plt.figure(figsize=(19, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

### 4. Visual analysis of the annotated paintings

In [None]:
def print_description(description, words_per_line=15):
    """Print a description wrapped to a fixed number of words per line.

    Newlines in ``description`` are replaced by spaces and the text is split
    on single spaces, so consecutive spaces produce empty "words" that still
    count towards ``words_per_line``.

    Args:
        description: the text to print.
        words_per_line: how many space-separated words go on each line.
    """
    words = description.replace("\n", " ").split(" ")
    line_starts = range(0, len(words), words_per_line)
    lines = (" ".join(words[start:start + words_per_line]) for start in line_starts)

    for line in lines:
        print(line)

In [None]:
# Pick 100 random paintings for visual inspection.
# NOTE: the shuffle is in place, so filtered_annotation_filtered_objects stays
# reordered for every cell executed after this one.
random.shuffle(filtered_annotation_filtered_objects)

for annotation in filtered_annotation_filtered_objects[:100]:
    bboxes = []
    print(annotation["id"])

    for obj, obj_data in annotation["objects"].items():
        # Bound to a local first: reusing double quotes inside an f-string
        # expression is a syntax error before Python 3.12.
        description = obj_data["description"]
        print_description(f"{obj} -> {description}")
        # One [label, first value, second value] entry per bounding box.
        bboxes.extend([obj, bbox[0], bbox[1]] for bbox in obj_data["bounding_boxes"])

    _, image = load_image(annotation["id"])
    # A copy is passed on, presumably so that drawing the boxes cannot modify
    # the loaded image (confirm against display_annotated_image).
    display_annotated_image(copy.deepcopy(image), bboxes, True)
