# Annotate paintings
This notebook leverages an LLM to extract the described objects that appear in a painting and then ground them with Grounding DINO.

### 0. Import libraries and set the configuration

In [None]:
%load_ext autoreload
%autoreload 2

from tqdm import tqdm

from call_llm import *
from ground_objects import *
from compute_metrics import *
from judge_annotations import *
from annotate_paintings_utils import *

In [None]:
# Model identifiers used throughout the notebook.
# GEMINI_MODEL: passed to generate() for object extraction and to the few-shot description generation call.
GEMINI_MODEL = "gemini-2.0-flash-lite"
# OPEN_AI_MODEL: passed to judge_objects_descriptions() together with the judge client.
OPEN_AI_MODEL = "gpt-4.1-nano-2025-04-14"
# SENTENCE_SIMILARITY_MODEL_NAME: passed to get_sentence_similarity_model().
SENTENCE_SIMILARITY_MODEL_NAME = "all-mpnet-base-v2"
# GROUNDING_MODEL_ID: passed to get_grounding_model() to load the Grounding DINO processor and model.
GROUNDING_MODEL_ID = "IDEA-Research/grounding-dino-base"

### 1. Import models and data

In [None]:
# get the device that is handed to the grounding model loader and, later on,
# to the detection, bounding-box and mAP helpers
device = get_device()

# load models: the extraction LLM client, the judge LLM client, the grounding
# processor/model pair and the sentence-similarity model used for span metrics
llm_client = get_llm_client()
judge_client = get_judge_llm_client()
grounding_processor, grounding_model = get_grounding_model(GROUNDING_MODEL_ID, device)
sentence_similarity_model = get_sentence_similarity_model(SENTENCE_SIMILARITY_MODEL_NAME)

# load data (only few_shot_examples and test_paintings are used in the cells visible below)
paintings_data, annotations, few_shot_examples, test_paintings = load_data()

# ground-truth bounding boxes and the label -> id mapping;
# if an image is not included, it doesn't have annotations
ground_truth_bboxes, labels_to_ids = get_bbox_annotations()

### 2. Experiment with annotation prompts

In [None]:
# Settings for the current experiment run.
# verbose: forwarded to the extraction, metric and grounding helpers below.
verbose = True
# prompt_type: forwarded to generate() and recorded by store_results().
prompt_type = "basic_complete"
# observations: free-text note recorded by store_results() alongside the metrics.
observations = "first trial with few-shot learning"

In [None]:
# Evaluate the selected prompt on a subset of the test paintings: extract the
# described objects with the LLM, score objects / spans / object descriptions,
# ground the predicted objects and finally aggregate the metrics over the run.

# imported explicitly so the aggregation below does not rely on `np` leaking
# in through one of the star imports
import numpy as np

# object metrics
micro_f1_objects = None
tp_fp_fn_objects = [0, 0, 0]
all_predicted_objects = []
all_ground_truth_objects = []

# spans metrics
micro_f1_spans = None
tp_fp_fn_spans = [0, 0, 0]
span_similarity_metrics = {
    "cosine similarity": [],
    "Levenshtein distance": [],
    "delete percentage": [],
    "false positive percentage": [],
    "coverage percentage": [],
}
all_predicted_spans = []
all_ground_truth_spans = []

# object descriptions metrics
object_description_metrics = {
    "factual_accuracy": [],
    "coherence": [],
    "grounding_potential": [],
    "completeness": [],
}
all_predicted_object_descriptions = []

# grounding metrics
map_50 = None
map_50_95 = None
all_predicted_bboxes = []
all_ground_truth_bboxes = []

# other tracked info
total_token_count = 0
painting_ids = []
unprocessed_painting_ids = []

# Which optional fields the LLM output carries. Initialised here so the
# aggregation after the loop does not raise NameError when no painting could
# be processed (empty subset or every LLM call failing).
spans_are_extracted = False
description_is_extracted = False

for painting in tqdm(test_paintings[:5]):
    painting_id = painting["painting_id"]
    image = load_image(painting_id)

    # extract described objects
    llm_output, token_count = generate(
        llm_client,
        few_shot_examples,
        image,
        painting["description"],
        prompt_type,
        GEMINI_MODEL,
        verbose,
    )
    total_token_count += token_count

    # A missing or empty output cannot be scored (llm_output[0] is inspected
    # below), so the painting is only recorded as unprocessed.
    if not llm_output:
        unprocessed_painting_ids.append(painting_id)
        continue

    # Record the id only for processed paintings so that painting_ids stays
    # index-aligned with the per-painting result lists filled in below.
    painting_ids.append(painting_id)

    first_object_fields = vars(llm_output[0])
    spans_are_extracted = "description_spans" in first_object_fields
    description_is_extracted = "object_description" in first_object_fields

    # sorted() returns a new list; the result has to be kept for the ordering
    # by object name to take effect
    llm_output = sorted(llm_output, key=lambda extracted: extracted.object_name)

    # handle objects
    predicted_objects, ground_truth_objects = process_objects(
        llm_output, painting, all_predicted_objects, all_ground_truth_objects, verbose
    )
    compute_f1(predicted_objects, ground_truth_objects, tp_fp_fn_objects)

    # handle spans
    if spans_are_extracted:
        (
            predicted_spans_per_object,
            ground_truth_spans_per_object,
            predicted_spans,
            ground_truth_spans,
        ) = process_spans(llm_output, painting)
        compute_spans_quality(
            ground_truth_spans_per_object,
            predicted_spans_per_object,
            span_similarity_metrics,
            sentence_similarity_model,
            verbose,
        )
        compute_f1(predicted_spans, ground_truth_spans, tp_fp_fn_spans)
        all_predicted_spans.append(predicted_spans_per_object)
        all_ground_truth_spans.append(ground_truth_spans_per_object)

    # handle object description
    if description_is_extracted:
        get_object_descriptions(llm_output, all_predicted_object_descriptions)
        judge_objects_descriptions(judge_client, OPEN_AI_MODEL, llm_output, object_description_metrics)

    # ground objects
    labels_scores_boxes, results = detect_objects(
        image,
        predicted_objects,
        grounding_processor,
        grounding_model,
        device,
        verbose,
        object_threshold=0.3,
        text_threshold=0.3,
    )

    get_bounding_boxes(
        labels_scores_boxes,
        labels_to_ids,
        ground_truth_bboxes,
        painting_id,
        all_predicted_bboxes,
        all_ground_truth_bboxes,
        device,
    )

# compute metrics across the entire dataset
micro_f1_objects = compute_micro_f1(tp_fp_fn_objects, "objects", verbose)

# the per-sample score lists are replaced by their mean over the run
if spans_are_extracted:
    micro_f1_spans = compute_micro_f1(tp_fp_fn_spans, "spans", verbose)
    for metric in span_similarity_metrics:
        span_similarity_metrics[metric] = np.array(span_similarity_metrics[metric]).mean()

if description_is_extracted:
    for metric in object_description_metrics:
        object_description_metrics[metric] = np.array(object_description_metrics[metric]).mean()

map_50, map_50_95 = compute_mean_average_precision(
    all_predicted_bboxes, all_ground_truth_bboxes, device, verbose
)

In [None]:
# store results for the tested prompt
# For every painting with extracted spans, build {object name: [spans, description]}
# and hand the per-painting results plus the run-level metrics to store_results().
predictions = []

for index, painting_data in enumerate(all_predicted_spans):
    # Descriptions are only collected when the prompt produces them; fall back
    # to an empty list instead of raising IndexError when they are missing.
    # NOTE(review): assumes each entry is a per-painting sequence ordered like
    # the objects in painting_data - confirm against get_object_descriptions.
    if index < len(all_predicted_object_descriptions):
        object_descriptions = all_predicted_object_descriptions[index]
    else:
        object_descriptions = []

    object_data = {}

    # `object_name` rather than `object`: at notebook top level the loop
    # variable is a global and would shadow the builtin for every later cell
    for object_index, (object_name, spans) in enumerate(painting_data.items()):
        if object_index < len(object_descriptions):
            description = object_descriptions[object_index]
        else:
            description = None
        object_data[object_name] = [spans, description]

    predictions.append(object_data)

# zip() stops at the shortest input, so the three lists must be index-aligned
results_values = list(zip(painting_ids, predictions, all_ground_truth_spans))

metrics = {
    "total_token_count": total_token_count,
    "unprocessed_painting_ids": unprocessed_painting_ids,
    "micro_f1_objects": micro_f1_objects,
    "micro_f1_spans": micro_f1_spans,
    "span_similarity_metrics": span_similarity_metrics,
    "object_description_metrics": object_description_metrics,
    "map_50": map_50,
    "map_50_95": map_50_95,
}

store_results(prompt_type, observations, results_values, metrics)

- TODO: handle empty detections for a painting
- TODO: how to treat the case when llm_output is None?
- TODO: get and store the number of tokens processed by the judge
- TODO: change back the LLMs

### 3. Create object-descriptions for few-shot examples

In [None]:
# Response schema for the description-generation call below: it is passed as
# `response_schema` and the result is read back via `response.parsed.description`.
# Documented with comments rather than a docstring so the schema derived from
# the class is left untouched.
class Description(BaseModel):
    # the generated description paragraph for one object
    description: str

In [None]:
# System instruction for the description-generation call below: the model gets
# an object name plus its description spans and must merge them into a single
# paragraph without adding information.
# NOTE(review): the prompt text contains the wording slips "object name of the
# object" and "have to included"; left unchanged here because editing the prompt
# changes what is sent to the model.
system_prompt_text = """You are an art expert providing detailed descriptions of objects depicted in paintings. You are given an object name and a set of descriptive text spans from the painting's museum label. Your task is to create a single, coherent description paragraph that starts with the object name of the object based solely on the provided information. You have to included all the provided details from the description spans.

**Constraints:**
Do not add any details about the object that are not explicitly mentioned in the provided description spans.
Do not infer the object's material, purpose, or origin unless it is directly stated in the text.
Focus on combining and rephrasing the given information, not on creating new information.
Do not assume anything about the object's cultural significance or symbolism unless the provided spans mention it."""

In [None]:
# Generate one description paragraph per few-shot object from its description
# spans and print it next to the object name.

# imported explicitly so the retry delay does not rely on `time` leaking in
# through one of the star imports
import time

# Upper bound on attempts per object: a persistent failure (bad credentials,
# exhausted quota, ...) must not keep the cell retrying forever.
MAX_GENERATION_ATTEMPTS = 5
RETRY_DELAY_SECONDS = 5

_, _, few_shot_examples, _ = load_data()

# the request configuration is identical for every object, so build it once
generate_content_config = types.GenerateContentConfig(
    temperature=0.0,
    response_mime_type="application/json",
    system_instruction=[
        types.Part.from_text(text=system_prompt_text),
    ],
    response_schema=Description,
)

for few_shot_example in few_shot_examples:
    print(few_shot_example["painting_id"])

    for index, object_name in enumerate(few_shot_example["object_name"]):
        spans = few_shot_example["description_spans"][index]

        # nothing to describe: print the bare object name and move on
        if not any(spans):
            print(object_name)
            print()
            continue

        description_spans = "- " + "\n- ".join(spans)
        prompt_parts_text = f"""**Object Name:**\n{object_name}\n\n**Description Spans:**\n{description_spans}\n\n**Generated Description:**"""

        response = None

        for attempt in range(1, MAX_GENERATION_ATTEMPTS + 1):
            try:
                response = llm_client.models.generate_content(
                    model=GEMINI_MODEL,
                    contents=prompt_parts_text,
                    config=generate_content_config,
                )
                break
            except Exception as error:
                # `Exception` instead of a bare except, so that interrupting
                # the kernel (KeyboardInterrupt) still stops the cell
                print(f"Try again... (attempt {attempt}/{MAX_GENERATION_ATTEMPTS} failed: {error})")
                if attempt < MAX_GENERATION_ATTEMPTS:
                    time.sleep(RETRY_DELAY_SECONDS)

        print(object_name)

        if response is None:
            print(f"No description generated after {MAX_GENERATION_ATTEMPTS} attempts.")
            continue

        # NOTE(review): `parsed` is presumably None when the reply cannot be
        # parsed into `Description`; fall back to the raw text in that case -
        # confirm against the SDK documentation.
        parsed_description = response.parsed
        if parsed_description is not None:
            print(parsed_description.description)
        else:
            print(response.text)