# Annotate paintings
This notebook uses an LLM to extract the described objects that appear in a painting and then grounds them in the painting's image with Grounding DINO.

### 0. Import libraries and set the configuration

In [None]:
import copy
from tqdm import tqdm

from call_llm import *
from ground_objects import *
from compute_metrics import *
from annotate_paintings_utils import *

In [None]:
# Model id handed to get_grounding_model() when the grounding model is loaded.
GROUNDING_MODEL_ID = "IDEA-Research/grounding-dino-base"
# Model name handed to get_sentence_similarity_model().
SENTENCE_SIMILARITY_MODEL_NAME = "all-mpnet-base-v2"
# Model name handed to generate() for the object-extraction calls.
GEMINI_MODEL = "gemini-2.0-flash"

### 1. Import models and data

In [None]:
# Pick the device that is passed to the grounding model loader, the object
# detection step, and the bounding-box / mean-average-precision helpers.
device = get_device()

# Load the three models used by this notebook: the LLM client, the grounding
# processor/model pair, and the sentence-similarity model.
llm_client = get_llm_client()
grounding_processor, grounding_model = get_grounding_model(GROUNDING_MODEL_ID, device)
sentence_similarity_model = get_sentence_similarity_model(SENTENCE_SIMILARITY_MODEL_NAME)

# Load the painting data. Of the four returned values, the cells visible in
# this notebook only use few_shot_examples and test_paintings.
paintings_data, annotations, few_shot_examples, test_paintings = load_data()

# Load the ground-truth bounding boxes together with labels_to_ids
# (presumably a label -> id mapping; confirm in get_bbox_annotations).
# A painting that is missing from these annotations has no annotations.
ground_truth_bboxes, labels_to_ids = get_bbox_annotations()

### 2. Experiment with annotation prompts

In [None]:
# Free-text note that store_results() receives alongside the metrics.
observations = "first trial with few-shot learning"
# Prompt variant forwarded to generate() and to store_results().
prompt_type = "basic"
# Forwarded to the helper functions; also enables the per-painting label printout.
verbose = True

In [None]:
# Evaluate the current prompt: for each test painting, extract the described
# objects with the LLM, score the extracted labels, and ground them in the image.

# Three counters handed to compute_f1() and compute_micro_f1(). Judging by the
# name they are [true positives, false positives, false negatives] -- confirm
# against compute_metrics.
tp_fp_fn = [0, 0, 0]
total_token_count = 0

# Per-painting records; they are zipped together when the results are stored.
painting_ids = []
all_predicted_objects = []
all_ground_truth_objects = []

# Per-painting detection outputs consumed by compute_mean_average_precision().
predictions = []
targets = []

# NOTE: the [:1] slice restricts the run to the first test painting; widen it
# to evaluate more of the test set.
for painting in tqdm(test_paintings[:1]):
    painting_id = painting["painting_id"]
    painting_ids.append(painting_id)
    # Work on copies so the entries of test_paintings are never modified.
    ground_truth_objects = copy.deepcopy(painting["object_name"])
    description = copy.deepcopy(painting["description"])
    image = load_image(painting_id)

    # extract described objects
    predicted_objects, token_count = generate(
        llm_client,
        few_shot_examples,
        image,
        description,
        prompt_type,
        GEMINI_MODEL,
        verbose,
    )
    total_token_count += token_count

    # compute metrics
    predicted_objects, ground_truth_objects = clean_labels(predicted_objects, ground_truth_objects)
    all_predicted_objects.append(predicted_objects)
    all_ground_truth_objects.append(ground_truth_objects)

    if verbose:
        print(predicted_objects, ground_truth_objects)

    # The return value is not used, so compute_f1() presumably accumulates into
    # tp_fp_fn in place (confirm). It gets copies of the label lists, which are
    # still needed below and in the stored results.
    compute_f1(copy.deepcopy(predicted_objects), copy.deepcopy(ground_truth_objects), tp_fp_fn)

    # ground objects
    labels_scores_boxes, results = detect_objects(
        image,
        predicted_objects,
        grounding_processor,
        grounding_model,
        device,
        verbose,
        object_threshold=0.3,
        text_threshold=0.3,
    )

    prediction, target = get_bounding_boxes(
        labels_scores_boxes, labels_to_ids, ground_truth_bboxes, painting_id, device
    )
    predictions.append(prediction)
    targets.append(target)

micro_f1 = compute_micro_f1(tp_fp_fn, verbose)
# Store the result under its own name: assigning it to
# `compute_mean_average_precision` would overwrite the imported function, so
# re-running this cell would try to call the previous result instead.
mean_average_precision = compute_mean_average_precision(predictions, targets, device, verbose)
print(f"Total token count: {total_token_count}")


In [None]:
# Persist the outcome of the tested prompt: one (painting id, predicted labels,
# ground-truth labels) tuple per evaluated painting, plus the micro F1 score.
results_values = [
    (painting_id, predicted, expected)
    for painting_id, predicted, expected in zip(
        painting_ids, all_predicted_objects, all_ground_truth_objects
    )
]
store_results(micro_f1, results_values, prompt_type, observations)

In [None]:
# Toy example for the span comparison: an extracted span and a slightly
# different ground-truth span, compared with the sentence-similarity model.
extracted_span = "The cat sat on the mat."
ground_truth_span = "The c sat a on the mat x."
span_extraction_metrics = compare_spans(
    ground_truth_span,
    extracted_span,
    sentence_similarity_model,
    verbose,
)

TODO: compute the F1 score for spans