# Judge annotations
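This notebook is used to develop and try out the LLM-as-a-judge prompts that evaluate (1) generated object descriptions and (2) the objects and description spans extracted from paintings.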

### 0. Import libraries

In [None]:
%load_ext autoreload
%autoreload 2

import os
import json
import base64
from pprint import pprint

from openai import OpenAI
from pydantic import BaseModel

from call_llm import *
from annotate_paintings_utils import *

### 1. LLM-as-a-judge for the object descriptions

In [None]:
def get_judge_llm_client():
    """Create the OpenAI client used by the judge.

    Reads ``openai_api_key`` from ``../../config/keys.json`` (relative to the
    current working directory), exports it as the ``OPENAI_API_KEY``
    environment variable, and returns an ``OpenAI`` client built with that key.
    """
    keys_path = "../../config/keys.json"

    with open(keys_path, "r") as keys_file:
        api_keys = json.load(keys_file)

    # The key is also exported to the process environment, as the original did.
    os.environ["OPENAI_API_KEY"] = api_keys["openai_api_key"]
    api_key = os.environ.get("OPENAI_API_KEY")

    return OpenAI(api_key=api_key)

In [None]:
# Pydantic model pairing a single judge score with its justification.
# Used as the list element type of DescriptionScoreEvaluation.
class ScoreExplanation(BaseModel):
    score: int  # integer score; the judge prompt in this notebook asks for a 1-5 scale
    explanation: str  # free-text justification for the score

In [None]:
# Judge output for generated descriptions where every score comes with an explanation.
# The four fields mirror the four criteria named in the description judge's system prompt.
# NOTE(review): each list presumably holds one entry per evaluated triplet — confirm.
# NOTE(review): judge_objects_descriptions in this notebook passes DescriptionScore, not this
# model, as response_format.
class DescriptionScoreEvaluation(BaseModel):
    factual_accuracy: list[ScoreExplanation]
    coherence: list[ScoreExplanation]
    grounding_potential: list[ScoreExplanation]
    completeness: list[ScoreExplanation]

In [None]:
# Scores-only judge output for generated descriptions; passed as response_format by
# judge_objects_descriptions. The four fields mirror the criteria in that judge's system prompt.
# NOTE(review): each list presumably holds one integer score per evaluated triplet — confirm.
class DescriptionScore(BaseModel):
    factual_accuracy: list[int]
    coherence: list[int]
    grounding_potential: list[int]
    completeness: list[int]

In [None]:
def judge_objects_descriptions(client):
    """Ask the judge model to score generated object descriptions.

    Sends a fixed system prompt (evaluation criteria) and a fixed, hard-coded
    user prompt (two example objects: "hawk" and "spheres") to ``gpt-4.1`` via
    ``client.beta.chat.completions.parse`` with ``seed=0`` and
    ``temperature=0``, requesting output shaped like ``DescriptionScore``.

    Args:
        client: object exposing ``beta.chat.completions.parse`` (the notebook
            passes the client returned by ``get_judge_llm_client``).

    Returns:
        The raw response object returned by ``parse``; callers read
        ``response.choices[0].message``.
    """
    # NOTE(review): the system prompt asks for a score AND a brief justification per criterion,
    # but response_format below is DescriptionScore, which only has integer lists. Confirm whether
    # DescriptionScoreEvaluation (scores with explanations) was intended.
    system_prompt = """You are an expert evaluator assessing the quality of object descriptions from paintings generated by a language model. You will be given a list with the following triplets:

    1. **Object Name:** The name of the object.
    2. **Original Description Spans:** The text spans from which the object description was generated.
    3. **Generated Description:** The description created by the language model.

    Your task is to evaluate each triplet by checking the generated description based on the following criteria, providing a score (1-5) and a brief justification for each:

    **Evaluation Criteria:**

    - **Factual Accuracy (1-5):**  Does the generated description accurately reflect the information provided in the original description spans? Does it avoid hallucination or the addition of information not present in the spans? (1 = Completely inaccurate, 5 = Perfectly accurate)
    - **Coherence (1-5):** Is the generated description well-written and easy to understand? Does it flow logically and make sense as a complete description? (1 = Incoherent and confusing, 5 = Perfectly coherent and clear)
    - **Grounding Potential (1-5):** How suitable is the generated description for use with a visual grounding model? Does it focus on visual attributes and provide specific details that would help a grounding model locate the object in an image? (1 = Very poor for grounding, 5 = Excellent for grounding)
    - **Completeness (1-5):** Does the description include all the information that is provided in the spans? (1 = Very poor completeness, 5 = Perfect completeness)"""

    # Hard-coded example input: two (object name, spans, generated description) triplets.
    user_prompt = """Object Name: hawk

    Original Description Spans:
    - hawk contemplating itself in a mirror
    - the hawk represents the material world
    - just as the bird "watches itself in a glass, waiting for the image to move so as to know which is really alive, itself or the image"
    - painted the bird from an Egyptian carving. Thus the "real" hawk is immobile and the flying hawk in the mirror is an illusion. The carving from which the hawk was painted now adorns the artist's grave.

    Generated Description: A hawk is depicted contemplating itself in a mirror. The hawk represents the material world, and it is described as watching itself in a glass, waiting for the image to move in order to know which is really alive, itself or the image. The artist painted the bird from an Egyptian carving, which makes the "real" hawk immobile while the flying hawk in the mirror is an illusion. The carving from which the hawk was painted now adorns the artist's grave.

    Object Name: spheres

    Original Description Spans:
    - spheres, the soul

    Generated Description:
    - Spheres represent the soul."""


    # seed=0 and temperature=0 are passed to make the judge's output as repeatable as the API allows.
    response = client.beta.chat.completions.parse(
        model="gpt-4.1",
        seed=0,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=DescriptionScore,
    )

    return response

In [None]:
# Build the OpenAI client and run the description judge on its hard-coded example prompt.
judge_client = get_judge_llm_client()
response = judge_objects_descriptions(judge_client)

In [None]:
# Show the judge's parsed output as a dict, or its refusal text when nothing was parsed.
message = response.choices[0].message

if not message.parsed:
    print(message.refusal)
else:
    extraction_evaluation = dict(message.parsed)
    pprint(extraction_evaluation)

### 2. LLM-as-a-judge for extracted objects and their description spans

In [None]:
# One object flagged by the extraction judge, with the reason it was flagged.
# Used as the element type of the object lists in DescriptionExtractionEvaluation.
class DescriptionObjectIssues(BaseModel):
    object_name: str  # name of the flagged object
    explanation: str  # free-text reason for flagging it

In [None]:
# One flagged description span together with the reason it was flagged.
# Used as the element type of DescriptionSpanIssues.spans_with_issue.
class IssueDescription(BaseModel):
    span: str  # the span text
    explanation: str  # free-text reason for flagging it

In [None]:
# All flagged spans of a single object, each with its own explanation.
class DescriptionSpanIssues(BaseModel):
    object_name: str  # object the spans are associated with
    spans_with_issue: list[IssueDescription]  # flagged spans plus explanations

In [None]:
# Extraction-judge output variant in which every finding carries an explanation.
# It has the same four categories as ExtractionEvaluation (false positive / false negative
# objects and spans).
# Fix: the last two fields used to re-declare false_positive_objects / false_negative_objects,
# which silently overwrote the object fields and left the model without any span fields.
class DescriptionExtractionEvaluation(BaseModel):
    false_positive_objects: list[DescriptionObjectIssues]  # extracted objects that should not have been
    false_negative_objects: list[DescriptionObjectIssues]  # objects that were missed
    false_positive_spans: list[DescriptionSpanIssues]  # extracted spans that should not have been
    false_negative_spans: list[DescriptionSpanIssues]  # spans that were missed

In [None]:
# Flagged spans of a single object, without explanations.
# Used as the element type of the span lists in ExtractionEvaluation.
class SpanIssues(BaseModel):
    object_name: str  # object the spans are associated with
    spans_with_issue: list[str]  # the flagged span texts

In [None]:
# Extraction-judge output without explanations; passed as response_format by
# judge_objects_extractions. The cells further down read these four fields by name.
class ExtractionEvaluation(BaseModel):
    false_positive_objects: list[str]  # names of objects the judge considers wrongly extracted
    false_negative_objects: list[str]  # names of objects the judge considers missed
    false_positive_spans: list[SpanIssues]  # per-object spans the judge considers wrongly extracted
    false_negative_spans: list[SpanIssues]  # per-object spans the judge considers missed

In [None]:
def image_to_url(image_bytes, mime_type="image/png"):
    """Encode raw image bytes as a base64 ``data:`` URL.

    Args:
        image_bytes: bytes-like object holding the encoded image file.
        mime_type: MIME type written into the URL. Defaults to ``"image/png"``
            (the value that was previously hard-coded); pass e.g.
            ``"image/jpeg"`` when the bytes are not PNG.

    Returns:
        A string of the form ``data:<mime_type>;base64,<payload>``.
    """
    image_base64 = base64.b64encode(image_bytes).decode("utf-8")

    return f"data:{mime_type};base64,{image_base64}"

In [None]:
def judge_objects_extractions(client):
    """Ask the judge model to evaluate extracted objects and their description spans.

    Sends a fixed system prompt (task definition) plus a user message made of
    one image and a fixed text prompt to ``gpt-4.1`` via
    ``client.beta.chat.completions.parse`` with ``seed=0`` and
    ``temperature=0``, requesting output shaped like ``ExtractionEvaluation``.
    The image is obtained with ``image_to_url(image_to_bytes(load_image(723)))``;
    ``image_to_bytes`` and ``load_image`` are not defined in this notebook.
    NOTE(review): presumably they come from the star imports and 723 is the id
    of the painting described in the prompt — confirm.

    Args:
        client: object exposing ``beta.chat.completions.parse`` (the notebook
            passes the client returned by ``get_judge_llm_client``).

    Returns:
        The raw response object returned by ``parse``; callers read
        ``response.choices[0].message``.
    """
    system_prompt = """You are an expert art analyst tasked with evaluating the accuracy of object extraction from paintings and their corresponding textual description spans. You will be given the following:
    
    **Input Format**
    1. A painting image
    2. The original textual description of the painting
    3. The AI system's output listing:
        - Objects detected in both the painting and description
        - Corresponding description spans for each object (if available, as an object can be only present in the description without being described)

    **Task**
    Your task is to evaluate each object extracted by the first LLM and check if it is mentioned in the description and appears in the painting. If it is not present in both, add it to the list of false positive. If the object appears in painting and description, but it was not extracted by the first LLM, add it to the list of false negatives. 
    After that, analyze for each object extracted by the first LLM the extracted description spans. They have to be extracted 100% accurately from the initial description. If a description span is not 100% from the description or does not describe the associated object, add it to the list of false positives. If a span describes the associated object, but is not extracted, please add it to the list of false negatives."""

    # Hard-coded example: the painting description followed by an example extraction.
    # NOTE(review): the extraction contains entries that look like deliberately planted errors
    # ('test paragraph', the '\t' inside the mannequin span, 'a nice rabbit') — presumably to
    # check that the judge reports them; confirm.
    user_prompt = """Painting description:
    The Venetian playwright Carlo Goldoni praised Longhi\u2019s dedication to truth, by which he meant the artist\u2019s depiction of the foibles of contemporary society. Here a man enters a milliner\u2019s shop not to buy a hat but to elicit sex work from women whose class status has left them vulnerable to wealthier men. Holding out a coin, he negotiates with an older woman for the services of the pretty milliner who pauses her work to read his letter. The reclining mannequin\u2019s head on the table foreshadows the result of their transaction. Longhi\u2019s canvas positions sex work as generational: in the foreground a young girl appraises her doll, paralleling the older woman\u2019s actions.\nPietro Longhi was renowned in Venice for his small paintings representing scenes of everyday life. These were often salacious and depicted love stories, even though no particular narrative was presented and it is unlikely that these works would have had any moral commentary. The paintings often appeared in series and, again, these habitually did not follow a specific theme. Such paintings were avidly collected by patrician families, and the theatrical comic writer Carlo Goldoni praised Longhi\u2019s work.This scene is set in a poor room where seamstresses are meant to be at work\u2014one, in fact, is fast asleep on the left. A pretty girl, dressed in pink and white, is intent in reading a love letter, presumably written by the older gentleman who enters the room from the right. He is offering a coin to the old woman, probably a procuress, who is acting as a business-like intermediary between man and girl. A young girl is innocently playing with her doll as this transaction takes place around her. For a discussion of the theme, see Bagemihl 1988.This painting, together with three others at The Met (14.32.2, 17.190.12, 36.16), is said to have been a part of a larger set of canvases by Longhi. 
It has been proposed (Volpi 1917) that the artist painted twenty pictures for the Gambardi family in Florence, and that the last member of the family left half of the set to the marchese Freschi in Padua and the other half to the conte Miari de\u2019 Cumani in Padua. Out of the ten Freschi canvases, two are supposed to be in the National Gallery, London (An Interior with Three Women and a Seated Man and The Exhibition of a Rhinoceros in Venice), and two others in the Pinacoteca di Brera, Milan (The Little Concert and The Tooth Puller). These two pairs, however, are different in format from each other. The Met's four paintings, instead, were sold by conte Giacomo Miari de\u2019 Cumani in 1912\u201313. He was said to have owned ten paintings by Longhi, four of which are at The Met, and six of which were in the collection of Elia Volpi in Florence and later in that of Lionello Perera in New York, before being dispersed at two sales: four were sold at Sotheby\u2019s, London, on June 24, 1964 (nos. 31\u201334) and two were sold at Sotheby\u2019s, London, on April 19, 1967 (nos. 18\u201319). Documentation in The Met's archives, however, suggests that conte Miari owned fourteen canvases by Longhi, and not ten. He commissioned copies of the paintings he sold, but these works are still untraced. Four of the Miari canvases were sold to Carlo Balboni who, together with Antonio Carrer, sold them to The Met. Of these four, two (14.32.1 and 14.32.2) were exhibited at the Museum, while two were sold to J. Pierpont Morgan (17.190.12) and Henry Walters (36.16), before eventually returning to the Museum separately. Of the six remaining paintings that were sold in 1964 and 1967, three are the Artist Sketching an Elegant Company (Norton Simon Museum, Pasadena), The Quack Doctor (Los Angeles County Museum of Art), and The Card Players (Alte Pinakothek, Munich). Three more (A Gentleman and his Wife Taking Chocolate, A Musical Party, A Girl Dancing at a Picnic) are currently untraced. 
The set probably dates around 1746, the date inscribed next to Longhi\u2019s signature on the back of The Visit (14.32.2). All four paintings have matching eighteenth-century Venetian frames, supporting their origin from the same set of canvases.This painting was engraved by Cattini, and another version of it was in the collection of Sir Brinsley Ford in London.Xavier F. Salomon 2011.

    Extracted objects together with their description spans:
        - mannequin: ['The reclining mannequin\t’s head on the table foreshadows the result of their transaction.', 'test paragraph']
        - coin: ['Holding out a coin, he negotiates with an older woman for the services of the pretty milliner who pauses her work to read his letter.', 'He is offering a coin to the old woman, probably a procuress, who is acting as a business-like intermediary between man and girl.']
        - doll: ['in the foreground a young girl appraises her doll, paralleling the older woman’s actions.', 'A young girl is innocently playing with her doll as this transaction takes place around her.']
        - rabbit : ['a nice rabbit']"""

    # The user message is multimodal: the painting as a data URL, followed by the text prompt.
    response = client.beta.chat.completions.parse(
        model="gpt-4.1",
        seed=0,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": image_to_url(image_to_bytes(load_image(723)))}},
                    {"type": "text", "text": user_prompt},
                ],
            },
        ],
        response_format=ExtractionEvaluation,
    )

    return response

In [None]:
# Build the OpenAI client and run the extraction judge on its hard-coded example prompt.
judge_client = get_judge_llm_client()
response = judge_objects_extractions(judge_client)

In [None]:
# Full text of the painting description used in the extraction-judge example; it is passed to
# get_object_extraction_fp_fn to check that judge-reported spans really occur in the text.
# A plain double-quoted literal cannot span several physical lines, so the text is written as
# adjacent literals (one per paragraph) with the line breaks spelled out as "\n".
# NOTE(review): confirm the paragraph separators in the original dataset text are plain newlines.
description = (
    "The Venetian playwright Carlo Goldoni praised Longhi\u2019s dedication to truth, by which he meant the artist\u2019s depiction of the foibles of contemporary society. Here a man enters a milliner\u2019s shop not to buy a hat but to elicit sex work from women whose class status has left them vulnerable to wealthier men. Holding out a coin, he negotiates with an older woman for the services of the pretty milliner who pauses her work to read his letter. The reclining mannequin\u2019s head on the table foreshadows the result of their transaction. Longhi\u2019s canvas positions sex work as generational: in the foreground a young girl appraises her doll, paralleling the older woman\u2019s actions.\n"
    "Pietro Longhi was renowned in Venice for his small paintings representing scenes of everyday life. These were often salacious and depicted love stories, even though no particular narrative was presented and it is unlikely that these works would have had any moral commentary. The paintings often appeared in series and, again, these habitually did not follow a specific theme. Such paintings were avidly collected by patrician families, and the theatrical comic writer Carlo Goldoni praised Longhi\u2019s work.This scene is set in a poor room where seamstresses are meant to be at work\u2014one, in fact, is fast asleep on the left. A pretty girl, dressed in pink and white, is intent in reading a love letter, presumably written by the older gentleman who enters the room from the right. He is offering a coin to the old woman, probably a procuress, who is acting as a business-like intermediary between man and girl. A young girl is innocently playing with her doll as this transaction takes place around her. For a discussion of the theme, see Bagemihl 1988.This painting, together with three others at The Met (14.32.2, 17.190.12, 36.16), is said to have been a part of a larger set of canvases by Longhi. \n"
    "It has been proposed (Volpi 1917) that the artist painted twenty pictures for the Gambardi family in Florence, and that the last member of the family left half of the set to the marchese Freschi in Padua and the other half to the conte Miari de\u2019 Cumani in Padua. Out of the ten Freschi canvases, two are supposed to be in the National Gallery, London (An Interior with Three Women and a Seated Man and The Exhibition of a Rhinoceros in Venice), and two others in the Pinacoteca di Brera, Milan (The Little Concert and The Tooth Puller). These two pairs, however, are different in format from each other. The Met's four paintings, instead, were sold by conte Giacomo Miari de\u2019 Cumani in 1912\u201313. He was said to have owned ten paintings by Longhi, four of which are at The Met, and six of which were in the collection of Elia Volpi in Florence and later in that of Lionello Perera in New York, before being dispersed at two sales: four were sold at Sotheby\u2019s, London, on June 24, 1964 (nos. 31\u201334) and two were sold at Sotheby\u2019s, London, on April 19, 1967 (nos. 18\u201319). Documentation in The Met's archives, however, suggests that conte Miari owned fourteen canvases by Longhi, and not ten. He commissioned copies of the paintings he sold, but these works are still untraced. Four of the Miari canvases were sold to Carlo Balboni who, together with Antonio Carrer, sold them to The Met. Of these four, two (14.32.1 and 14.32.2) were exhibited at the Museum, while two were sold to J. Pierpont Morgan (17.190.12) and Henry Walters (36.16), before eventually returning to the Museum separately. Of the six remaining paintings that were sold in 1964 and 1967, three are the Artist Sketching an Elegant Company (Norton Simon Museum, Pasadena), The Quack Doctor (Los Angeles County Museum of Art), and The Card Players (Alte Pinakothek, Munich). Three more (A Gentleman and his Wife Taking Chocolate, A Musical Party, A Girl Dancing at a Picnic) are currently untraced. \n"
    "The set probably dates around 1746, the date inscribed next to Longhi\u2019s signature on the back of The Visit (14.32.2). All four paintings have matching eighteenth-century Venetian frames, supporting their origin from the same set of canvases.This painting was engraved by Cattini, and another version of it was in the collection of Sir Brinsley Ford in London.Xavier F. Salomon 2011."
)

In [None]:
def get_object_extraction_fp_fn(extraction_evaluation, description, error_type):
    """Format the judge's false-positive or false-negative findings as bullet lists.

    Every entry of ``extraction_evaluation[f"false_{error_type}_spans"]`` is
    reported exactly once, in the order the judge returned it:

    * if the entry's object is itself listed in
      ``extraction_evaluation[f"false_{error_type}_objects"]`` it goes to the
      "objects with spans" list;
    * otherwise (the object is fine, only its spans are wrong) it goes to the
      "spans" list.

    The previous nested loop compared every entry with every object name, so
    entries were duplicated and misclassified as soon as more than one object
    had span issues, and nothing was reported when all span entries belonged
    to wrong objects.

    Objects listed as wrong but without any span entry are not reported.

    Args:
        extraction_evaluation: mapping with the keys
            ``false_{error_type}_objects`` (iterable of object names) and
            ``false_{error_type}_spans`` (iterable of entries exposing
            ``object_name`` and ``spans_with_issue``).
        description: full painting description; used to verify false-negative
            spans.
        error_type: ``"positive"`` or ``"negative"``.

    Returns:
        A tuple ``(spans, objects_with_spans)`` of two strings, each holding
        one ``"- ...\\n"`` line per reported entry (empty string if none).
    """
    wrong_objects = set(extraction_evaluation[f"false_{error_type}_objects"])

    span_lines = []
    object_lines = []

    for entry in extraction_evaluation[f"false_{error_type}_spans"]:
        # when the judge spots a false negative, make sure the span is in the actual description
        kept_spans = [
            span
            for span in entry.spans_with_issue
            if error_type == "positive" or (error_type == "negative" and span in description)
        ]

        if not kept_spans:
            continue

        # Joined outside the f-string so the code does not need nested same-type quotes (3.12+ only).
        quoted_spans = "', '".join(kept_spans)

        if entry.object_name in wrong_objects:
            object_lines.append(f"- object '{entry.object_name}' together with the spans ['{quoted_spans}']\n")
        else:
            span_lines.append(f"- spans ['{quoted_spans}'] for object '{entry.object_name}'\n")

    return "".join(span_lines), "".join(object_lines)

In [None]:
# Show the extraction judge's parsed output as a dict, or its refusal text when nothing was parsed.
message = response.choices[0].message

if not message.parsed:
    print(message.refusal)
else:
    extraction_evaluation = dict(message.parsed)
    pprint(extraction_evaluation)

In [None]:
# Recall thresholds: false negatives are only fed back to the extractor when recall falls below them.
OBJECTS_RECALL_THRESHOLD = 0.5
SPANS_RECALL_THRESHOLD = 0.5

# Recall is computed as TP / (TP + FN).
# NOTE(review): the constants 3 and 5 look like hand-counted true positives (objects / spans)
# for the example extraction above — confirm, and compute them once real data is used.
fn_objects_no = len(extraction_evaluation["false_negative_objects"])
objects_recall = 3 / (fn_objects_no + 3)

fn_spans_no = sum(len(entry.spans_with_issue) for entry in extraction_evaluation["false_negative_spans"])
spans_recall = 5 / (fn_spans_no + 5)

In [None]:
# Example extractor output used by the true-positive span count in the next cell.
# NOTE(review): only "object_names" is defined here, but the next cell reads
# object_and_spans["descriptions_spans"], which raises KeyError with this dict. The object names
# also do not match the Longhi example evaluated above — presumably a leftover from another
# painting; confirm and add the matching "descriptions_spans" lists.
object_and_spans = {"object_names": ['Toledo', 'cathedral', 'Alcázar', 'Tagus river', 'Alcántara bridge']}

In [None]:
# Count the true-positive spans: extracted spans that the judge did NOT flag as false positives.
fp_spans = []
tp_spans_no = 0

# Fix: collect the judge's false POSITIVE spans. The original read "false_negative_spans", but
# false negatives are by definition not among the extracted spans, so filtering on them never
# excluded anything.
for fp_entry in extraction_evaluation["false_positive_spans"]:
    fp_spans.extend(fp_entry.spans_with_issue)


# NOTE(review): expects object_and_spans["descriptions_spans"] to be a list of span lists
# (presumably one list per extracted object) — confirm the dict actually provides that key.
for spans in object_and_spans["descriptions_spans"]:
    for span in spans:
        if span not in fp_spans:
            tp_spans_no += 1

In [None]:
# Assemble the feedback message that is sent back to the extractor LLM.
judge_suggestions = "An LLM-as-a-Judge has evaluated your previous output, and there are some issues with your object detection and span extraction. Here are the findings:\n\n"

# False positives are always reported when the judge found any.
if extraction_evaluation["false_positive_objects"] or extraction_evaluation["false_positive_spans"]:
    fp_spans, fp_objects_with_spans = get_object_extraction_fp_fn(extraction_evaluation, description, "positive")

    if fp_spans:
        # Fix: the explanation used to say "should have been extracted", which is the opposite
        # of what a false positive span is.
        judge_suggestions += f"False positives spans (spans that were extracted but should not have been extracted, although the extracted object is correct):\n{fp_spans}\n"

    if fp_objects_with_spans:
        judge_suggestions += f"False positives objects (objects and their spans that were extracted but should have not been extracted):\n{fp_objects_with_spans}\n"

fn_spans, fn_objects_with_spans = get_object_extraction_fp_fn(extraction_evaluation, description, "negative")

# False negatives are only reported when the corresponding recall is below its threshold.
if objects_recall < OBJECTS_RECALL_THRESHOLD and fn_objects_with_spans:
    judge_suggestions += f"False negative objects (objects together with their spans that were not extracted but should be considered):\n{fn_objects_with_spans}\n"

if spans_recall < SPANS_RECALL_THRESHOLD and fn_spans:
    judge_suggestions += f"False negative spans (spans that were not extracted, but should have been extracted):\n{fn_spans}\n"

# Closing instructions: the findings are suggestions, not ground truth.
judge_suggestions += (
    "Please review the painting and description again with these findings in mind. Ensure you capture all textual descriptions of these objects. "
    "Also, avoid identifying objects that at the same time aren't mentioned in the description and appear in the painting. Keep in mind that the list of findings represent only suggestions and some suggestions might be wrong. "
    "If you feel confident that some things you extracted initially are correct, but the suggestions indicate something different, please keep the objects / descriptions in the list of extractions."
)

In [None]:
# Display the assembled feedback message for inspection.
print(judge_suggestions)