# Judge annotations
This notebook is used for the development of the LLM-as-a-judge.

### 0. Import libraries

In [None]:
import os
import json
from pprint import pprint

from openai import OpenAI
from pydantic import BaseModel

### 1. LLM-as-a-judge for the object descriptions

In [None]:
# One judged criterion: a numeric score together with the text explaining it.
# (Documented with comments rather than a docstring, since pydantic copies a
# model docstring into the generated JSON schema.)
class ScoreExplanation(BaseModel):
    # Numeric rating; presumably on the 1-5 scale requested by the judge
    # prompt -- the model itself does not enforce a range.
    score: int
    # Free-text reasoning for the score.
    explanation: str

In [None]:
# Verbose judge verdict: one ScoreExplanation (score + explanation) for each
# of the four criteria named in the judge system prompt.
# Field order is kept as-is because it determines the order of the properties
# in the schema derived from this model.
class DescriptionScoreEvaluation(BaseModel):
    factual_accuracy: ScoreExplanation
    coherence: ScoreExplanation
    grounding_potential: ScoreExplanation
    completeness: ScoreExplanation

In [None]:
# Compact judge verdict: a bare integer score for each of the four criteria
# named in the judge system prompt, with no explanations attached.
# judge_output passes this class as `response_format`, so it is documented
# with comments only -- a docstring would be copied by pydantic into the JSON
# schema that is sent along with the request.
class DescriptionScore(BaseModel):
    factual_accuracy: int
    coherence: int
    grounding_potential: int
    completeness: int

In [None]:
def get_judge_llm_client(keys_path="../../config/keys.json"):
    """Create the OpenAI client used by the judge.

    The API key is read from the ``"openai_api_key"`` entry of a JSON file and
    is also exported as the ``OPENAI_API_KEY`` environment variable of the
    current process (overwriting any previous value), exactly as before.

    Args:
        keys_path: Path of the JSON file holding the key. The default is
            relative to the kernel's working directory, so pass an explicit
            path when the notebook is run from somewhere else.

    Returns:
        An ``OpenAI`` client configured with the key from the file.

    Raises:
        KeyError: If the file has no ``"openai_api_key"`` entry. The message
            names the file that was read.
    """
    with open(keys_path, "r", encoding="utf-8") as file:
        keys = json.load(file)

    try:
        api_key = keys["openai_api_key"]
    except KeyError:
        # Same exception type as the plain lookup, but say which file is wrong.
        raise KeyError(
            f'no "openai_api_key" entry found in {keys_path}'
        ) from None

    # Kept for backward compatibility: the key is exposed through the process
    # environment as well as being handed to the client directly.
    os.environ['OPENAI_API_KEY'] = api_key

    return OpenAI(api_key=api_key)

In [None]:
def judge_output(client, user_prompt=None, response_format=None, model="gpt-4.1-nano"):
    """Ask the judge LLM to score a generated object description.

    The request is sent through ``client.beta.chat.completions.parse`` with
    ``seed=0`` and ``temperature=0`` (presumably to keep scores as repeatable
    as possible) and a fixed system prompt that defines four criteria:
    factual accuracy, coherence, grounding potential and completeness.

    Called with only ``client``, the function behaves exactly as before: it
    judges the built-in "hawk" example with ``DescriptionScore`` as schema.

    Args:
        client: Object exposing ``beta.chat.completions.parse`` (the notebook
            passes the result of ``get_judge_llm_client()``).
        user_prompt: Text describing the case to judge. The system prompt
            announces three parts -- object name, original description spans
            and generated description. ``None`` selects the built-in example.
        response_format: Schema class for the structured answer. ``None``
            selects ``DescriptionScore`` (integer scores only). Pass
            ``DescriptionScoreEvaluation`` to also receive the per-criterion
            justification that the system prompt asks the model for.
        model: Name of the chat model to query.

    Returns:
        The response object returned by the ``parse`` call, unchanged; the
        notebook reads ``response.choices[0].message`` from it.
    """
    system_prompt = """You are an expert evaluator assessing the quality of object descriptions from paintings generated by a language model. You will be given the following:

    1.  **Object Name:** The name of the object.
    2.  **Original Description Spans:** The text spans from which the object description was generated.
    3.  **Generated Description:** The description created by the language model.

    Your task is to evaluate the generated description based on the following criteria, providing a score (1-5) and a brief justification for each:

    **Evaluation Criteria:**

    *   **Factual Accuracy (1-5):**  Does the generated description accurately reflect the information provided in the original description spans? Does it avoid hallucination or the addition of information not present in the spans? (1 = Completely inaccurate, 5 = Perfectly accurate)
    *   **Coherence (1-5):** Is the generated description well-written and easy to understand? Does it flow logically and make sense as a complete description? (1 = Incoherent and confusing, 5 = Perfectly coherent and clear)
    *   **Grounding Potential (1-5):** How suitable is the generated description for use with a visual grounding model? Does it focus on visual attributes and provide specific details that would help a grounding model locate the object in an image? (1 = Very poor for grounding, 5 = Excellent for grounding)
    *   **Completeness (1-5):** Does the description include all the information that is provided in the spans? (1 = Very poor completeness, 5 = Perfect completeness)"""

    # Built-in example, used when the caller does not supply a prompt.
    example_user_prompt = """Object Name: hawk

    Original Description Spans:
    - hawk contemplating itself in a mirror
    - the hawk represents the material world
    - just as the bird "watches itself in a glass, waiting for the image to move so as to know which is really alive, itself or the image"
    - painted the bird from an Egyptian carving. Thus the "real" hawk is immobile and the flying hawk in the mirror is an illusion. The carving from which the hawk was painted now adorns the artist's grave.

    Generated Description: A hawk is depicted contemplating itself in a mirror. The hawk represents the material world, and it is described as watching itself in a glass, waiting for the image to move in order to know which is really alive, itself or the image. The artist painted the bird from an Egyptian carving, which makes the "real" hawk immobile while the flying hawk in the mirror is an illusion. The carving from which the hawk was painted now adorns the artist's grave."""

    if user_prompt is None:
        user_prompt = example_user_prompt
    if response_format is None:
        # Looked up at call time so the schema class only has to exist when
        # the default is actually needed.
        response_format = DescriptionScore

    response = client.beta.chat.completions.parse(
        model=model,
        seed=0,
        temperature=0,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_format=response_format,
    )

    return response

In [None]:
# Build the OpenAI client and run the judge once with judge_output's default
# arguments. Note that this sends a request to the API. The resulting
# `response` is unpacked in the following cell.
judge_client = get_judge_llm_client()
response = judge_output(judge_client)

In [None]:
# Unpack the first choice of the judge's reply: print the refusal when no
# parsed result is present, otherwise pretty-print the parsed scores as a dict.
message = response.choices[0].message

if not message.parsed:
    print(message.refusal)
else:
    description_evaluation = dict(message.parsed)
    pprint(description_evaluation)