In [None]:
!pip install openai

In [37]:
import os
import pandas as pd
import openai
from tqdm import tqdm
from PIL import Image
import base64
import io
import time
import json

# Size of the evaluation subset and the seed shared by every sampling step.
TARGET_SAMPLE_SIZE = 3000
SEED = 42

df = pd.read_csv("generated_captions.csv", encoding="utf-8-sig")
# Rows without an image path cannot be evaluated (and would break the string
# handling below), so they are dropped before de-duplicating on the path.
df = df.dropna(subset=["image"]).drop_duplicates(subset=["image"])
# The art type is the second "/"-separated component of the image path;
# paths that contain no "/" are labelled "unknown".
df["art_type"] = df["image"].apply(lambda x: x.split("/")[1] if "/" in x else "unknown")

# Coverage guarantee: take 1 sample from every art type.
# Each group is sampled explicitly with the same seed, which selects the same rows
# as DataFrameGroupBy.apply(lambda g: g.sample(...)) did, but without the pandas
# deprecation warning about apply operating on the grouping columns.
initial_df = pd.concat(
    [group.sample(n=1, random_state=SEED) for _, group in df.groupby("art_type")]
).reset_index(drop=True)

# Rows that are still available after the per-type picks.
remaining_pool = df[~df["image"].isin(initial_df["image"])]

# How many more rows are needed to reach the target, clamped so that the draw
# never asks for a negative count or for more rows than the pool contains.
remaining_n = min(max(TARGET_SAMPLE_SIZE - len(initial_df), 0), len(remaining_pool))

# Uniform random draw from the pool: art types end up represented roughly in
# proportion to their frequency in the data (this is not an exact stratified split).
proportional_sample = remaining_pool.sample(n=remaining_n, random_state=SEED)

# Combine both parts and shuffle so the guaranteed picks are not grouped at the top.
df = pd.concat([initial_df, proportional_sample], ignore_index=True)
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)

print(f"Sampled exactly {len(df)} examples across {df['art_type'].nunique()} art types.")
print(df['art_type'].value_counts())

Sampled exactly 3000 examples across 27 art types.
art_type
Impressionism                 448
Realism                       364
Romanticism                   265
Expressionism                 260
Baroque                       217
Post_Impressionism            202
Symbolism                     167
Art_Nouveau_Modern            156
Abstract_Expressionism        112
Northern_Renaissance           97
Cubism                         86
Naive_Art_Primitivism          80
Color_Field_Painting           65
Pop_Art                        65
High_Renaissance               64
Rococo                         55
Minimalism                     54
Mannerism_Late_Renaissance     49
Fauvism                        43
Ukiyo_e                        41
Early_Renaissance              41
Pointillism                    21
Contemporary_Realism           18
New_Realism                    16
Synthetic_Cubism                6
Analytical_Cubism               5
Action_painting                 3
Name: count, dtype: in

  initial_df = df.groupby("art_type", group_keys=False).apply(


In [None]:
import os
import pandas as pd
import openai
from tqdm import tqdm
from PIL import Image
import base64
import io
import time
import json

# The API key is read from the OPENAI_API_KEY environment variable instead of
# being written into the notebook, so the secret is never saved or shared with it.
# A missing variable fails immediately with a KeyError naming the variable.
client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Directory that the relative paths stored in df["image"] are joined onto.
image_base = "../../../../ibex/ai/home/saitaa0b/"

# One dict per successfully judged image; filled by the evaluation loop below.
results = []

# Structured output: schema of the single function the model is asked to call.
def _criteria_scores_schema():
    """Return a fresh JSON-schema object with one integer score per criterion."""
    criteria = ("emotional", "semantic", "grammar")
    return {
        "type": "object",
        "properties": {criterion: {"type": "integer"} for criterion in criteria},
        "required": list(criteria),
    }


functions = [
    {
        "name": "evaluate_captions",
        "description": "Evaluate two Arabic captions based on three criteria and return structured scores.",
        "parameters": {
            "type": "object",
            "properties": {
                # Caption A and caption B share the same score layout.
                "scores_a": _criteria_scores_schema(),
                "scores_b": _criteria_scores_schema(),
                "winner": {"type": "string", "enum": ["A", "B"]},
                "justification": {"type": "string"},
            },
            "required": ["scores_a", "scores_b", "winner", "justification"],
        },
    }
]

# Converts an image to base64 so it can be embedded in the GPT-4o request.
def encode_image(image_path):
    """Return the image at ``image_path`` as a base64-encoded JPEG string.

    The image is converted to RGB and re-encoded as JPEG in memory, so the
    result is always JPEG data regardless of the source file format.

    Args:
        image_path: Path of an image file that PIL can open.

    Returns:
        str: Base64 text of the JPEG bytes (without any ``data:`` URI prefix).
    """
    # The context manager wraps the opened file itself: `.convert()` returns a
    # new image, so wrapping its result would not close the source file handle.
    with Image.open(image_path) as img:
        rgb_image = img.convert("RGB")
        buffered = io.BytesIO()
        rgb_image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode()

def gpt4o_judge_structured(sample):
    """Ask GPT-4o to score two Arabic captions for one image.

    The image is sent inline as a base64 JPEG data URL together with both
    captions, and the model is forced to answer through the
    ``evaluate_captions`` function schema defined in ``functions``.

    Args:
        sample: Mapping with the keys ``"image_path"``, ``"pred_allam"`` and
            ``"pred_artelingo"``.

    Returns:
        dict: The parsed arguments of the ``evaluate_captions`` call
        (``scores_a``, ``scores_b``, ``winner``, ``justification`` per the schema).

    Raises:
        ValueError: If the response does not contain a function call.
        json.JSONDecodeError: If the call arguments are not valid JSON.
        Exception: Anything raised by image encoding or by the API client.
    """
    encoded = encode_image(sample["image_path"])

    # NOTE(review): the prompt labels each caption with the name of the model that
    # produced it, which may bias the judge; left unchanged here so that results
    # stay comparable with earlier runs — confirm whether anonymising is wanted.
    messages = [
        {
            "role": "system",
            "content": "You are a multilingual vision-language expert evaluating Arabic image captions."
        },
        {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}
                },
                {
                    "type": "text",
                    "text": f"""Evaluate the following two Arabic captions based on:
1. Emotional Expressiveness - Does the caption evoke emotion, artistic depth, or poetic tone beyond basic factual description? High scores go to captions that introduce new, emotionally powerful or poetic words.
2. Semantic Accuracy - How well does the caption reflect the actual visual content of the image?
3. Grammatical Correctness - Is the caption grammatically correct, coherent, and well-written in Arabic?

Give each a score from 1 to 5 for each criterion.

Then return:
- scores_a (caption A),
- scores_b (caption B),
- winner (A or B based on average),
- justification.

Caption A (ALLAM): {sample['pred_allam']}
Caption B (ArtELingo): {sample['pred_artelingo']}"""
                }
            ]
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        # `functions` / `function_call` are deprecated request fields; the same
        # schema is sent through `tools`, and `tool_choice` forces the call.
        tools=[{"type": "function", "function": spec} for spec in functions],
        tool_choice={"type": "function", "function": {"name": "evaluate_captions"}},
        temperature=0
    )

    tool_calls = response.choices[0].message.tool_calls
    if not tool_calls:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError("GPT-4o response did not contain an evaluate_captions call")

    return json.loads(tool_calls[0].function.arguments)

# Evaluation loop settings.
CHECKPOINT_EVERY = 100    # write the temp CSV after this many successful judgments
MAX_ATTEMPTS = 3          # tries per image before it is given up on
RETRY_WAIT_SECONDS = 10   # pause after every failed attempt

# Images that still failed after MAX_ATTEMPTS, reported at the end.
failed_images = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Evaluating with structured GPT-4o"):
    image_path = os.path.join(image_base, row["image"])
    if not os.path.exists(image_path):
        print(f"Missing image: {image_path}")
        continue

    sample = {
        "image_path": image_path,
        "pred_allam": row["pred_allam"],
        "pred_artelingo": row["pred_artelingo"],
        "art_type": row["art_type"]
    }

    # Retry the same image a few times so that a transient failure (rate limit,
    # timeout, malformed response) does not silently drop it from the results.
    result_row = None
    for _attempt in range(MAX_ATTEMPTS):
        try:
            result = gpt4o_judge_structured(sample)

            # Building the row inside the try means a response with missing keys
            # is handled like any other failed attempt.
            result_row = {
                "image": os.path.basename(image_path),
                "art_type": sample["art_type"],
                "pred_allam": sample["pred_allam"],
                "pred_artelingo": sample["pred_artelingo"],
                "winner": result["winner"],
                "justification": result["justification"],
                "allam_emotional": result["scores_a"]["emotional"],
                "allam_semantic": result["scores_a"]["semantic"],
                "allam_grammar": result["scores_a"]["grammar"],
                "artelingo_emotional": result["scores_b"]["emotional"],
                "artelingo_semantic": result["scores_b"]["semantic"],
                "artelingo_grammar": result["scores_b"]["grammar"]
            }
            break
        except Exception as e:
            print(f"Error on image {row['image']}: {e}")
            time.sleep(RETRY_WAIT_SECONDS)

    if result_row is None:
        failed_images.append(row["image"])
        continue

    results.append(result_row)

    # Checkpoint on the number of collected results rather than on the row index,
    # so a failed or missing row can never cause a checkpoint to be skipped.
    # The tail after the last checkpoint is covered by the final save below.
    if len(results) % CHECKPOINT_EVERY == 0:
        pd.DataFrame(results).to_csv("gpt4o_structured_judgments_temp.csv", index=False, encoding="utf-8-sig")

results_df = pd.DataFrame(results)
results_df.to_csv("gpt4o_structured_judgments.csv", index=False, encoding="utf-8-sig")
print("Saved final structured results to 'gpt4o_structured_judgments.csv'")

if failed_images:
    print(f"{len(failed_images)} image(s) could not be judged after {MAX_ATTEMPTS} attempts: {failed_images}")

Evaluating with structured GPT-4o:   0%|          | 15/3000 [01:49<5:13:10,  6.29s/it]