## Install and Imports

In [None]:
!pip install openai==0.28.1

Collecting openai==0.28.1
  Downloading openai-0.28.1-py3-none-any.whl.metadata (11 kB)
Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m616.8 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.97.1
    Uninstalling openai-1.97.1:
      Successfully uninstalled openai-1.97.1
Successfully installed openai-0.28.1


In [None]:
import json
import nltk
import requests
import random
import openai
import pickle
import base64
import requests
import os
import re
from tqdm import tqdm
import sys
import pandas as pd


sys.path.append(os.path.abspath('../UtilsYF'))
from normal_utils import *


# Run GPT

In [None]:
def encode_image(image_path):
    """Return the contents of the file at `image_path` as a Base64 text string.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The Base64 encoding of the file's bytes, decoded to a UTF-8 `str`.
    """
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    return base64.b64encode(raw_bytes).decode("utf-8")

In [None]:
def query_gpt_v(image_path, prompt, model="gpt-4o-mini-2024-07-18"):
    """Send one image plus a text prompt to a chat model and return the reply text.

    Args:
        image_path: Path of the image file to attach.
        prompt: Text instruction sent after the image in the same user message.
        model: Model identifier passed to the chat-completion endpoint.

    Returns:
        The `content` of the first choice in the API response.
    """
    import mimetypes  # local import keeps this cell runnable on its own

    # Label the payload with the file's actual media type (PNG, WebP, ...)
    # rather than always claiming JPEG. Fall back to JPEG when the extension
    # is missing or does not map to an image type.
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # Inline the image as a Base64 data URL.
    encoded_image = encode_image(image_path)
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:{mime_type};base64,{encoded_image}"},
    }
    text_part = {"type": "text", "text": prompt}
    messages = [{"role": "user", "content": [image_part, text_part]}]

    # `openai.ChatCompletion` is the legacy interface of the pinned openai 0.28.x client.
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=2000,
        temperature=0.0,
    )

    return response.choices[0].message.content

In [None]:
# Benchmark definition files.
benchmark_path = "OP_bench/benchmark.json"
benchmark_path_VG = "ORBIT_VG_output.json"

# Result files, one per benchmark.
output_file = "4o-mini_temp0_results.json"
output_file_VG = "4o-mini_temp0_VG_results.json"

# Label shown in the printed analysis.
model_name = "gpt-4o-mini"

# Only the VG benchmark is loaded in this cell.
with open(benchmark_path_VG) as benchmark_fh:
    benchmark = json.load(benchmark_fh)

In [None]:
def clean_answer(answer):
    """Parse a model reply of the form ``<NUMBER> [<OBJECT1>, <OBJECT2>, ...]``.

    Args:
        answer: Raw reply. ``None`` is treated as an empty reply and any other
            non-string value is converted with ``str()``.

    Returns:
        A dict with two keys:
            "count": the first number found, as a digit string ("0" if none).
            "reasoning": the non-empty, stripped items of the bracketed list,
                or an empty list when the reply does not follow the format.
    """
    text = "" if answer is None else str(answer)

    # DOTALL lets the bracketed list span several lines.
    match = re.search(r'(\d+)\s*\[(.*?)\]', text, re.DOTALL)
    if match:
        items = (item.strip() for item in match.group(2).split(','))
        return {
            "count": match.group(1),
            # Drop blanks so "0 []" gives [] rather than [''].
            "reasoning": [item for item in items if item],
        }

    # Fallback when the format is not matched: keep the first number, if any.
    numbers = re.findall(r'\d+', text)
    return {
        "count": numbers[0] if numbers else "0",
        "reasoning": [],
    }

In [None]:
# Ask the model every question of every image and keep one record per question.
# (Per the earlier variant of this cell, the OP_bench file nests the list under
# benchmark['benchmark']['images'] and needs "OP_bench/" prefixed to each path.)
results = []
images = benchmark['images']
for idx, image_data in enumerate(tqdm(images, desc="Processing images")):
    image_path = image_data['path']
    image_results = []
    for question in image_data['questions']:
        prompt = f"{question['question']} Your response MUST be in the following format and nothing else:\n <NUMBER> [<OBJECT1>, <OBJECT2>, <OBJECT3>, ...]"
        answer = query_gpt_v(image_path, prompt, "gpt-4o-mini-2024-07-18")
        cleaned_answer = clean_answer(answer)

        image_results.append(dict(
            image_id=image_data["image_id"],
            image_type=image_data["image_type"],
            question_id=question["id"],
            question=question["question"],
            ground_truth=question["answer"],
            model_answer=cleaned_answer["count"],
            model_reasoning=cleaned_answer["reasoning"],
            raw_answer=answer,  # unparsed reply, kept for debugging
            property_category=question["property_category"],
        ))
    # Records are merged only after all questions of the image succeeded.
    results += image_results

Processing images: 100%|██████████| 44/44 [01:37<00:00,  2.21s/it]


In [None]:
# Persist the collected records as indented JSON.
with open(output_file_VG, 'w') as out_fh:
    json.dump(results, out_fh, indent=4)

# Analyse Results

In [None]:
def load_results(file_path):
    """Read the JSON document stored at `file_path` and return the parsed object."""
    with open(file_path, 'r') as fh:
        payload = json.load(fh)
    return payload

In [None]:
def calculate_accuracy(predictions, ground_truths):
    """Return the fraction of predictions whose string form equals the ground truth's.

    Inputs exposing `.tolist()` (e.g. pandas Series) are converted to plain
    lists first. An empty prediction list yields 0.
    """
    pred_list = predictions.tolist() if hasattr(predictions, 'tolist') else predictions
    truth_list = ground_truths.tolist() if hasattr(ground_truths, 'tolist') else ground_truths

    if not pred_list:
        return 0

    # Values are compared as strings, so "3" matches 3.
    matches = 0
    for guess, truth in zip(pred_list, truth_list):
        if str(guess) == str(truth):
            matches += 1
    return matches / len(pred_list)

In [None]:
# Exact-match accuracy for the answers stored in `output_file`.
# NOTE(review): the cells above only write `output_file_VG`, so this presumably
# relies on a results file left by an earlier run — confirm it exists before Run All.
# Rebinds the notebook-level names `results`, `df` and `overall_accuracy`.
results = load_results(output_file)
df = pd.DataFrame(results)
overall_accuracy = calculate_accuracy(df['model_answer'], df['ground_truth'])
print(f"4o-mini Overall Accuracy: {overall_accuracy:.2%}")

4o-mini Overall Accuracy: 30.37%


In [None]:
# Exact-match accuracy for the answers stored in `output_file_VG`.
# Rebinds the notebook-level names `results`, `df` and `overall_accuracy`.
results = load_results(output_file_VG)
df = pd.DataFrame(results)
overall_accuracy = calculate_accuracy(df['model_answer'], df['ground_truth'])
print(f"4o-mini Overall Accuracy for orbit-vg: {overall_accuracy:.2%}")

4o-mini Overall Accuracy for orbit-vg: 36.36%


In [None]:
def calculate_error_metrics(predictions, ground_truths, category=None):
    """
    Calculate MAE, MSE, and RMSE between predictions and ground truths.

    Args:
        predictions: Iterable of predicted counts (ints or digit strings)
        ground_truths: Iterable of ground truth counts (ints or digit strings)
        category: Optional category name for the analysis

    Returns:
        Dictionary with 'MAE', 'MSE', 'RMSE' and 'sample_size', plus
        'category' when a truthy category is given. With no samples the three
        metrics are NaN and 'sample_size' is 0.

    Raises:
        ValueError: If a value cannot be parsed as an integer.
    """
    # Convert to numeric values if they're strings
    preds = [int(str(p)) for p in predictions]
    truths = [int(str(g)) for g in ground_truths]

    # Absolute error per (prediction, truth) pair
    differences = [abs(p - t) for p, t in zip(preds, truths)]
    sample_size = len(differences)

    if sample_size:
        mae = sum(differences) / sample_size
        mse = sum(d * d for d in differences) / sample_size
        rmse = mse ** 0.5
    else:
        # Nothing to average: report NaN rather than dividing by zero.
        mae = mse = rmse = float('nan')

    result = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'sample_size': sample_size
    }

    if category:
        result['category'] = category

    return result

In [None]:
def calculate_off_by_n_accuracy(predictions, ground_truths, n=1):
    """
    Calculate accuracy within n counts of the ground truth.

    Args:
        predictions: Iterable of predicted counts (ints or digit strings)
        ground_truths: Iterable of ground truth counts (ints or digit strings)
        n: Maximum allowed difference (default=1)

    Returns:
        Dictionary with, for every tolerance k in 0..n and in this order:
        'off_by_k' (share of predictions at most k away), then
        'exactly_k' (share of predictions exactly k away).
        All shares are 0.0 when there are no predictions.

    Raises:
        ValueError: If a value cannot be parsed as an integer.
    """
    # Convert to numeric values if they're strings
    preds = [int(str(p)) for p in predictions]
    truths = [int(str(g)) for g in ground_truths]

    # Compute each absolute difference once instead of once per tolerance.
    abs_diffs = [abs(p - t) for p, t in zip(preds, truths)]
    total = len(preds)

    def share(count):
        # Empty input would otherwise divide by zero.
        return count / total if total else 0.0

    results = {}
    # Cumulative accuracies
    for tolerance in range(n + 1):
        results[f'off_by_{tolerance}'] = share(sum(1 for d in abs_diffs if d <= tolerance))

    # Individual accuracies (differences are non-negative, so exactly_0 == off_by_0)
    for tolerance in range(n + 1):
        results[f'exactly_{tolerance}'] = share(sum(1 for d in abs_diffs if d == tolerance))

    return results

In [None]:
def analyze_error_distribution(predictions, ground_truths):
    """
    Analyze the distribution of signed counting errors (prediction - truth).

    Args:
        predictions: Iterable of predicted counts (ints or digit strings)
        ground_truths: Iterable of ground truth counts (ints or digit strings)

    Returns:
        Dictionary with:
            'mean_error': average signed error (NaN when there are no samples)
            'median_error': median signed error; for an even number of
                samples this is the mean of the two middle values
                (NaN when there are no samples)
            'over_count' / 'under_count' / 'exact_count': number of
                positive / negative / zero errors
            'error_distribution': list of the signed errors

    Raises:
        ValueError: If a value cannot be parsed as an integer.
    """
    import statistics  # local import keeps this cell runnable on its own

    # Convert to numeric values if they're strings
    preds = [int(str(p)) for p in predictions]
    truths = [int(str(g)) for g in ground_truths]

    # Signed errors: positive means the model over-counted
    errors = [p - t for p, t in zip(preds, truths)]

    if errors:
        mean_error = sum(errors) / len(errors)
        # statistics.median averages the two middle values for even-length
        # input; indexing sorted(errors)[len // 2] picked the upper one only.
        median_error = statistics.median(errors)
    else:
        # No samples: report NaN rather than dividing by zero / indexing [].
        mean_error = median_error = float('nan')

    # Count over/under predictions
    over_count = sum(1 for e in errors if e > 0)
    under_count = sum(1 for e in errors if e < 0)
    exact_count = sum(1 for e in errors if e == 0)

    return {
        'mean_error': mean_error,
        'median_error': median_error,
        'over_count': over_count,
        'under_count': under_count,
        'exact_count': exact_count,
        'error_distribution': errors
    }

In [None]:
def _print_error_metrics(metrics):
    """Print the numeric entries of an error-metrics dict, skipping bookkeeping keys."""
    for metric, value in metrics.items():
        if metric not in ('category', 'sample_size'):
            print(f"{metric}: {value:.2f}")


def _accuracy_by_group(df, column):
    """Return a Series of exact-match accuracy for each value of `column`."""
    # Select the answer columns before .apply so the callback never receives
    # the grouping column; applying over it is deprecated in recent pandas.
    return df.groupby(column)[['model_answer', 'ground_truth']].apply(
        lambda group: calculate_accuracy(group['model_answer'], group['ground_truth'])
    )


def _error_metrics_by_group(df, column):
    """Return {value: error metrics} for each non-null value of `column`, in order of appearance."""
    grouped_metrics = {}
    # Null group values are skipped: they select no rows (NaN != NaN), and
    # groupby leaves them out of the accuracy tables as well.
    for value in df[column].dropna().unique():
        subset = df[df[column] == value]
        grouped_metrics[value] = calculate_error_metrics(
            subset['model_answer'],
            subset['ground_truth'],
            value
        )
    return grouped_metrics


def analyze_results(results_file, model_name):
    """Analyze results across different dimensions with additional metrics.

    Loads the per-question records from `results_file`, prints a report and
    returns the computed values.

    Args:
        results_file: Path of the JSON results file (read with load_results).
        model_name: Label printed in front of the overall accuracy.

    Returns:
        Dictionary with:
            'overall': overall exact-match accuracy
            'overall_error_metrics': MAE/MSE/RMSE over all records
            'question_error_metrics', 'image_error_metrics',
            'property_error_metrics': error metrics per question category,
                image type and property category
            'off_by_n': off-by-N accuracies for n=2
            'error_distribution': signed-error statistics
            'question_type', 'image_type', 'property': accuracy Series per group
            'df': the DataFrame with the added 'question_type' and
                'question_category' columns
    """
    results = load_results(results_file)

    # Convert results to DataFrame for easier analysis
    df = pd.DataFrame(results)

    # 1. Overall Accuracy
    overall_accuracy = calculate_accuracy(df['model_answer'], df['ground_truth'])
    print(f"\n{model_name} Overall Accuracy: {overall_accuracy:.2%}")

    # 2. Overall Error Metrics
    overall_error_metrics = calculate_error_metrics(df['model_answer'], df['ground_truth'], 'Overall')
    print("\nOverall Error Metrics:")
    _print_error_metrics(overall_error_metrics)

    # 3. Off-by-N Accuracy (Overall)
    off_by_n = calculate_off_by_n_accuracy(df['model_answer'], df['ground_truth'], n=2)
    print("\nOverall Off-by-N Accuracy:")
    for n, acc in off_by_n.items():
        print(f"{n}: {acc:.2%}")

    # 4. Error Distribution (Overall)
    error_dist = analyze_error_distribution(df['model_answer'], df['ground_truth'])
    print("\nOverall Error Distribution:")
    print(f"Mean Error: {error_dist['mean_error']:.2f}")
    print(f"Median Error: {error_dist['median_error']:.2f}")
    print(f"Over-counts: {error_dist['over_count']}")
    print(f"Under-counts: {error_dist['under_count']}")
    print(f"Exact counts: {error_dist['exact_count']}")

    # 5. Question-type Analysis: "Q<k>" ids become the integer k, then a label
    df['question_type'] = df['question_id'].map(lambda qid: int(qid.replace('Q', '')))
    df['question_category'] = df['question_type'].map({
        1: 'Direct Recognition',
        2: 'Property Inference',
        3: 'Counterfactual'
    })

    question_accuracies = _accuracy_by_group(df, 'question_category')
    print("\nAccuracy by Question Type:")
    for q_type, acc in question_accuracies.items():
        print(f"{q_type}: {acc:.2%}")

    # 6. Image Type Analysis
    image_type_accuracies = _accuracy_by_group(df, 'image_type')
    print("\nAccuracy by Image Type:")
    for img_type, acc in image_type_accuracies.items():
        print(f"{img_type}: {acc:.2%}")

    # 7. Property Category Analysis
    property_accuracies = _accuracy_by_group(df, 'property_category')
    print("\nAccuracy by Property Category:")
    for prop, acc in property_accuracies.items():
        print(f"{prop}: {acc:.2%}")

    # 8. Error metrics for each question category, image type and property category
    question_error_metrics = _error_metrics_by_group(df, 'question_category')
    print("\nError Metrics by Question Type:")
    for q_type, metrics in question_error_metrics.items():
        print(f"\n{q_type}:")
        _print_error_metrics(metrics)

    image_error_metrics = _error_metrics_by_group(df, 'image_type')
    print("\nError Metrics by Image Type:")
    for i_type, metrics in image_error_metrics.items():
        print(f"\n{i_type}:")
        _print_error_metrics(metrics)

    property_error_metrics = _error_metrics_by_group(df, 'property_category')
    print("\nError Metrics by Property Category:")
    for p_type, metrics in property_error_metrics.items():
        print(f"\n{p_type}:")
        _print_error_metrics(metrics)

    return {
        'overall': overall_accuracy,
        'overall_error_metrics': overall_error_metrics,
        'question_error_metrics': question_error_metrics,
        'image_error_metrics': image_error_metrics,
        'property_error_metrics': property_error_metrics,
        'off_by_n': off_by_n,
        'error_distribution': error_dist,
        'question_type': question_accuracies,
        'image_type': image_type_accuracies,
        'property': property_accuracies,
        'df': df
    }

In [None]:
# Print the full report for the answers stored in `output_file`.
# Note: this rebinds `results` to the summary dict returned by analyze_results.
results = analyze_results(output_file, model_name)


gpt-4o-mini Overall Accuracy: 30.37%

Overall Error Metrics:
MAE: 1.94
MSE: 230.88
RMSE: 15.19

Overall Off-by-N Accuracy:
off_by_0: 30.37%
off_by_1: 64.17%
off_by_2: 80.46%
exactly_0: 30.37%
exactly_1: 33.80%
exactly_2: 16.30%

Overall Error Distribution:
Mean Error: -0.36
Median Error: 0.00
Over-counts: 230
Under-counts: 522
Exact counts: 328

Accuracy by Question Type:
Counterfactual: 28.61%
Direct Recognition: 28.89%
Property Inference: 33.61%

Accuracy by Image Type:
AI_GENERATED: 27.78%
ANIMATED: 38.89%
REAL: 24.44%

Accuracy by Property Category:
functional: 25.00%
physical: 21.31%
relational: 37.81%
taxonomic: 34.23%

Error Metrics by Question Type:

Direct Recognition:
MAE: 2.96
MSE: 683.59
RMSE: 26.15

Property Inference:
MAE: 1.30
MSE: 3.92
RMSE: 1.98

Counterfactual:
MAE: 1.56
MSE: 5.11
RMSE: 2.26

Error Metrics by Image Type:

REAL:
MAE: 1.83
MSE: 6.96
RMSE: 2.64

ANIMATED:
MAE: 2.53
MSE: 681.17
RMSE: 26.10

AI_GENERATED:
MAE: 1.47
MSE: 4.49
RMSE: 2.12

Error Metrics by P

  question_accuracies = df.groupby('question_category').apply(
  image_type_accuracies = df.groupby('image_type').apply(
  property_accuracies = df.groupby('property_category').apply(
