In [1]:
%load_ext autoreload
%autoreload 2

In [5]:
from langchain_openai import ChatOpenAI
from datasets import load_dataset
from langchain_core.prompts import PromptTemplate
import re
import json
import random
from tqdm.notebook import tqdm
import pandas as pd

In [11]:
model_name = "gpt-4o-mini"
llm = ChatOpenAI(model=model_name, temperature=0.01)

In [6]:
dataset = load_dataset("Dogdays/clevr_subset", token=True)["train"]

In [7]:
PROMPT_TEMPLATE = """You are a careful assistant helping a visually impaired person to answer questions regarding a scene. The scene is described in JSON format.

Scene: {scene}

First examine the scene and then check the following question. Reason about how this question can be answered before provide the final answer. The answer should either be a number, yes or no or an attribute value.

Give the final answer in the following JSON format:
```json
{{
    "answer": "<answer to the question>"
}}
```

Question: {question}
"""

In [8]:
prompt_template = PromptTemplate.from_template(PROMPT_TEMPLATE)

In [9]:
def extract_json(text):
    json_pattern = re.compile(r"```json\n((.|\n)*?)\n```")

    match = json_pattern.search(text)
    if match:
        return json.loads(match.group(1))
    else:
        return None

In [10]:
chain = prompt_template | llm
results = []

for sample in tqdm(dataset):
    scene = sample["scene"]
    question = sample["question"]
    result = chain.invoke(input={
        "scene": scene,
        "question": question
    })

    answer = None
    while answer is None:
        try:
            answer = extract_json(result.content)["answer"]
        except json.JSONDecodeError:
            print("JSON Decode Error")
            continue
    results.append(answer)

  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [31]:
answers = [question["answer"] for question in questions]

In [32]:
df = pd.DataFrame({
    "predicted": results,
    "actual": answers
})

df.to_csv(f"direct_prompt_results_{model_name}.csv", index=False)

In [33]:
correct = [1 if str(p) == str(a) else 0 for p, a in zip(results, answers)]
print(len(correct), sum(correct), sum(correct) / len(correct))

100 91 0.91


In [10]:
num_objects = []
for scene in scenes:
    num_objects.append(len(scene["objects"]))

In [10]:
intervals = [range(3, 6), range(6, 9), range(9, 11)]

In [13]:
def get_questions(questions, scenes, objects_interval, count=20):
    filtered_questions = []

    for question in questions:
        scene = scenes[question["image_index"]]
        num_objects = len(scene["objects"])
        
        if num_objects in objects_interval:
            filtered_questions.append(question)
        
        if len(filtered_questions) >= count:
            break
    
    return filtered_questions

In [14]:
chain = prompt_template | llm
results = []

for i in range(3):
    filtered_questions = get_questions(questions, scenes, intervals[i], count=20)
    correct = 0
    for question in tqdm(filtered_questions):
        scene = scenes[question["image_index"]]
        process_scene(scene)
        result = chain.invoke(input={
            "scene": scene,
            "question": question["question"]
        })

        try:
            answer = extract_json(result.content)["answer"]
        except json.JSONDecodeError:
            continue
        if  question["answer"].lower() in answer.lower():
            correct += 1
    results.append(correct)


  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
for i in range(3):
    print(f"Accuracy for {intervals[i]} objects: {results[i] / 20 * 100:.2f}%")

Accuracy for range(3, 6) objects: 80.00%
Accuracy for range(6, 9) objects: 85.00%
Accuracy for range(9, 11) objects: 85.00%


In [10]:
def get_question_by_category(questions, category, count=20):
    filtered_questions = []

    for question in questions:
        if category == "count" and question["answer"].isdigit():
            filtered_questions.append(question)
        elif category == "judge" and question["answer"].lower() in ["yes", "no"]:
            filtered_questions.append(question)
        elif category == "query" and question["answer"].lower() not in ["yes", "no"] and not question["answer"].isdigit():
            filtered_questions.append(question)
        
        if len(filtered_questions) >= count:
            break
    
    return filtered_questions

In [15]:
chain = prompt_template | llm

results = []
answers = []
for category in ["count", "judge", "query"]:
    filtered_questions = get_question_by_category(questions, category, count=20)
    correct = 0
    ans = []
    for question in tqdm(filtered_questions):
        scene = scenes[question["image_index"]]
        process_scene(scene)
        result = chain.invoke(input={
            "scene": scene,
            "question": question["question"]
        })

        try:
            answer = extract_json(result.content)["answer"]
        except json.JSONDecodeError:
            continue
        if  str(question["answer"]).lower() in answer.lower():
            correct += 1
        ans.append((question["answer"], answer))
    answers.append(ans)
    results.append(correct)

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

In [16]:
for i, category in enumerate(["count", "judge", "query"]):
    print(f"Accuracy for {category} questions: {results[i] / 20 * 100:.2f}%")

Accuracy for count questions: 90.00%
Accuracy for judge questions: 85.00%
Accuracy for query questions: 70.00%


In [35]:
model_name = "gpt-4o-mini"
df = pd.read_csv(f"direct_prompt_results_{model_name}.csv")

print(len(df), sum(df["predicted"] == df["actual"]), sum(df["predicted"] == df["actual"]) / len(df))

100 82 0.82


Performance of directly question answering using the JSON object
* 4o-mini
    * Clevr human: COT exact match: 13 / 25
    * Clevr Val: 18 / 25
* 4o
    * Clevr human: COT exact match: 18 / 25
    * Clevr Val: 21 / 25