In [5]:
from dotenv import load_dotenv
# Load variables from a `.env` file into the process environment so the API keys
# read later via os.getenv / os.environ are available.
load_dotenv()

True

In [6]:
# System prompt for the answering models.
# The parentheses are required: adjacent string literals are only concatenated
# inside a single expression. Without them just the first literal is assigned and
# the following lines are evaluated and discarded as bare string expressions.
SYSTEM_PROMPT = (
    "You are an assistant, which answers questions for the given text and *optional* image as a context. "
    "If it's indicated that an image is passed - then pay much more attention to the image to capture text and structure from there. "
    "Don't use your prior knowledge of the particular object passed to you. Base your answer on the info given from the user. "
    "If you can't answer the question based on input, say `no answer`. Don't come up with vague explanations."
)

# Task prompt for the text-only setting; `{}` is filled with the question via str.format.
TASK_PROMPT_TEXT_INPUT = "Given the context with text description of a diagram, create a short answer for the following question: {}"

# Task prompt for the text + image setting; `{}` is filled with the question via str.format.
TASK_PROMPT_TEXT_IMAGE_INPUT = "Given the context with text description of a diagram and the *diagram image itself*, create a short answer for the following question: {}"

In [7]:
import os
import json
import base64
from pathlib import Path
from tqdm import tqdm

from PIL import Image
from openai import OpenAI
import anthropic


# API clients, created once and reused further down in the notebook.
# All keys are read from environment variables.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# The Qwen models are called through the OpenAI client pointed at the
# DashScope OpenAI-compatible endpoint.
alibaba_client = OpenAI(
    api_key=os.getenv("ALIBABA_API_KEY"),
    base_url="https://dashscope-intl.aliyuncs.com/compatible-mode/v1",
)
# Prefer the correctly spelled variable; fall back to the legacy misspelled
# name so existing `.env` files keep working.
anthropic_client = anthropic.Anthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY") or os.getenv("ANTROPIC_API_KEY")
)


def load_image(img_fpath):
    """Open an image file and also return its raw bytes as base64 text.

    Args:
        img_fpath: Path of the image file on disk.

    Returns:
        A ``(image, base64_image)`` tuple: the object returned by ``Image.open``
        and the file's bytes, base64-encoded and decoded to a UTF-8 ``str``.
    """
    pil_image = Image.open(img_fpath)
    with open(img_fpath, "rb") as fh:
        raw_bytes = fh.read()
    encoded = base64.b64encode(raw_bytes).decode("utf-8")
    return pil_image, encoded

    
def build_img_decription(img_data: dict) -> str:
    """Render an image's metadata as a three-line text description.

    Args:
        img_data: Mapping with ``caption``, ``imageText`` (an iterable of
            strings, joined with single spaces) and ``title`` keys.

    Returns:
        Caption, OCR text and title, each wrapped in backticks and prefixed
        with its label, separated by newlines.
    """
    caption_value = img_data["caption"]
    ocr_joined = " ".join(img_data["imageText"])
    title_value = img_data["title"]
    return (
        f"Caption: `{caption_value}`\n"
        f"Ocr text: `{ocr_joined}`\n"
        f"Title: `{title_value}`"
    )


def answer_openai(
    client: OpenAI,
    model: str,
    prompt: str,
    img_descr: str,
    base64_image=None,
    media_type: str = "image/png",
):
    """Ask a chat-completions model to answer a question about a diagram.

    Args:
        client: OpenAI-style client whose ``chat.completions.create`` is called.
        model: Model name passed to the API.
        prompt: Task prompt containing the question.
        img_descr: Text description of the image; skipped when empty.
        base64_image: Optional base64-encoded image; when given it is sent as a
            data URL between the prompt and the description.
        media_type: MIME type used in the data URL. Defaults to ``"image/png"``,
            the value that was previously hard-coded.

    Returns:
        The text of the first choice, or ``"no answer"`` when the response has
        no choices or the message content is ``None``.
    """
    content = [{"type": "text", "text": prompt}]
    if base64_image:
        content.append(
            {
                "type": "image_url",
                "image_url": {"url": f"data:{media_type};base64,{base64_image}"},
            }
        )
    if img_descr:
        content.append({"type": "text", "text": f"Image description: {img_descr}"})

    messages = []
    # The system prompt is not sent to models whose name starts with "qwen".
    # NOTE(review): presumably a limitation of that endpoint - confirm.
    if not model.startswith("qwen"):
        messages.append({"role": "system", "content": SYSTEM_PROMPT})
    messages.append({"role": "user", "content": content})

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=256,
        temperature=0.1,
    )

    # Same fallback as answer_anthropic: never return None or raise IndexError
    # on an empty response, so downstream result columns always hold strings.
    if not response.choices:
        return "no answer"
    answer = response.choices[0].message.content
    return answer if answer is not None else "no answer"


def answer_anthropic(
    prompt,
    img_descr,
    base64_image=None,
    model: str = "claude-3-5-sonnet-latest",
    media_type: str = "image/png",
):
    """Ask a Claude model to answer a question about a diagram.

    Uses the module-level ``anthropic_client`` and ``SYSTEM_PROMPT``.

    Args:
        prompt: Task prompt containing the question.
        img_descr: Text description of the image; skipped when empty.
        base64_image: Optional base64-encoded image; when given it is sent as
            an image block between the prompt and the description.
        model: Model name passed to the API. Defaults to the value that was
            previously hard-coded.
        media_type: MIME type declared for the image. Defaults to
            ``"image/png"``, the value that was previously hard-coded.

    Returns:
        The text of the first content block, or ``"no answer"`` when the
        response contains no content blocks.
    """
    content = [{"type": "text", "text": prompt}]
    if base64_image:
        content.append(
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": base64_image,
                },
            }
        )
    if img_descr:
        content.append({"type": "text", "text": f"Image description: {img_descr}"})

    response = anthropic_client.messages.create(
        model=model,
        system=SYSTEM_PROMPT,
        messages=[{"role": "user", "content": content}],
        max_tokens=256,
        temperature=0.1,
    )

    # An empty content list falls back to the sentinel used by the prompts.
    if not response.content:
        return "no answer"
    return response.content[0].text

In [8]:
# Directory holding the diagram images; file names are the keys of the JSON below.
base_img_path = Path("./data/images")
# Read with an explicit encoding: JSON text is UTF-8, while open() would otherwise
# use the platform's locale encoding and could mis-decode non-ASCII text.
with open("./data/03_img_json_with_questions.json", "r", encoding="utf-8") as f:
    data = json.load(f)
img_fnames = list(data.keys())

In [None]:
# Column-oriented results: every list gets exactly one entry per
# (image, question) pair, so all columns have the same length.
results = {
    "img_fname": [],
    "question": [],
    "gpt_text": [],
    "gpt_image": [],
    "qwen_text": [],
    "qwen_image": [],
    "claude_text": [],
    "claude_image": [],
    "answer": [],
}
for img_fname in tqdm(img_fnames):
    img_data = data[img_fname]
    img_descr = build_img_decription(img_data)
    qna_pairs = img_data["questions"]
    _, base64_image = load_image(base_img_path.joinpath(img_fname))

    for qna in qna_pairs:
        # Record the file name once per question (not once per image); otherwise
        # this column is shorter than the others whenever an image has more
        # than one question, and rows can no longer be aligned.
        results["img_fname"].append(img_fname)
        results["question"].append(qna["question"])
        results["answer"].append(qna["answer"])

        # Both prompt variants are built once and reused for all three models.
        text_prompt = TASK_PROMPT_TEXT_INPUT.format(qna["question"])
        image_prompt = TASK_PROMPT_TEXT_IMAGE_INPUT.format(qna["question"])

        # GPT-4o: description only, then description + image.
        results["gpt_text"].append(
            answer_openai(openai_client, "gpt-4o", text_prompt, img_descr)
        )
        results["gpt_image"].append(
            answer_openai(openai_client, "gpt-4o", image_prompt, img_descr, base64_image)
        )

        # Qwen-VL through the OpenAI-compatible client.
        results["qwen_text"].append(
            answer_openai(alibaba_client, "qwen-vl-max", text_prompt, img_descr)
        )
        results["qwen_image"].append(
            answer_openai(alibaba_client, "qwen-vl-max", image_prompt, img_descr, base64_image)
        )

        # Claude: description only, then description + image.
        results["claude_text"].append(answer_anthropic(text_prompt, img_descr))
        results["claude_image"].append(
            answer_anthropic(image_prompt, img_descr, base64_image)
        )

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Persist the collected answers as a single JSON document.
with open("./data/qna_results.json", "w") as out_file:
    out_file.write(json.dumps(results))