In [1]:
import json
gt_qa_path = "/data/aofei/hallucination/Slake/data/training_gt_only_bboxes_abd.json"
# Read the ground-truth QA records inside a context manager so the file handle
# is closed deterministically instead of being left to the garbage collector.
with open(gt_qa_path, 'r') as gt_file:
    gt = json.load(gt_file)
# Show the number of records and the first one as a quick sanity check.
len(gt), gt[0]

(327,
 {'image': 'xmlab1/source.jpg',
  'id': 3,
  'location': 'Abdomen',
  'conversations': [{'from': 'human',
    'value': '<image>\nDoes the picture contain liver?'},
   {'from': 'gpt', 'value': 'Yes'}],
  'bboxes': [[54.0, 106.0, 30.0, 31.0]]})

In [2]:
def pre_process_gpt(gt_qa):
    """Flatten raw QA records into simple question/answer dicts.

    Records carrying two or more bounding boxes are skipped. For every other
    record, the first conversation turn's ``value`` becomes ``question`` and
    the second turn's ``value`` becomes ``answers``; the bbox list, id and
    image path are carried over unchanged.

    Args:
        gt_qa: iterable of dicts with the keys ``id``, ``image``, ``bboxes``
            and ``conversations`` (a sequence of at least two turn dicts).

    Returns:
        A list of dicts with keys ``id``, ``question``, ``answers``, ``bbox``
        and ``image``, in input order.
    """
    return [
        {
            "id": record["id"],
            "question": record["conversations"][0]["value"],
            "answers": record["conversations"][1]["value"],
            # Note: this is the whole list of boxes, stored under the singular key.
            "bbox": record["bboxes"],
            "image": record["image"],
        }
        for record in gt_qa
        if len(record["bboxes"]) < 2
    ]

# Flatten the raw records, dropping those with two or more bounding boxes.
# NOTE: the variable name is misspelled ("prorcessed"), but later cells refer
# to it under this exact name, so it is kept as-is.
prorcessed_gt = pre_process_gpt(gt)

In [3]:
# Sanity check: number of retained records and the first flattened record.
len(prorcessed_gt), prorcessed_gt[0]

(272,
 {'id': 3,
  'question': '<image>\nDoes the picture contain liver?',
  'answers': 'Yes',
  'bbox': [[54.0, 106.0, 30.0, 31.0]],
  'image': 'xmlab1/source.jpg'})

In [4]:
from PIL import Image

def transform_and_normalize_bbox(bbox, image_path):
    """Convert an ``[x, y, width, height]`` box to normalized corner form.

    The image at ``image_path`` is opened only to read its pixel dimensions.
    The box's top-left corner and its bottom-right corner (top-left plus
    width/height) are each divided by the image width (x values) or image
    height (y values) and rounded to two decimals.

    Args:
        bbox: indexable with at least four numbers: top-left x, top-left y,
            width, height.
        image_path: path of the image the box refers to.

    Returns:
        A tuple ``(normalized_bbox, (image_width, image_height))`` where
        ``normalized_bbox`` is ``[x_min, y_min, x_max, y_max]``.
    """
    # Only the dimensions are needed; the file is closed on leaving the block.
    with Image.open(image_path) as img:
        image_width, image_height = img.size

    left, top, box_width, box_height = bbox[0], bbox[1], bbox[2], bbox[3]
    right = left + box_width
    bottom = top + box_height

    # Pair each coordinate with the image extent it is measured against.
    scaled_pairs = (
        (left, image_width),
        (top, image_height),
        (right, image_width),
        (bottom, image_height),
    )
    normalized_bbox = [round(value / extent, 2) for value, extent in scaled_pairs]
    return normalized_bbox, (image_width, image_height)


In [5]:
import os
# Directory that the per-record relative image paths are resolved against.
image_root = "/data/aofei/hallucination/Slake/imgs"
for item in prorcessed_gt:
    image_path = os.path.join(image_root, item["image"])
    # Each retained record is expected to hold its box as the first element of "bbox".
    normalized_bbox, image_size = transform_and_normalize_bbox(item["bbox"][0], image_path)
    # Attach the derived fields to the record in place.
    item.update(normalized_bbox=normalized_bbox, image_size=image_size)

prorcessed_gt[0]

{'id': 3,
 'question': '<image>\nDoes the picture contain liver?',
 'answers': 'Yes',
 'bbox': [[54.0, 106.0, 30.0, 31.0]],
 'image': 'xmlab1/source.jpg',
 'normalized_bbox': [0.21, 0.41, 0.33, 0.54],
 'image_size': (256, 256)}

In [6]:
import os
from getpass import getpass

from openai import OpenAI

# SECURITY: never write the API key into the notebook. The key is taken from the
# OPENAI_API_KEY environment variable; if it is not set, it is requested
# interactively without being echoed.
# NOTE(review): a literal key used to be hard-coded in this cell. Treat that key
# as leaked and revoke/rotate it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")
client = OpenAI()

from tqdm import tqdm

def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one."""
    text = text.strip()
    if not text.startswith("```"):
        return text
    # Drop the opening fence line, which may carry a language tag (e.g. "```json").
    newline_at = text.find("\n")
    text = text[newline_at + 1:] if newline_at != -1 else text[3:]
    text = text.rstrip()
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def build_vqa_with_gpt4o_in_batches(vqas, batch_size=5):
    """Ask the chat model to rewrite QA items as grounded answers, batch by batch.

    For every slice of ``batch_size`` items, the instruction prompt plus the
    ``str()`` of the slice is sent through the module-level ``client``. The
    reply is expected to be a JSON list (per the prompt, one
    ``{"id", "new_answer"}`` object per item; the element structure is not
    validated here).

    A reply wrapped in a Markdown code fence is unwrapped before parsing. A
    reply that is empty, is not valid JSON, or is not a JSON list is printed
    and that batch is skipped, so the result may contain fewer entries than
    ``vqas``.

    Args:
        vqas: sequence of QA item dicts to rewrite.
        batch_size: number of items sent per request.

    Returns:
        A flat list with the parsed elements of all successfully parsed replies,
        in request order.
    """
    # The instruction header is the same for every batch, so build it once.
    prompt_header = """
        You are helping me generate a dataset of visual question-answer pairs with bounding box information. Each item should have an answer in a visual grounding style, referring to the specific location of the region within the image.

        For each data item in the format {'id': <ID>, 'question': <QUESTION>, 'answers': <ANSWER>, 'bbox': <BBOX>, 'image': <IMAGE_PATH>, 'normalized_bbox': <NORMALIZED_BBOX>, 'image_size': <IMAGE_SIZE>}, please provide the answer in the following style:

        Describe the location of the object or region using the bounding box coordinates.
        Give an assessment of the object based on the answer provided, if applicable.

        Example item in the Input list:
        {
        'id': 3,
        'question': "<image>\nDoes the liver look normal?",
        'answers': "Yes",
        'bbox': [[54.0, 106.0, 30.0, 31.0]],
        'image': "xmlab1/source.jpg",
        'normalized_bbox': [0.21, 0.41, 0.33, 0.54],
        'image_size': (256, 256)
        }
        Expected new answer:
        {
        "id": 3,
        "new_answer": "The liver is located at bounding box coordinate [0.21, 0.41, 0.33, 0.54], and it shows no abnormality."
        }

        Notice that you should output a list of answers that can be directly parsed by json, with the same order of the given QA list, no extra explanations or extra strings, for each question-answer pair in the batch.
        Here are the old QA pairs that you need to process:
        """

    new_answers = []

    # Walk over the QA items one batch at a time.
    for i in tqdm(range(0, len(vqas), batch_size)):
        batch_questions = vqas[i:i+batch_size]
        full_prompt = prompt_header + str(batch_questions) + "your output of a list of answers, no extra strings:"

        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an assistant for synthesizing new VQAs."},
                {
                    "role": "user",
                    "content": full_prompt
                }
            ]
        )

        # The message content can be None; treat that like an empty reply
        # so it is reported below instead of raising on .strip().
        vqa_text = (completion.choices[0].message.content or "").strip()

        # Parse the reply as JSON after removing an optional code fence.
        try:
            batch_answers = json.loads(_strip_code_fence(vqa_text))
        except ValueError:
            print(f"Unexpected response format: {vqa_text}")
            continue

        # Extending with a non-list (e.g. a dict) would silently add its keys.
        if not isinstance(batch_answers, list):
            print(f"Unexpected response format: {vqa_text}")
            continue

        new_answers.extend(batch_answers)

    return new_answers

In [7]:
import random
# Draw the subset with a dedicated, seeded generator so that re-running the
# notebook selects the same items (and the saved dataset is reproducible),
# without touching the global `random` state.
SAMPLE_SEED = 0
SAMPLE_SIZE = 100
# Cap the request at the population size; random.sample raises ValueError otherwise.
sampled_gt = random.Random(SAMPLE_SEED).sample(
    prorcessed_gt, min(SAMPLE_SIZE, len(prorcessed_gt))
)
new_answers = build_vqa_with_gpt4o_in_batches(sampled_gt, batch_size=5)

  0%|          | 0/20 [00:00<?, ?it/s]

100%|██████████| 20/20 [01:27<00:00,  4.35s/it]


In [10]:
# Spot-check a slice of the generated answers.
new_answers[50:60]

[{'id': 1705,
  'new_answer': 'The liver is located at bounding box coordinate [0.24, 0.33, 0.55, 0.63], and it is present in the image.'},
 {'id': 4053,
  'new_answer': 'The spinal cord is located at bounding box coordinate [0.46, 0.56, 0.49, 0.58], and it is present in the image.'},
 {'id': 852,
  'new_answer': 'The liver is located at bounding box coordinate [0.11, 0.28, 0.25, 0.63], and it is situated on the left side of the image.'},
 {'id': 961,
  'new_answer': 'The liver is located at bounding box coordinate [0.12, 0.13, 0.58, 0.6], and it is present in the image.'},
 {'id': 4696,
  'new_answer': 'The liver is located at bounding box coordinate [0.15, 0.37, 0.32, 0.59], and it is present in the image.'},
 {'id': 4220,
  'new_answer': 'The colon is located at bounding box coordinate [0.4, 0.29, 0.75, 0.46], and it is dark gray in color.'},
 {'id': 3928,
  'new_answer': 'The liver is located at bounding box coordinate [0.21, 0.29, 0.52, 0.61], and it is present in the image.'},
 {

In [11]:
# Inspect the first sampled input item, for comparison with the generated answers.
sampled_gt[0]

{'id': 855,
 'question': '<image>\nWhich is bigger in this image, kidney or liver?',
 'answers': 'Liver',
 'bbox': [[57.0, 142.0, 71.0, 182.0]],
 'image': 'xmlab214/source.jpg',
 'normalized_bbox': [0.11, 0.28, 0.25, 0.63],
 'image_size': (512, 512)}

In [14]:
# Index shallow copies of the sampled items by id. Copying keeps the rewrite of
# "answers" from mutating the dicts that `sampled_gt` shares with `prorcessed_gt`
# (random.sample returns the same objects), so re-sampling later still sees the
# original short answers.
dict_sampled_gt = {item["id"]: dict(item) for item in sampled_gt}

# The generated answers come from a model, so an entry may be malformed or
# carry an id that was never sent; collect those instead of raising KeyError.
unmatched_answers = []
for item in new_answers:
    is_usable = isinstance(item, dict) and "id" in item and "new_answer" in item
    target = dict_sampled_gt.get(item["id"]) if is_usable else None
    if target is None:
        unmatched_answers.append(item)
        continue
    target["answers"] = item["new_answer"]

if unmatched_answers:
    print(f"Skipped {len(unmatched_answers)} generated answers with an unknown id or missing fields: {unmatched_answers}")

In [15]:
# Turn the id-indexed items back into a list (dict insertion order, i.e. the
# order in which ids first appeared in sampled_gt).
new_sampled_gt = list(dict_sampled_gt.values())

In [19]:
# Spot-check one merged record.
new_sampled_gt[1]

{'id': 1746,
 'question': '<image>\nDoes the picture contain liver?',
 'answers': 'The liver is located at bounding box coordinate [0.14, 0.32, 0.4, 0.65], and it is present in the image.',
 'bbox': [[71.0, 164.0, 136.0, 169.0]],
 'image': 'xmlab290/source.jpg',
 'normalized_bbox': [0.14, 0.32, 0.4, 0.65],
 'image_size': (512, 512),
 'new_answer': 'The liver is located at bounding box coordinate [0.14, 0.32, 0.4, 0.65], and it is present in the image.'}

In [22]:
# Convert each merged item into a conversation-style training record: image path,
# id, and one human turn (the question) followed by one gpt turn (the answer,
# coerced to str). Bounding-box fields are not carried over into these records.
new_train_data = [
    {
        "image": sample["image"],
        "id": sample["id"],
        "conversations": [
            {"from": "human", "value": sample["question"]},
            {"from": "gpt", "value": str(sample["answers"])},
        ],
    }
    for sample in new_sampled_gt
]

In [23]:
# Persist the training records as pretty-printed JSON.
sampled_gt_output_path = "/data/aofei/hallucination/Slake/data/probing/sampled_gt.json"
with open(sampled_gt_output_path, 'w') as out_file:
    json.dump(new_train_data, out_file, indent=4)