In [None]:
"""
@Description :   The 4th step of main.py, where LLM & VLM & Detectron are implemented
@Author      :   Yan Ding 
@Time        :   2023/09/03 21:54:15
"""

In [None]:
# install dependencies
!pip install openai==0.27.0
!pip install inflect
!pip install salesforce-lavis
!pip install Pillow
!pip install accelerate
!pip install bitsandbytes
!python -m pip install pyyaml==5.1
!pip install transformers==4.33.2
!python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'

import torch
import os
import math
import numpy as np
import inflect
import sys
import openai
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import requests
import json, cv2, random
import distutils.core
from detectron2.utils.logger import setup_logger
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog
from datetime import datetime

# Step 4.0.1: Load VLM models

In [None]:
CONFIG = {
    "api_key": "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",
    "gpt_model": "text-davinci-003",
    "model_name": "Salesforce/instructblip-flan-t5-xxl"
}
# /root/.cache/huggingface/hub/models--Salesforce--blip2-flan-t5-xxl
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
processor = InstructBlipProcessor.from_pretrained(CONFIG["model_name"])
model = InstructBlipForConditionalGeneration.from_pretrained(CONFIG["model_name"], torch_dtype=torch.float16).to(device)

# Step 4.0.2: Load object detection models

In [None]:
model_name = "LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x.yaml"
cfg = get_cfg()
cfg.merge_from_file(model_zoo.get_config_file(model_name))
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.3  # set threshold
cfg.MODEL.WEIGHTS = model_zoo.get_checkpoint_url(model_name)
predictor = DefaultPredictor(cfg)

# Step 4.1: Configuration and assumptions

In [75]:
openai.api_key = CONFIG["api_key"]

# We have an assumption that we know object categories, and exact object name
object_groups = {
    'fruit_names': ["banana", "apple", "orange", "avocado", "lemon"],
    'food_names': [], # TODO: "potato_chip", "pastry", "pie"
    'beverage_names': [], # TODO: 'beer'
    'utensil_names': ['knife', 'fork', 'spoon'],
    'container_names': ['bowl', 'cup', 'plate'], # TODO: 'mug'
    'table_name': ["counter", "table"],
    'cabinet_name': ["cabinet"],
    'appliance_names': ['fridge', 'microwave', 'toaster'] # TODO: 'faucet'
}
object_names = sum([object_groups[key] for key in object_groups], [])
locations = container_names + table_name + cabinet_name + appliance_names # location type
predicates = {
    'fruit_names': ['on', 'in'],
    'food_names': ['on', 'in'],
    'beverage_names': ['on', 'in'],
    'utensil_names': ['on', 'in'],
    'container_names': ['on', 'in'], # TODO: empty
    'table_name': [],
    'cabinet_name': ['open'],
    'appliance_names': ['open']
}

all_predicate_based_prompts = [] # predefined all prompts based on PDDL
for obj_group, obj_list in object_groups.items():
    for obj in obj_list:
        for loc in locations:
            if obj != loc:  # Prevent combinations like 'cup in cup'
                if 'on' in predicates[obj_group] and loc in table_name: # remove 'in table'
                    all_predicate_based_prompts.append([f"is {obj} on {loc} in this image?", f"{obj}", f"{loc}"])
                if 'in' in predicates[obj_group] and loc not in table_name and loc not in obj_list:  # remove 'in counter'
                    all_predicate_based_prompts.append([f"is {obj} in {loc} in this image?", f"{obj}", f"{loc}"])
        if 'empty' in predicates[obj_group]:
            all_predicate_based_prompts.append([f"is {obj} empty in this image?", f"{obj}"])
        if 'open' in predicates[obj_group]:
            all_predicate_based_prompts.append([f"is {obj} open in this image?", f"{obj}"])

# Create a synonym or mapping dictionary.
synonyms_dict = {
    "refrigerator freezer": "fridge",
    "refrigerator": "fridge",
    "freezer": "fridge",
    "microwave oven": "microwave",
    "orange_(fruit)": "orange",
    "apple_(fruit)": "apple",
    "banana_(fruit)": "banana",
}

# Step 4.2: Utils functions

In [79]:
def find_image_path(image_name, case_id):
    """
    Helper function to find the image path for images"
    """
    for i in range(1, 100):
        for ext in ['.jpg', '.png']:
            temp = f"{image_name}_{i}"
            image_path = f'/workspace/input/case{case_id}/{temp}{ext}'
            if os.path.exists(image_path):
                # print(f'!debug exist image_path:{image_path}')
                return image_path
    return None


def vlm_instructblip(image_path, prompt, enable_display=False):
    """
    This function queries a Vision-Language Model (VLM) for a given image name and prompt.
    This function deos not need need history.
    
    Args:
        image_name: Name of the image file without extension. The function will attempt to find both .jpg and .png extensions.
        prompt: The text prompt/question that you want to ask the VLM about the image.
    """
    raw_image = Image.open(image_path).convert('RGB')
    if enable_display:
        display(raw_image.resize((300, 200)))
    inputs = processor(raw_image, prompt, return_tensors="pt").to(device, torch.float16)
    outputs = model.generate(**inputs)
    # outputs = model.generate(
    #     **inputs,
    #     do_sample=True,
    #     max_length=512,
    #     min_length=1,
    #     top_p=1.0,
    #     repetition_penalty=1,
    #     length_penalty=1.0,
    #     temperature=0.75,
    # )
    response = processor.decode(outputs[0], skip_special_tokens=True)
    
    return response
    

def convert_prompt_to_format(initial_state):
    """
    Convert the initial state prompts into a structured format based on their content.
    For example, a prompt like 'is spoon on counter?' --> ['on', 'spoon', 'counter']
    
    Args:
        initial_state (dict): A dictionary where the keys are camera identifiers (or similar) and the values 
                            are lists of prompts related to that camera.
    """
    def parse_prompt(prompt):
        words = prompt.split()
        # Check for 'on' or 'in' predicate
        if ' not open ' in prompt:
            return ['not open', words[words.index('not') - 1]]
        elif ' open ' in prompt:
            return ['open', words[1]]
        elif ' on ' in prompt:
            return ['on', words[1], words[3].replace('?', '')]
        elif ' in ' in prompt:
            return ['in', words[1], words[3].replace('?', '')]
        else:
            # If the prompt doesn't match any of the known formats, return the original prompt as a single string
            return [prompt]

    formatted_state = {}
    for camera, prompts in initial_state.items():
        formatted_prompts = [parse_prompt(prompt) for prompt in prompts]
        formatted_state[camera] = formatted_prompts

    return formatted_state


def llm(prompt):
    sampling_params = {"n": 1,  # sampling number
                      "max_tokens": 32,
                      "temperature": 0,
                      "top_p": 1,
                      "logprobs": 1,
                      "presence_penalty": 0,
                      "frequency_penalty": 0,
                      "stop": ['\\n', '.']}
    raw_response = openai.Completion.create(engine=CONFIG["gpt_model"], prompt=prompt, **sampling_params)
    responses = [raw_response['choices'][i]['text'] for i in range(sampling_params['n'])]
    mean_probs = [math.exp(np.mean(raw_response['choices'][i]['logprobs']['token_logprobs'])) for i in range(sampling_params['n'])]
    responses = [sample.strip().lower() for sample in responses]
    return responses, mean_probs

    
def query_llm_for_objects(response):
    task = "The task is to extract object names from the sentence, excluding adjectives."
    example1 = "Input: There is a red apple and a green bowl. Output: apple, bowl"
    example2 = "Input: There is two ovens on the counter. Output: oven, counter"
    question = f"Input: {response}"
    prompt = f'{task} \n {example1} \n {example2} \n {question}.'
    response, _ = llm(prompt)

    print(f'!debug: raw response from LLm is {response}')
          
    items = response[0].split(':')[-1]
    items_list = [item.strip() for item in items.split(',') if item.strip()]
    normalized_items = [synonyms_dict.get(item, item) for item in items_list]

    # print(f'!debug: normalized_items is {normalized_items}')

    p = inflect.engine()
    def to_singular(item):
        return p.singular_noun(item) or item
    
    return [to_singular(item) for item in normalized_items if any(to_singular(item) in obj_name for obj_name in object_names)]


def get_objects(response_set):
    items_list = [item.strip() for item in response_set]
    normalized_items = [synonyms_dict.get(item, item) for item in items_list]

    p = inflect.engine()

    def to_singular(item):
        return p.singular_noun(item) or item
    
    return [to_singular(item) for item in normalized_items if any(to_singular(item) in obj_name for obj_name in object_names)]


def select_prompts(objects):
    return [
        sentence[0] 
        for sentence in all_predicate_based_prompts 
        if (len(sentence) == 3 and sentence[1] in objects and sentence[2] in objects) or
           (len(sentence) != 3 and sentence[1] in objects)
    ]

    
def detect_object(image_path, enable_vlm=False, enable_display=False):
    if not enable_vlm:
        raw_image = cv2.imread(image_path)
        # Get the dataset's metadata to map class IDs to class names
        metadata = MetadataCatalog.get(cfg.DATASETS.TRAIN[0])
        class_names = metadata.get("thing_classes", None)
        outputs = predictor(raw_image)
        v = Visualizer(raw_image[:, :, ::-1], MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), scale=1.2)
        output = v.draw_instance_predictions(outputs["instances"].to("cpu"))
        instances = outputs["instances"].to("cpu")
        boxes = instances.pred_boxes.tensor.numpy()
        class_ids = instances.pred_classes.numpy()
        if enable_display:
            pillow_image = Image.fromarray(cv2.cvtColor(output.get_image(), cv2.COLOR_BGR2RGB))
            pillow_image.show()
        unique_class_names = set()
        for class_id in class_ids:
            class_name = class_names[class_id] if class_names else str(class_id)
            unique_class_names.add(class_name)
        return unique_class_names
    else:
        unique_class_names = set()
        for obj_name in object_names:
            prompt = f'is there {obj_name} in this image?'
            raw_image = Image.open(image_path).convert('RGB')
            if enable_display:
                display(raw_image.resize((300, 200)))
            inputs = processor(raw_image, prompt, return_tensors="pt").to(device, torch.float16)
            outputs = model.generate(**inputs)
            # outputs = model.generate(
            #     **inputs,
            #     do_sample=True,
            #     max_length=512,
            #     min_length=1,
            #     top_p=1.0,
            #     repetition_penalty=1,
            #     length_penalty=1.0,
            #     temperature=1.0,
            # )
            response = processor.decode(outputs[0], skip_special_tokens=True)
            # print(f'prompt:{prompt} response:{response}')
            
            if 'yes' in response:
                unique_class_names.add(obj_name)
        return unique_class_names
        


def insert_not_before_open(sentence):
    open_index = sentence.find("open")
    if open_index != -1:
        return sentence[:open_index] + "not " + sentence[open_index:]
    else:
        return sentence

# Repeatly run scripts

In [85]:
for case_id in range(1, 53):
    
    # Step 4.3.1: Do object detection using Detectron2
    raw_responses = {}  # store responses from detectron
    for image_name in ["view_2_object_on_table", "view_3_microwave", "view_4_cabinet", "view_5_fridge"]:
        image_path = find_image_path(image_name, case_id)
        if image_path is not None:
            response = detect_object(image_path, enable_vlm=False, enable_display=False)
            raw_responses[image_name] = response
        else:
            continue
    print(f'Object names: {raw_responses}')

    # Step 4.3.2: Do object detection using VLM
    # raw_responses = {}  # store responses from VLM
    # for image_name in ["view_2_object_on_table", "view_3_microwave", "view_4_cabinet", "view_5_fridge"]:
    #     image_path = find_image_path(image_name, case_id)
    #     if image_path is not None:
    #         response = detect_object(image_path, enable_vlm=True, enable_display=False)
    #         raw_responses[image_name] = response
    #     else:
    #         continue
    # print(f'Object names: {raw_responses}')
    
    # Step 4.4.1: Only store known objects
    extracted_objects = {}
    for image_name, response in raw_responses.items():
        extracted_objects[image_name] = get_objects(response)
        print('Objects in image {}: {}'.format(image_name, extracted_objects[image_name]))
    
    # TODO: post-process image using reasonable methods
    # step 2: if view_2_object_on_table does not have table, add it
    if 'table' not in extracted_objects["view_2_object_on_table"]:
        extracted_objects["view_2_object_on_table"].append('table')
        print('step 2')
    # step 3: if view_3_microwave does not have microwave, add it
    if 'microwave' not in extracted_objects["view_3_microwave"]:
        extracted_objects["view_3_microwave"].append('microwave')
        print('step 3.1')
    if 'cabinet' in extracted_objects["view_3_microwave"]:
        extracted_objects["view_3_microwave"].remove('cabinet')
        print('step 3.2')
    if 'fridge' in extracted_objects["view_3_microwave"]:
        extracted_objects["view_3_microwave"].remove('fridge')
        print('step 3.3')
    # step 4: if view_4_cabinet does not have cabinet, add it
    if 'cabinet' not in extracted_objects["view_4_cabinet"]:
        extracted_objects["view_4_cabinet"].append('cabinet')
        print('step 4.1')
    if 'microwave' in extracted_objects["view_4_cabinet"]:
        extracted_objects["view_4_cabinet"].remove('microwave')
        print('step 4.2')
    if 'fridge' in extracted_objects["view_4_cabinet"]:
        extracted_objects["view_4_cabinet"].remove('fridge')
        print('step 4.3')
    # step 5: if view_5_fridge does not have fridge, add it
    if 'fridge' not in extracted_objects["view_5_fridge"]:
        extracted_objects["view_5_fridge"].append('fridge')
        print('step 5.1')
    if 'microwave' in extracted_objects["view_5_fridge"]:
        extracted_objects["view_5_fridge"].remove('microwave')
        print('step 5.2')
    if 'cabinet' in extracted_objects["view_5_fridge"]:
        extracted_objects["view_5_fridge"].remove('cabinet')
        print('step 5.3')
        
    # Step 4.4.2: Verify the detectron's result in VLM
    for image_name, objects in extracted_objects.items():
        image_path = find_image_path(image_name, case_id)
        if image_path is not None:
            for object_name in objects:
                prompt = f'is there a {object_name}?' 
                response = vlm_instructblip(image_path, prompt)
                # print(f'prompt:{prompt}, and its response:{response}')
                if 'no' in response: # if VLM does not identify the object, remove it
                    print(f'remove {object_name} from image {image_name}')
                    extracted_objects[image_name].remove(object_name)
                else:
                    continue

    # Step 4.4.3: Select prompt according to existing objects
    selected_predicate_based_prompts = {}
    for image_name, objects in extracted_objects.items():
        matched_prompts = []
        for sentence in all_predicate_based_prompts:
            object1 = sentence[1]
            object2 = sentence[2] if len(sentence) == 3 else None
            if object1 in objects and (not object2 or object2 in objects):
                matched_prompts.append(sentence[0])
        selected_predicate_based_prompts[image_name] = matched_prompts
        
    for image_name, predicate_based_prompts in selected_predicate_based_prompts.items():
        print(f'For {image_name}, predicate_based_prompts: {predicate_based_prompts}')
    
    # Step 4.5.1: Continue to ask VLM using the selected predicate-based prompts 
    initial_state = {}
    for image_name in ["view_2_object_on_table", "view_3_microwave", "view_4_cabinet", "view_5_fridge"]:
        image_path = find_image_path(image_name, case_id)
        if image_path is not None:
            for index_prompt in range(len(selected_predicate_based_prompts[image_name])):
                prompt = selected_predicate_based_prompts[image_name][index_prompt]    
                response = vlm_instructblip(image_path, prompt)
                print(f'image_name:{image_name} prompt:{prompt} response:{response}')
    
                if 'yes' in response.lower():
                    if image_name not in initial_state:
                        initial_state[image_name] = []
                    initial_state[image_name].append(prompt)
                else:
                    if 'open' in prompt: 
                        if image_name not in initial_state:
                            initial_state[image_name] = []
                        prompt = insert_not_before_open(prompt)
                        initial_state[image_name].append(prompt)
        else:
            continue
    
    formatted_state = convert_prompt_to_format(initial_state)
    print(f'\ninitial_state={formatted_state}')
    
    # Step 4.5.2: Save result
    all_lists = sum(formatted_state.values(), [])
    unique_lists = []
    for lst in all_lists:
        if lst not in unique_lists:
            unique_lists.append(lst)
    if not os.path.exists("/workspace/output"):
        os.makedirs("/workspace/output")
    with open(f'/workspace/output/case{case_id}_init_state.txt', 'w') as file:
        file.write(f'initial_state={formatted_state}' + '\n')
        file.write('PDDL: ' + str(unique_lists))

Object names: {'view_2_object_on_table': {'cup', 'table'}, 'view_3_microwave': {'faucet', 'handle', 'knob', 'toaster', 'clock', 'paper_towel', 'television_set', 'microwave_oven', 'kitchen_sink', 'glass_(drink_container)'}, 'view_4_cabinet': {'handle', 'knob', 'cabinet', 'tape_(sticky_cloth_or_paper)', 'microwave_oven', 'doorknob', 'mirror'}, 'view_5_fridge': {'refrigerator', 'vent', 'handle'}}
Objects in image view_2_object_on_table: ['cup', 'table']
Objects in image view_3_microwave: ['toaster']
Objects in image view_4_cabinet: ['cabinet']
Objects in image view_5_fridge: ['fridge']
step 3.1




For view_2_object_on_table, predicate_based_prompts: ['is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'cup', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['not open', 'cabinet']], 'view_5_fridge': [['not open', 'fridge']]}
Object names:



For view_2_object_on_table, predicate_based_prompts: ['is banana on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'banana', 'table']], 'view_3_microwave': [['open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names:



For view_2_object_on_table, predicate_based_prompts: ['is lemon on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is apple in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'lemon', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['n



For view_2_object_on_table, predicate_based_prompts: ['is apple on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'apple', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['



For view_2_object_on_table, predicate_based_prompts: ['is avocado on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is avocado on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'avocado', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['not open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Obje



For view_2_object_on_table, predicate_based_prompts: ['is orange on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is apple in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is orange on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is apple in cabinet in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:ye



For view_2_object_on_table, predicate_based_prompts: []
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is orange in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is orange in cabinet in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_3_microwave': [['open', 'microwave'], ['open', 'toaster']], 'view_4_cabinet': [['in', 'orange', 'cabinet'], ['open', 'cabinet']], 'view_5_fridge': [['not open', 'fridge']]}
Object names: {'view_2_object_on_table': 



For view_2_object_on_table, predicate_based_prompts: []
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_3_microwave': [['open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names: {'view_2_object_on_table': {'cup', 'table'}, 'view_3_microwave': {'faucet', 'handle', 'knob', 'toaster', 'clock', 'paper_towel', 'television_set', 'microwave_oven', 'kitchen_sin



For view_2_object_on_table, predicate_based_prompts: ['is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'cup', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['not open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names: {'



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table']], 'view_3_microwave': [['open', 'microwave'], ['open'



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge'



For view_2_object_on_table, predicate_based_prompts: ['is spoon on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is spoon on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'spoon', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['not open', 'cabinet']], 'view_5_fridge': [['not open', 'fridge']]}
Object 



For view_2_object_on_table, predicate_based_prompts: ['is fork on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is fork on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'fork', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names: {'



remove fork from image view_2_object_on_table
For view_2_object_on_table, predicate_based_prompts: []
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is apple in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_3_microwave': [['open', 'microwave'], ['open', 'toaster']], 'view_4_cabinet': [['not open', 'cabinet']], 'view_5_fridge': [['in', 'apple', 'fridge'], ['open', 'fridge']]}
Ob



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?', 'is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table'], ['on', 'cup', 'table']], 'view_3_microwave': [['not op



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?', 'is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table'], ['on', 'cup', 'table']], 'view_3_microwave': [['not 



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is apple in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table']], 'view_3_microwave': [['open', 'microwave'], ['open', 



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table']], 'view_3_microwave': [['open', 'microwave'], ['not o



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?', 'is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table'], ['on', 'cup', 'table']], 'view_3_microwave': [['not op



For view_2_object_on_table, predicate_based_prompts: ['is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'cup', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names: {'vie



For view_2_object_on_table, predicate_based_prompts: ['is spoon in bowl in this image?', 'is spoon on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is spoon in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is spoon on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:



For view_2_object_on_table, predicate_based_prompts: ['is spoon on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is spoon on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'spoon', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names:



For view_2_object_on_table, predicate_based_prompts: ['is fork on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is fork on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'fork', 'table']], 'view_3_microwave': [['open', 'microwave'], ['open'



For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table']], 'view_3_microwave': [['open', 'microwave'], ['not o



For view_2_object_on_table, predicate_based_prompts: ['is fork in bowl in this image?', 'is fork on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is fork in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is fork on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:yes




remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is spoon in cup in this image?', 'is spoon on table in this image?', 'is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is spoon in cup in this image? response:yes
image_name:view_2_object_on_table prompt:is spoon on table in this image? response:yes
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:



For view_2_object_on_table, predicate_based_prompts: ['is fork in cup in this image?', 'is fork on table in this image?', 'is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is fork in cup in this image? response:yes
image_name:view_2_object_on_table prompt:is fork on table in this image? response:yes
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is apple



For view_2_object_on_table, predicate_based_prompts: ['is cup on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is orange in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is cup on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is orange in cabinet in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes





For view_2_object_on_table, predicate_based_prompts: ['is banana on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'banana', 'table']], 'view_3_microwave': [['not open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['open', 'cabinet']], 'view_5_fridge': [['not open', 'fridge']]}
Object 



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is apple on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is orange in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is orange in cabinet in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'apple', 'table']], 'vie



For view_2_object_on_table, predicate_based_prompts: ['is lemon on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'lemon', 'table']], 'view_3_microwave': [['open', 'microwave'], ['open', 'toaster']], 'view_4_cabinet': [['not open', 'cabinet']], 'view_5_fridge': [['open', 'fridge']]}
Object names: {'



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is orange on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is orange on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'orange', 'table']], 'vie



For view_2_object_on_table, predicate_based_prompts: ['is avocado on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is apple in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is avocado on table in this image? response:no
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is apple in cabinet in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_3_microwave': [['open', 'microwave'], ['not open', 'toaster']], 'view_4_cabinet': [['in', 'ap



For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple in plate in this image?', 'is apple on table in this image?', 'is bowl on table in this image?', 'is plate on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple in plate in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_2_object_on_table prompt:is plate on table in this image? response:yes
image_name:view_3_mic



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is lemon on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is banana in fridge in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:yes

initial_state={'view_2_object_on_table': [['on', 'lemon', 'table']], 'view_3_



For view_2_object_on_table, predicate_based_prompts: ['is avocado in bowl in this image?', 'is avocado on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is banana in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is avocado in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is avocado on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is banana in cabinet in this 



remove lemon from image view_2_object_on_table
For view_2_object_on_table, predicate_based_prompts: ['is orange in bowl in this image?', 'is orange on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is orange in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is orange on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_



For view_2_object_on_table, predicate_based_prompts: ['is banana in bowl in this image?', 'is banana in plate in this image?', 'is banana on table in this image?', 'is bowl on table in this image?', 'is plate on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is banana in plate in this image? response:yes
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_2_object_on_table prompt:is plate on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open 



remove lemon from image view_2_object_on_table
For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is orange in bowl in this image?', 'is orange on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is orange in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is orange on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in thi



remove avocado from image view_2_object_on_table
For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fr



For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is lemon in bowl in this image?', 'is lemon on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is banana in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_micr



remove apple from image view_2_object_on_table
remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is banana on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'banana', 'table']], 'view_3_microwave': [['open', 'microwave'], ['not open', 'toaster']], 'view_



For view_2_object_on_table, predicate_based_prompts: ['is banana in bowl in this image?', 'is banana on table in this image?', 'is apple in bowl in this image?', 'is apple on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in t



remove lemon from image view_2_object_on_table
remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is orange in bowl in this image?', 'is orange on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is orange in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is orange on table in this image? response:yes
image_name:view_2



remove avocado from image view_2_object_on_table
For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:yes
image_name:view_3_microwave prompt:is toaster open in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:no
image_name:view_5_fr



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is apple in bowl in this image?', 'is apple on table in this image?', 'is lemon in bowl in this image?', 'is lemon on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is orange in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is banana in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is apple in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is apple on table in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_2_object_on_t



For view_2_object_on_table, predicate_based_prompts: ['is banana in bowl in this image?', 'is banana on table in this image?', 'is lemon in bowl in this image?', 'is lemon on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in t



remove avocado from image view_4_cabinet
For view_2_object_on_table, predicate_based_prompts: ['is banana in bowl in this image?', 'is banana on table in this image?', 'is orange in bowl in this image?', 'is orange on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is lemon in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_2_object_on_table prompt:is orange in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is orange on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in



For view_2_object_on_table, predicate_based_prompts: ['is banana in bowl in this image?', 'is banana on table in this image?', 'is avocado in bowl in this image?', 'is avocado on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is apple in fridge in this image?', 'is fridge open in this image?']
image_name:view_2_object_on_table prompt:is banana in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is banana on table in this image? response:yes
image_name:view_2_object_on_table prompt:is avocado in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is avocado on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:v



remove avocado from image view_2_object_on_table
For view_2_object_on_table, predicate_based_prompts: ['is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is banana in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:view_3_microwave prompt:is microwave open in this image? response:no
image_name:view_3_microwave prompt:is toaster open in this image? response:no
image_name:view_4_cabinet prompt:is banana in cabinet in this image? response:yes
image_name:view_4_cabinet prompt:is cabinet open in this image? response:yes
image_name:view_5_fridge prompt:is fridge open in this image? response:no

initial_state={'view_2_object_on_table': [['on', 'bowl', 'table']], '



For view_2_object_on_table, predicate_based_prompts: ['is avocado in bowl in this image?', 'is avocado on table in this image?', 'is lemon in bowl in this image?', 'is lemon on table in this image?', 'is bowl on table in this image?']
For view_3_microwave, predicate_based_prompts: ['is microwave open in this image?', 'is toaster open in this image?']
For view_4_cabinet, predicate_based_prompts: ['is banana in cabinet in this image?', 'is cabinet open in this image?']
For view_5_fridge, predicate_based_prompts: ['is fridge open in this image?']
image_name:view_2_object_on_table prompt:is avocado in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is avocado on table in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon in bowl in this image? response:yes
image_name:view_2_object_on_table prompt:is lemon on table in this image? response:yes
image_name:view_2_object_on_table prompt:is bowl on table in this image? response:yes
image_name:vie

# Test Module

In [83]:
image_name = 'view_3_microwave'
image_path = find_image_path(image_name, case_id)
print(image_path)
prompt = 'the item with a metallic luster is microwave or toaster?'
response = vlm_instructblip(image_path, prompt)
print(f'response:{response}')

/workspace/input/case20/view_3_microwave_2.jpg
response:microwave
