In [2]:
from bddl.knowledge_base import *

Loading BDDL knowledge base... This may take a few seconds.
[nltk_data] Downloading package wordnet to /home/cgokmen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import string

# Instruction text sent as the "system" message of every request. It asks for a
# JSON dictionary mapping each listed object to a mass in kilograms and fixes the
# naming conventions: x_bottle / x_box are empty containers, bottle_of_x / box_of_x
# are full, and diced__x refers to a single piece.
raw_prompt = """
You are a helpful assistant that generates average mass annotations for objects in a dataset of 3d models.
Give me the estimated average mass of each of the below listed objects in JSON format. The output should be a JSON
dictionary where the key is the object and the value is the mass in kilograms. Output only the JSON - no additional conversation.
When you have objects that look like x_box or x_bottle (e.g. wine_bottle, rice_bag), assume that those are empty containers.
Objects like bottle_of_x or box_of_x (e.g. bottle_of_wine, bag_of_rice) should be assumed to be full. For things like diced__apple,
respond with the mass of a single piece of diced apple. Suppose that "atomizer" usually means a spray bottle.
"""

# Characters accepted by valid_name() in strict mode: lowercase ASCII letters and space.
valid_chars = set(string.ascii_lowercase) | {" "}
def valid_name(s, strict=False):
    """Report whether ``s`` is an acceptable object name.

    The character check used to sit behind an unconditional ``return True``,
    which made it unreachable. The permissive behaviour is kept as the default
    so existing callers see no change; the check is now reachable on request
    instead of being dead code.

    Args:
        s: Candidate name (any iterable of single characters).
        strict: When true, accept ``s`` only if every character is a lowercase
            ASCII letter or a space. Defaults to False, which accepts anything.

    Returns:
        bool: True when the name is accepted.
    """
    if not strict:
        return True
    return all(c in valid_chars for c in s)

In [4]:
from getpass import getpass
# Read the API key interactively (getpass does not echo the input) instead of
# hard-coding it in the notebook.
OPENAI_API_TOKEN = getpass()

In [11]:
from openai import AsyncOpenAI
import json

# Module-level async client; get_data_from_openai() issues its requests through it.
client = AsyncOpenAI(api_key=OPENAI_API_TOKEN)

async def get_data_from_openai(message, model="gpt-4-turbo-preview"):
    """Send one batch description to the chat model and return the parsed JSON reply.

    The request consists of ``raw_prompt`` as the system message followed by
    ``message`` as the user message, with JSON-object response format requested.

    Args:
        message: User-turn text listing the objects to estimate.
        model: Chat-completions model name. Defaults to the model this notebook
            was written against, so existing one-argument calls are unchanged.

    Returns:
        The decoded JSON content of the first choice. The prompt asks for a
        dictionary of object name -> mass in kilograms.

    Raises:
        ValueError: If the first choice carries no text content, or (as
            ``json.JSONDecodeError``, a ``ValueError`` subclass) if the content
            is not valid JSON.
    """
    messages = [
        {"role": "system", "content": raw_prompt},
        {"role": "user", "content": message},
    ]
    response = await client.chat.completions.create(
        model=model,
        response_format={"type": "json_object"},
        messages=messages,
    )
    choice = response.choices[0]
    content = choice.message.content
    if content is None:
        # json.loads(None) would fail with an opaque TypeError; report what
        # actually happened so a failed batch is diagnosable from the log.
        finish_reason = getattr(choice, "finish_reason", None)
        raise ValueError(f"Model returned no content (finish_reason={finish_reason!r})")
    return json.loads(content)


In [41]:
import tqdm.notebook as tqdm
import pathlib
import asyncio

# Number of categories described in a single request to the model.
BATCH_SIZE = 20
# JSON file with the accumulated name -> mass results. run_pass() loads it (if it
# exists) to decide what is still missing and rewrites it when the pass ends.
RESULT_PATH = pathlib.Path("/scr/ig_pipeline/metadata/gpt4_masses.json")

def _describe_category(c):
    """Return the one-line description of category ``c`` that is sent to the model."""
    synset = c.synset
    assert synset is not None

    if synset.definition:
        return f'"{c.name}": a kind of "{synset.name}", defined as "{synset.definition}"\n'

    # The synset has no definition of its own: fall back to its first parent's.
    parent_synset = synset.parents[0]
    assert parent_synset.definition
    return (
        f'"{c.name}": a kind of "{synset.name}", which is a child of '
        f'{parent_synset.name}, defined as "{parent_synset.definition}"\n'
    )


async def run_pass():
    """Request masses for every category not yet in RESULT_PATH and save the result.

    Loads previously saved responses (if the file exists), finds the categories
    from ``Category.all_objects()`` whose name is not a key yet, sends them to
    the model in batches of ``BATCH_SIZE``, merges every successful reply into
    the loaded dictionary and writes it back to ``RESULT_PATH``.

    A failed batch is printed and skipped, so its categories are still missing
    afterwards and are requested again by the next call. The number of missing
    categories is printed before any request is made.

    The file is written in a ``finally`` block so that replies already received
    are kept even when the pass is interrupted or cancelled part-way through.
    """
    gpt4_responses = {}
    if RESULT_PATH.exists():
        with open(RESULT_PATH, "r") as f:
            gpt4_responses = json.load(f)

    gpt_missing = [c for c in Category.all_objects() if c.name not in gpt4_responses]
    missing_names = {c.name for c in gpt_missing}

    print(len(gpt_missing))

    # Build every message before creating any coroutine: if a category fails
    # its assertions we abort without leaving un-awaited coroutines behind.
    batch_messages = [
        "".join(_describe_category(c) for c in gpt_missing[batch_start:batch_start + BATCH_SIZE])
        for batch_start in range(0, len(gpt_missing), BATCH_SIZE)
    ]
    async_futures = [get_data_from_openai(message) for message in batch_messages]

    try:
        for result in tqdm.tqdm(asyncio.as_completed(async_futures), total=len(async_futures)):
            try:
                result_dict = await result
                gpt4_responses.update(result_dict)
            except Exception as e:
                # Best effort: report the failed batch and carry on with the rest.
                print(e)
            else:
                # Keys are stored exactly as returned, but names that were never
                # requested are reported so they can be reviewed.
                unexpected = [k for k in result_dict if k not in missing_names]
                if unexpected:
                    print(f"Unrequested keys in response (kept): {unexpected}")
    finally:
        with open(RESULT_PATH, "w") as f:
            json.dump(gpt4_responses, f)

In [42]:
# Run one pass. run_pass() only requests categories that are not in the result
# file yet, so this cell can be re-executed to retry batches that failed.
await run_pass()

214


  0%|          | 0/11 [00:00<?, ?it/s]

In [44]:
# Reload the saved responses and compare their keys with the knowledge base:
# categories that still have no estimate, and keys that match no category.
with RESULT_PATH.open("r") as f:
    gpt4_responses = json.load(f)

print(gpt4_responses)

print(
    "Missing categories",
    sum(1 for c in Category.all_objects() if c.name not in gpt4_responses),
)

cat_names = {c.name for c in Category.all_objects()}
print(
    "Extra categories",
    sum(1 for x in gpt4_responses if x not in cat_names),
)

{'acetone_atomizer': 0.5, 'acorn': 0.01, 'address': 0, 'adhesive_material': 1, 'agave': 5, 'air_conditioner': 40, 'air_filter': 1, 'air_freshener_atomizer': 0.5, 'alarm_clock': 0.5, 'alga': 0.2, 'allen_wrench': 0.1, 'allspice': 0.005, 'allspice_shaker': 0.2, 'almond': 0.001, 'alphabet_abacus': 1.5, 'aluminum_foil': 0.5, 'ammonia_water_atomizer': 0.5, 'antlers': 5, 'apple': 0.2, 'apple_pie': 1, 'apricot': 0.035, 'bag_of_yeast': 0.25, 'bagel': 0.085, 'bagel_dough': 1, 'baguette': 0.25, 'bait': 0.02, 'baking_powder': 0.25, 'baking_powder_jar': 0.5, 'baking_sheet': 1, 'balloon': 0.008, 'banana': 0.12, 'banana_bread': 0.28, 'bandage': 0.05, 'bandana': 0.08, 'bap': 0.06, 'bar': 50, 'bar_soap': 0.15, 'barbecue_sauce_bottle': 0.5, 'baseball': 0.145, 'baseball_bat': 0.9, 'baseball_cap': 0.11, 'baseball_glove': 0.3, 'baseboard': 5, 'basil': 0.02, 'basil_jar': 0.15, 'bath_rug': 1.5, 'bath_towel': 0.5, 'bathtub': 150, 'battery': 0.045, 'batting_gloves': 0.2, 'bay_leaf': 0.002, 'beach_toy': 0.2, 'b

In [45]:
# Now print everything in the order they show up in the category sheet so that we can paste this there.
import csv
# newline="" hands line-ending handling to the csv module, as its documentation
# requires; without it, newlines embedded in quoted fields can be misread.
with open("/scr/ig_pipeline/metadata/category_mapping.csv", "r", newline="") as f:
    category_list = list(csv.DictReader(f))

# Exactly one output line per sheet row, empty when the category has no
# estimate, so the printed column stays aligned with the sheet when pasted.
for cat in category_list:
    name = cat["category"]
    print(gpt4_responses.get(name, ""))

1000
15000
500
25
150
90
20
40.0
35.0
20
1.5
10
30
5
20
14
50
20
45
40
30
8.0
3.5
75
75.0
75
3
10
2
2.5
18
5
4
45
20
270
0.3
0.7
250
5
50
18
10.0
5
8
12
35
4.5
5
8
40
5
10
2
30.0
35
3
4
90.0
1.2
8
75
15
0.2
0.035
0.3
0.12
0.175
0.2
0.4
0.3
1
0.05
2
0.25
0.3
0.4
0.5
0.5
0.15
0.2
1.5
0.008
0.008
0.004
0.005
1.2
0.01
7
0.5
0.3
0.2
1.0
0.1
0.3
1.5
0.05
0.2
2.0
0.01
0.005
0.3
2.5
0.09
0.004
0.4
0.05
0.008
0.075
1.25
0.03
0.2
0.5
85
1
0.4
0.5
1
4
0.1
0.15
0.5
0.4
0.4
0.02
0.005
0.9
0.2
0.001
0.02
0.5
0.05
0.005
0.2
0.2
0.01
0.006
0.2
0.3
2
0.001
0.08
0.25
1.0
9
0.2
4.5
0.025
0.002
0.005
1.0
0.1
0.1
0.02
0.02
0.001
0.012
0.3
0.08
0.07
1.0
0.002
0.3
0.3
0.15
0.015
9
10
1.5
0.3
0.75
0.1
1.2
0.8
0.5
0.3
0.9
0.8
0.25
0.1
0.5
1.0
0.5
0.1
0.2
0.008
0.4
0.25
0.2
0.05
0.4
0.5
0.1
0.2
1.5
5
600
1.2
0.01
0.3
0.2
0.2
1
0.5
0.1
0.015
0.2
0.7
0.001
0.5
0.025
2
2
0.002
0.02
0.2
0.1
0.1
0.015
1.2
5
0.4
0.1
1
0.005
0.1
1
3
0.7
0.1
0.5
2
0.2
0.05
0.4
7.5
0.5
1.5
1.5
0.2
1500
1.5
0.5
40
2.5
0.5
1.2
0.25
10
0.0