In [3]:
import base64
import os

from dotenv import load_dotenv
from openai import OpenAI

In [4]:
# Call load_dotenv() before reading the environment -- presumably so API_KEY
# can be supplied through a local .env file instead of being hardcoded here.
load_dotenv()
client = OpenAI(api_key=os.getenv('API_KEY'))

In [68]:
import os
import json

def _list_visible(path):
    """Return the names under ``path`` in sorted order, skipping dotfiles."""
    return sorted(name for name in os.listdir(path) if not name.startswith('.'))


def build_directory_json(root_dir):
    """Index a ``<root>/<model>/<prompt case>/<seed>/<image>`` directory tree.

    Args:
        root_dir: Directory whose immediate sub-directories are model names.

    Returns:
        A list with one dict per (model, prompt case) pair:
          * ``"Model"``  - the model directory name.
          * ``"Images"`` - one ``{"Seed": <dir name>, "Paths": [...]}`` dict per
            seed directory; every ``"Paths"`` item is
            ``{"View": <file name up to the first '.'>, "Path": <joined path>}``.
          * ``"Prompt"`` - first line of ``prompt.txt`` without its line ending;
            the key is absent when the prompt case has no ``prompt.txt``.

    Entries are visited in sorted name order so the result is reproducible
    (``os.listdir`` returns names in arbitrary order). Hidden entries
    (``.DS_Store``, ``.ipynb_checkpoints``, ...) and stray non-directory files
    are skipped instead of being indexed or crashing the walk.
    """
    output = []
    for model_name in _list_visible(root_dir):
        model_dir = os.path.join(root_dir, model_name)
        if not os.path.isdir(model_dir):
            continue  # stray file next to the model folders

        for case_name in _list_visible(model_dir):
            case_dir = os.path.join(model_dir, case_name)
            if not os.path.isdir(case_dir):
                continue  # stray file next to the prompt-case folders

            curr_dict = {"Model": model_name, "Images": []}
            for entry_name in _list_visible(case_dir):
                entry_path = os.path.join(case_dir, entry_name)
                if entry_name == 'prompt.txt':
                    with open(entry_path, 'r', encoding='utf-8') as f:
                        # Only the first line is the prompt; drop its newline.
                        curr_dict["Prompt"] = f.readline().rstrip('\r\n')
                elif os.path.isdir(entry_path):
                    seed_dict = {"Seed": entry_name, "Paths": []}
                    for image_name in _list_visible(entry_path):
                        seed_dict["Paths"].append({
                            "View": image_name.split('.')[0],
                            "Path": os.path.join(entry_path, image_name),
                        })
                    curr_dict["Images"].append(seed_dict)
                # Any other plain file inside a prompt case is ignored.
            output.append(curr_dict)
    return output

# Walk the rendered-image tree and turn it into a JSON-serialisable index.
root_directory = 'images/'
directory_json = build_directory_json(root_directory)

# Serialise once so the text saved to disk and the text echoed below match.
json_string = json.dumps(directory_json, indent=4)
with open('images.json', mode='w', encoding='utf-8') as f:
    f.write(json_string)

print(json_string)

[
    {
        "Model": "mvdream",
        "Images": [
            {
                "Seed": "2",
                "Paths": []
            },
            {
                "Seed": "982",
                "Paths": [
                    {
                        "View": "back",
                        "Path": "images/mvdream/cheerleading/982/back.png"
                    },
                    {
                        "View": "front",
                        "Path": "images/mvdream/cheerleading/982/front.png"
                    },
                    {
                        "View": "left",
                        "Path": "images/mvdream/cheerleading/982/left.png"
                    },
                    {
                        "View": "right",
                        "Path": "images/mvdream/cheerleading/982/right.png"
                    }
                ]
            }
        ],
        "Prompt": "High school cheerleading"
    },
    {
        "Model": "mvdream",
        "Image

In [None]:
# Instruction template for the pairwise comparison. It describes eight input
# views (four per object), six evaluation criteria, and a fixed answer format
# whose last row is six option numbers (1 = object 1, 2 = object 2,
# 3 = cannot decide).
# NOTE(review): the introduction says only RGB renderings are supplied, yet
# criteria 2, 3 and 5 refer to normal images/maps -- confirm whether normal
# maps will be sent too or whether the wording should be adjusted.
# NOTE(review): the template has no slot for the text prompt under
# evaluation; presumably the caller attaches it separately -- TODO confirm.
gpt4v_prompt = '''
Our task here is to compare two 3D objects, both generated from the same text prompt.
I will provide you with eight multi-view RGB renderings from the two objects, where the first four images are the 
front, back, left, and right views of 3D object 1, and the next four images are the front, back, left, and right views of 3D object 2.
We want to decide which of these two objects is better according to the following criteria. 

# Instruction
1. Text prompt and Asset Alignment. Focus on how well they correspond to the given text description. An ideal model should accurately reflect all objects
and surroundings mentioned in the text prompt, capturing the corresponding attributes as described. Please first describe each of the two models, and then
evaluate how well it covers all the attributes in the original text prompt.
2. 3D Plausibility. Look at both the RGB and normal images and imagine a 3D model from the multi-view images. Determine which model appears more natural,
solid, and plausible. Pay attention to any irregularities, such as abnormal body proportions, duplicated parts, or the presence of noisy or meaningless 3D
structures. An ideal model should possess accurate proportions, shapes, and structures that closely resemble the real-world object or scene.
3. Geometry-Texture Alignment. This examines how well the texture adheres to the geometry. The texture and shape should align with each other locally.
For instance, a flower should resemble a flower in both the RGB and normal map, rather than solely in the RGB. The RGB image and its corresponding normal
image should exhibit matching structures.
4. Low-Level Texture Details. Focus on local parts of the RGB images. Assess which model effectively captures fine details without appearing blurry and
which one aligns with the desired aesthetic of the 3D model. Note that overly abstract and stylized textures are not desired unless specifically mentioned in
the text prompt.
5. Low-Level Geometry Details. Focus on the local parts of the normal maps. The geometry should accurately represent the intended shape. Note that
meaningless noise is not considered as high-frequency details. Determine which one has a more well-organized and efficient structure, which one exhibits
intricate details, and which one is more visually pleasing and smooth.
6. Considering all the degrees above, which one is better overall?
Take a really close look at each of the multi-view images for these two 3D objects before providing your answer.
When evaluating these aspects, focus on one of them at a time.
Try to make independent decisions between these criteria.

# Output format
To provide an answer, please provide a short analysis for each of the abovementioned evaluation criteria. The analysis should be very concise and accurate.
For each of the criteria, you need to make a decision using these three options:
1. left (object 1) is better;
2. right (object 2) is better;
3. Cannot decide.
IMPORTANT: PLEASE USE THE THIRD OPTION SPARSELY.
Then, in the last row, summarize your final decision by "<option for criterion 1> <option for criterion 2> <option for criterion 3> <option for criterion 4> <option
for criterion 5> <option for criterion 6>".
# Example
"
Analysis:
1. Text prompt and Asset Alignment: The left one ...; The right one ...; The left/right one is better or cannot decide.
2. 3D Plausibility. The left one ...; The right one ...; The left/right one is better or cannot decide.
3. Geometry-Texture Alignment. The left one ...; The right one ...; The left/right one is better or cannot decide.
4. Low-Level Texture Details. The left one ...; The right one ...; The left/right one is better or cannot decide.
5. Low-Level Geometry Details. The left one ...; The right one ...; The left/right one is better or cannot decide.
6. Overall, ... The left/right one is better or cannot decide.
Final answer:
x x x x x x (e.g., 1 2 2 3 2 1 / 3 3 3 2 1 3 / 3 2 2 1 1 1)
"
'''

In [None]:
def gpt4v_inference(prompt: str, data1: dict, data2: dict) -> dict:
    """Placeholder for the GPT-4V comparison request.

    Intended contract, per the design notes: the request body is the GPT-4V
    prompt plus the two inputs, and the inference result is returned as a
    dict. Nothing is implemented yet, so every call currently yields ``None``
    regardless of the arguments.
    """
    return None

In [1]:
# read json file
# for each subfolder within the images folder, iterate through each of the five prompts
# for each prompt, sample two seeds (4 images per seed). Not a random sample but every combo of seeds (6 combos).
# write a prompt for GPT4V that compares these two seeds and finds the preference, along with an explanation.
# then store this result to json. 
# repeat for each prompt (5 prompts), to get a total of 30 comparisons.

import json
from typing import List

def parse_images(image_file: str, model: str = 'mvdream', inference_fn=None) -> List:
    """Compare every pair of seeds for each prompt case of one model.

    Args:
        image_file: Path to a JSON index: a list of dicts with ``"Model"``,
            ``"Prompt"`` and ``"Images"`` keys, where ``"Images"`` is a list
            of per-seed dicts.
        model: Only index entries whose ``"Model"`` equals this value are
            processed.
        inference_fn: Optional callable ``(prompt, seed_a, seed_b) -> result``
            invoked once per unordered seed pair. It receives the full
            per-seed dicts, not just the seed names. When omitted, no
            comparison is run.

    Returns:
        The ``inference_fn`` results, in index order and then in (i < j)
        pair order. An empty list when ``inference_fn`` is ``None``.

    Raises:
        KeyError: If an entry lacks ``"Model"``, or a matching entry lacks
            ``"Prompt"`` or ``"Images"`` (only checked when ``inference_fn``
            is given).
    """
    # Load everything up front so the file is closed before any slow work.
    with open(image_file, 'r', encoding='utf-8') as f:
        image_list = json.load(f)

    output_list = []
    if inference_fn is None:
        # Dry run: nothing to evaluate, so nothing to collect.
        return output_list

    for item in image_list:
        if item['Model'] != model:
            continue
        prompt = item['Prompt']
        seeds = item['Images']
        # Every unordered pair exactly once, e.g. 4 seeds -> 6 comparisons.
        for i, first in enumerate(seeds):
            for second in seeds[i + 1:]:
                output_list.append(inference_fn(prompt, first, second))

    return output_list
                                  
# Run over the index file written by the earlier cell; the returned list is
# shown as this cell's output.
parse_images('images.json')


[]