In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import json
from PIL import Image
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

def load_and_resize_image(image_path, max_size=1000):
    """
    安全加载图片，自动检查、降分辨率。
    
    Args:
        image_path (str): 图片路径
        max_size (int): 模型支持的最大分辨率 (shorter edge)
    
    Returns:
        PIL.Image: RGB 图片，已 resize
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"❌ Image not found: {image_path}")

    img = Image.open(image_path).convert("RGB")

    # 如果图片太大则等比缩放
    width, height = img.size
    if max(width, height) > max_size:
        # 等比缩放，最大边= max_size
        scale = max_size / max(width, height)
        new_size = (int(width * scale), int(height * scale))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    return img

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.07it/s]
Device set to use cuda:0


## EgoOrientBench

In [2]:
image_base_path = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/"
json_file = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/benchmark.json"
import json
from PIL import Image
from tqdm.auto import tqdm
with open(json_file, "r") as f:
    data = json.load(f)
import os

def evaluate_spaceom(pipe, data):
    acc = 0
    total = len(data)
    results = []
    for item in tqdm(data):
        image_path = f"{image_base_path}/{item['image']}"
        # img = Image.open(image_path).convert("RGB")
        img = load_and_resize_image(image_path, max_size=224)  # Ensure image is resized correctly

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": item["question"]}
                ]
            },
        ]
        response = pipe(text=messages)
        model_answer = response[0]["generated_text"][-1]["content"].strip()
        results.append({
            "image": item["image"],
            "question": item["question"],
            "answer": model_answer,
            "label": item["label"],
            "Accuracy": item["label"] in model_answer
        })
    #save results to a JSON file
    with open("EgoOrientBench_spaceom_results.json", "w") as f:
        json.dump(results, f, indent=4)
    print(f"Accuracy: {sum(1 for r in results if r['Accuracy']) / total * 100:.2f}%")
    print(f"Total: {total}")
    return results

In [None]:
evaluate_spaceom(pipe, data)

In [4]:
from joblib import Parallel, delayed
import torch
import json
import os
from tqdm.auto import tqdm
from PIL import Image

# 数据路径
image_base_path = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/"
json_file = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/benchmark.json"

with open(json_file, "r") as f:
    data = json.load(f)

print(f"Total samples: {len(data)}")

Total samples: 33460


In [7]:
def load_pipe():
    from transformers import pipeline
    pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm",
        device=0  # 注意这里一定写 device=0，因为每个进程都只看到自己的 GPU
    )
    return pipe

In [8]:
def chunkify(lst, n):
    """把列表均分为 n 份"""
    return [lst[i::n] for i in range(n)]

def load_and_resize_image(path, max_size=224):
    img = Image.open(path).convert("RGB")
    img.thumbnail((max_size, max_size))
    return img


def worker(gpu_id, data_chunk):
    import os
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    import torch

    # TODO: 在这里重新加载 pipe
    # 例如：
    # from transformers import pipeline
    # pipe = pipeline("your-task", device=0)
    pipe = load_pipe()  # 你需要自己实现这个

    results = []
    for item in tqdm(data_chunk, desc=f"GPU {gpu_id}"):
        image_path = f"{image_base_path}/{item['image']}"
        img = load_and_resize_image(image_path, max_size=224)

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": item["question"]}
                ]
            },
        ]
        response = pipe(text=messages)
        model_answer = response[0]["generated_text"][-1]["content"].strip()
        results.append({
            "image": item["image"],
            "question": item["question"],
            "answer": model_answer,
            "label": item["label"],
            "Accuracy": item["label"] in model_answer
        })
    return results

num_gpus = torch.cuda.device_count()
print(f"Detected GPUs: {num_gpus}")

# 均匀分块
chunks = chunkify(data, num_gpus)

# 并行执行
all_results = Parallel(n_jobs=num_gpus)(
    delayed(worker)(gpu_id, chunk) for gpu_id, chunk in enumerate(chunks)
)

# 展平
flat_results = [item for sublist in all_results for item in sublist]

# 保存
with open("EgoOrientBench_spaceom_results.json", "w") as f:
    json.dump(flat_results, f, indent=4)

acc = sum(1 for r in flat_results if r['Accuracy']) / len(flat_results)
print(f"Accuracy: {acc * 100:.2f}%")
print(f"Total: {len(flat_results)}")

Detected GPUs: 4


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.86it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.66it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.75it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.48it/s]
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
GPU 1:   0%|          | 4/8365 [00:01<44:14,  3.15it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GPU 0:   0%|          | 10/8365 [00:01<16:19,  8.53it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GPU 1:   0%|          | 10/8365 [00:02<31:01,  4.49it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GPU 3

Accuracy: 12.45%
Total: 33460


GPU 1: 100%|██████████| 8365/8365 [36:01<00:00,  3.87it/s]


In [12]:
path = "/home/disheng/Spatial_Survey/Spatial_VLM_Survey/code/evaluation/EgoOrientBench_spaceom_results.json"
data = json.load(open(path, "r"))
acc  = 0
for item in data:
    answer = item["answer"].lower()
    label = item["label"].lower()
    if label in answer:
        acc += 1
print(f"Accuracy: {acc / len(data) * 100:.2f}%")

Accuracy: 40.82%


## GeoMeter

### real data

In [119]:
# %%
# 🚀 1) 环境准备
from pathlib import Path
from PIL import Image
import json
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline
import re

# %%
# ⚙️ 2) 参数配置
IMAGE_BASE   = Path("/home/disheng/Spatial_Survey/Datasets/GeoMeter/Real/")
JSONL_FILE   = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Real/depth_height_1000_realworld.jsonl"
MODEL_NAME   = "remyxai/SpaceOm"
TASK         = "image-text-to-text"
DEVICE_ID    = 2      # 单卡就用 0
BATCH_SIZE   = 1     # 根据显存调

# %%
# 📖 3) 读数据
data = []
with open(JSONL_FILE, "r") as f:
    for line in f:
        item = json.loads(line)
        assert "images" in item and "query_text" in item and "target_text" in item
        data.append(item)

# %%
# 🔧 4) 初始化 pipeline
pipe = pipeline(
    TASK,
    model=MODEL_NAME,
    device=DEVICE_ID,
    batch_size=BATCH_SIZE,
)

# %%
# 🏃 5) 批量推理 + 进度条
results = []
for i in tqdm(range(0, len(data), BATCH_SIZE), desc="Inference"):
    batch = data[i : i + BATCH_SIZE]
    messages = []
    for item in batch:
        img = Image.open(IMAGE_BASE / item["images"][0]).convert("RGB")
        messages.append({
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text",  "text":  item["query_text"]}
            ]
        })

    outputs = pipe(text=messages)

    for item, out in zip(batch, outputs):
        # 取 generated_text 这个 list，然后找出 assistant 那条
        gen_list = out.get("generated_text", [])
        assistant_entry = next(
            (entry for entry in gen_list if entry.get("role")=="assistant"),
            None
        )
        if assistant_entry is None:
            # 万一没找到，就降级处理
            raw_text = str(gen_list)
        else:
            raw_text = assistant_entry.get("content", "")

        # 用正则去掉末尾多余的逗号、句号
        pred = re.sub(r"[，,\.。]+$", "", raw_text).strip()

        results.append({
            "image": item["images"][0],
            "query": item["query_text"],
            "pred":  pred,
            "gold":  item["target_text"]
        })

# %%
# 📊 6) 计算准确率 & 保存
accuracy = 0.0
df = pd.DataFrame(results)
for pred, gold in zip(df["pred"], df["gold"]):
    if pred.lower() in gold.lower():
        accuracy += 1
accuracy /= len(df)
print(f"▶️ Overall Accuracy: {accuracy:.2%}")
print("number of accurate samples:", int(accuracy * len(df)))
print("number of samples:", len(df))

# df.to_json("geobench_real_results_with_predictions.json",
#            orient="records", indent=2, force_ascii=False)
# print("✅ 结果已保存到 geobench_real_results_with_predictions.{json,csv}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 16.25it/s]
Device set to use cuda:2
Inference: 100%|██████████| 100/100 [00:18<00:00,  5.50it/s]

▶️ Overall Accuracy: 43.00%
number of accurate samples: 43
number of samples: 100





### synthetic data

### depth

In [None]:

# depth

base_ir = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth"
dir_list = os.listdir(base_ir)

depth_results = {}
depth_3_shapes = {}
depth_5_shapes = {}
for each_sub_set in dir_list: # [images-3-shapes, images-5-shapes]
    each_shape_set_results = {}
    
    for each_example in os.listdir(os.path.join(base_ir, each_sub_set, "prompts")):  #each prompt
        prompts_json = os.path.join(base_ir, each_sub_set, "prompts", each_example)
        with open(prompts_json, "r") as f:
            prompts = json.load(f)

        image_name = prompts["filename"]
        image_path = os.path.join(base_ir, each_sub_set, "imgs", image_name)
        image_labelled_path = os.path.join(base_ir, each_sub_set, "labelled", image_name)
        image_labelled_id_path = os.path.join(base_ir, each_sub_set, "labelled_id", image_name)
        image_labelled_id_reverse_path = os.path.join(base_ir, each_sub_set, "labelled_id_reverse", image_name)

        prompts_plain = prompts["prompts"]
        prompts_labelled = prompts["prompts_labelled"]
        prompts_labelled_id = prompts["prompts_labelled_id"]

        plain_results = {}
        labelled_results = {}
        labelled_id_results = {}
        labelled_id_reverse_results = {}

        for index, each_prompt in enumerate(prompts_plain):
            answer = each_prompt["answer"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]

            plain_results[index] = {
                "image": image_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }
        
        for index, each_prompt in enumerate(prompts_labelled):
            answer = each_prompt["answer"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_labelled_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"] 

            labelled_results[index] = {
                "image": image_labelled_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

        for index, each_prompt in enumerate(prompts_labelled_id):
            answer = each_prompt["answer"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_labelled_id_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]     
            labelled_id_results[index] = {
                "image": image_labelled_id_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

        for index, each_prompt in enumerate(prompts_labelled_id):
            answer = each_prompt["answerReverse"]
            textual_prompt = each_prompt["prompt"] 
            answer_set = ";\n".join(each_prompt["answerSet"])

            full_prompt = textual_prompt + "\nAnswer Set:\n" + answer_set

            messages = []
            img = load_and_resize_image(image_labelled_id_reverse_path)
            messages.append({
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text",  "text":  full_prompt}
                ]
            })
            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]
            labelled_id_reverse_results[index] = {
                "image": image_labelled_id_reverse_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }
        if "3-shapes" in each_sub_set:
            depth_3_shapes[each_sub_set] = {
                "plain": plain_results,
                "labelled": labelled_results,
                "labelled_id": labelled_id_results,
                "labelled_id_reverse": labelled_id_reverse_results
            }
        elif "5-shapes" in each_sub_set:
            depth_5_shapes[each_sub_set] = {
                "plain": plain_results,
                "labelled": labelled_results,
                "labelled_id": labelled_id_results,
                "labelled_id_reverse": labelled_id_reverse_results
            }   

    depth_results[each_sub_set] = {
        "plain": plain_results, 
        "labelled": labelled_results,
        "labelled_id": labelled_id_results,
        "labelled_id_reverse": labelled_id_reverse_results
    }

# Save results          
import json
depth_results_path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth/depth_results.json"
with open(depth_results_path, "w") as f:
    json.dump({
        "depth_3_shapes": depth_3_shapes,
        "depth_5_shapes": depth_5_shapes,
        "depth_results": depth_results
    }, f, indent=4)     


In [68]:
def load_and_resize_image(path, max_size=224):
    img = Image.open(path).convert("RGB")
    img.thumbnail((max_size, max_size))
    return img

def chunkify(lst, n):
    return [lst[i::n] for i in range(n)]

def load_pipe():
    from transformers import pipeline
    pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm",
    device=0  # 注意这里一定写 device=0，因为每个进程都只看到自己的 GPU
    )
    return pipe

In [71]:
def process_subset(subset_name):
    from tqdm.auto import tqdm

    pipe = load_pipe()  # 只在这里加载，每个 GPU 一份

    each_shape_set_results = {}

    prompts_dir = os.path.join(base_ir, subset_name, "prompts")
    examples = os.listdir(prompts_dir)

    for each_example in tqdm(examples, desc=f"Subset {subset_name}"):
        with open(os.path.join(prompts_dir, each_example), "r") as f:
            prompts = json.load(f)

        img_base = os.path.join(base_ir, subset_name)
        paths = {
            "plain": os.path.join(img_base, "imgs", prompts["filename"]),
            "labelled": os.path.join(img_base, "labelled", prompts["filename_labelled"]),
            "labelled_id": os.path.join(img_base, "labelled_id", prompts["filename_labelled"]),
            "labelled_reverse_id": os.path.join(img_base, "labelled_reverse_id", prompts["filename_labelled"])
        }

        def run_prompts(prompts_list, img_path, is_reverse=False):
            output = {}
            for index, p in enumerate(prompts_list):
                answer = p["answerReverse"] if is_reverse else p["answer"]
                full_prompt = p["prompt"] + "\nAnswer Set:\n" + ";\n".join(p["answerSet"])
                img = load_and_resize_image(img_path)

                messages = [{
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img},
                        {"type": "text", "text": full_prompt}
                    ]
                }]
                try:
                    inference_result = pipe(text=messages)
                    final_answer = inference_result[0].get("generated_text", [])[-1]["content"]
                except Exception as e:
                    print(f"Error: {e}")
                    final_answer = "ERROR"

                output[index] = {
                    "image": img_path,
                    "prompt": full_prompt,
                    "answer": answer,
                    "pred": final_answer,
                    "accuracy": answer.lower() in final_answer.lower()
                }
            return output

        plain = run_prompts(prompts["prompts"], paths["plain"])
        labelled = run_prompts(prompts["prompts_labelled"], paths["labelled"])
        labelled_id = run_prompts(prompts["prompts_labelled_id"], paths["labelled_id"])
        labelled_reverse_id = run_prompts(prompts["prompts_labelled_id"], paths["labelled_reverse_id"], is_reverse=True)

        each_shape_set_results[each_example] = {
            "plain": plain,
            "labelled": labelled,
            "labelled_id": labelled_id,
            "labelled_reverse_id": labelled_reverse_id
        }

    return (subset_name, each_shape_set_results)


def main_parallel():
    num_gpus = 4  # 你有 4 张 GPU
    print(f"Using {num_gpus} GPUs")

    chunks = chunkify(dir_list, num_gpus)

    def worker(gpu_id, chunk):
        import os
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
        results = {}
        for subset in chunk:
            name, res = process_subset(subset)
            results[name] = res
        return results

    from joblib import Parallel, delayed

    all_results = Parallel(n_jobs=num_gpus)(
        delayed(worker)(gpu_id, chunk) for gpu_id, chunk in enumerate(chunks)
    )

    # 合并
    depth_results = {}
    for part in all_results:
        depth_results.update(part)

    depth_3_shapes = {k: v for k, v in depth_results.items() if "3-shapes" in k}
    depth_5_shapes = {k: v for k, v in depth_results.items() if "5-shapes" in k}

    save_path = os.path.join(base_ir, "depth_results.json")
    with open(save_path, "w") as f:
        json.dump({
            "depth_3_shapes": depth_3_shapes,
            "depth_5_shapes": depth_5_shapes,
            "depth_results": depth_results
        }, f, indent=4)

    print(f"Saved to: {save_path}")

main_parallel()

Using 4 GPUs


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.59it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.88it/s]
Device set to use cuda:0
Device set to use cuda:0
Subset images-5-shapes:   0%|          | 0/400 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Subset images-3-shapes: 100%|██████████| 100/100 [09:03<00:00,  5.43s/it] 
Subset images-5-shapes: 100%|██████████| 400/400 [1:01:12<00:00,  9.18s/it]


Saved to: /home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth/depth_results.json


In [118]:
path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/depth/depth_results.json"
import json
from pathlib import Path
from tqdm.auto import tqdm
depth_results = json.load(open(path, "r"))
depth_3_shapes = depth_results["depth_3_shapes"]
depth_5_shapes = depth_results["depth_5_shapes"]
depth_results = depth_results["depth_results"]
acc = 0
total = 0
for json in depth_3_shapes["images-3-shapes"]:
    plain = depth_3_shapes["images-3-shapes"][json]["plain"]
    labelled = depth_3_shapes["images-3-shapes"][json]["labelled"]
    labelled_id = depth_3_shapes["images-3-shapes"][json]["labelled_id"]
    labelled_reverse_id = depth_3_shapes["images-3-shapes"][json]["labelled_reverse_id"]
    acc += sum(1 for v in plain.values() if v["accuracy"])
    acc += sum(1 for v in labelled.values() if v["accuracy"])
    acc += sum(1 for v in labelled_id.values() if v["accuracy"])
    acc += sum(1 for v in labelled_reverse_id.values() if v["accuracy"])
    total += len(plain) + len(labelled) + len(labelled_id) + len(labelled_reverse_id)
print(f"Accuracy: {acc / total * 100:.2f}%")

for json in depth_5_shapes["images-5-shapes"]:
    plain = depth_5_shapes["images-5-shapes"][json]["plain"]
    labelled = depth_5_shapes["images-5-shapes"][json]["labelled"]
    labelled_id = depth_5_shapes["images-5-shapes"][json]["labelled_id"]
    labelled_reverse_id = depth_5_shapes["images-5-shapes"][json]["labelled_reverse_id"]
    acc += sum(1 for v in plain.values() if v["accuracy"])
    acc += sum(1 for v in labelled.values() if v["accuracy"])
    acc += sum(1 for v in labelled_id.values() if v["accuracy"])
    acc += sum(1 for v in labelled_reverse_id.values() if v["accuracy"])
    total += len(plain) + len(labelled) + len(labelled_id) + len(labelled_reverse_id)
print(f"Accuracy: {acc / total * 100:.2f}%")
print("number of Accurate: ", acc)
print("number of Total: ", total)
    



Accuracy: 34.85%
Accuracy: 28.84%
number of Accurate:  4811
number of Total:  16684


### heigh

In [None]:
import os
import json
from tqdm.auto import tqdm

base_path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/height"
sub_classes = os.listdir(base_path)

height_results = {}

print(f"Found {len(sub_classes)} sub-classes: {sub_classes}")

for each_sub_class in sub_classes:
    print(f"Processing {each_sub_class}...")
    sub_class_path = os.path.join(base_path, each_sub_class)
    prompts_dir = os.path.join(sub_class_path, "prompts")
    prompts_files = os.listdir(prompts_dir)

    for each_prompt_file in tqdm(prompts_files, desc=f"{each_sub_class} prompts"):
        prompt_path = os.path.join(prompts_dir, each_prompt_file)
        with open(prompt_path, "r") as f:
            prompt_json = json.load(f)

        image_name = prompt_json["filename"]
        image_path = os.path.join(sub_class_path, "imgs", image_name)
        prompts_list = prompt_json["prompts"]

        for index, prompt in enumerate(prompts_list):
            answer = prompt["answer"]
            answer_set = ";\n".join(prompt["answerSet"])
            full_prompt = prompt["prompt"] + "\nAnswer Set:[\n" + answer_set + "\n]"

            img = load_and_resize_image(image_path)
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": full_prompt}
                ]
            }]

            inference_result = pipe(text=messages)
            final_answer = inference_result[0].get("generated_text", [])[-1]["content"]

            # 记录结果，分层结构: {sub_class: {image_name: {index: ...}}}
            height_results.setdefault(each_sub_class, {})
            height_results[each_sub_class].setdefault(image_name, {})
            height_results[each_sub_class][image_name][index] = {
                "image": image_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

# 保存结果
height_results_path = os.path.join(base_path, "height_results.json")
with open(height_results_path, "w") as f:
    json.dump(height_results, f, indent=4)

print(f"Results saved to: {height_results_path}")

In [114]:
import os
import json
from tqdm.auto import tqdm
from joblib import Parallel, delayed
import torch

# 路径
base_path = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/height"
sub_classes = os.listdir(base_path)
print(f"Found {len(sub_classes)} sub-classes: {sub_classes}")

def chunkify(lst, n):
    """将列表均匀分块"""
    return [lst[i::n] for i in range(n)]

def process_height_sub_class(sub_class):
    from tqdm.auto import tqdm
    import os

    pipe = load_pipe()  # 每张 GPU 独立加载

    sub_class_path = os.path.join(base_path, sub_class)
    prompts_dir = os.path.join(sub_class_path, "prompts")
    prompt_files = os.listdir(prompts_dir)

    results = {}

    for each_prompt_file in tqdm(prompt_files, desc=f"Sub-class: {sub_class}"):
        with open(os.path.join(prompts_dir, each_prompt_file), "r") as f:
            prompt_json = json.load(f)

        image_name = prompt_json["filename"]
        image_path = os.path.join(sub_class_path, "imgs", image_name)
        prompts_list = prompt_json["prompts"]

        for index, prompt in enumerate(prompts_list):
            answer = prompt["answer"]
            answer_set = ";\n".join(prompt["answerSet"])
            full_prompt = prompt["prompt"] + "\nAnswer Set:[\n" + answer_set + "\n]\nJust select the answer from the set without any explanation."

            img = load_and_resize_image(image_path)
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image", "image": img},
                    {"type": "text", "text": full_prompt}
                ]
            }]

            try:
                inference_result = pipe(text=messages)
                final_answer = inference_result[0].get("generated_text", [])[-1]["content"]
            except Exception as e:
                print(f"Error: {e}")
                final_answer = "ERROR"

            # 结构: {sub_class: {image_name: {index: ...}}}
            results.setdefault(sub_class, {})
            results[sub_class].setdefault(image_name, {})
            results[sub_class][image_name][index] = {
                "image": image_path,
                "prompt": full_prompt,
                "answer": answer,
                "pred": final_answer,
                "accuracy": answer.lower() in final_answer.lower(),
            }

    return results

def worker(gpu_id, chunk):
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    gpu_results = {}

    for sub_class in chunk:
        res = process_height_sub_class(sub_class)
        gpu_results.update(res)

    return gpu_results

def run_height_parallel():
    num_gpus = torch.cuda.device_count()
    print(f"Detected GPUs: {num_gpus}")

    chunks = chunkify(sub_classes, num_gpus)

    all_results = Parallel(n_jobs=num_gpus)(
        delayed(worker)(gpu_id, chunk) for gpu_id, chunk in enumerate(chunks)
    )

    # 合并
    height_results = {}
    for part in all_results:
        height_results.update(part)

    # 保存
    save_path = os.path.join(base_path, "height_results.json")
    with open(save_path, "w") as f:
        json.dump(height_results, f, indent=4)

    print(f"Results saved to: {save_path}")


run_height_parallel()

Found 8 sub-classes: ['images-3-stacks-colored', 'images-3-stacks-stepped-colored', 'images-3-stacks-stepped', 'images-5-stacks-stepped', 'images-5-stacks-colored', 'images-3-stacks', 'images-5-stacks', 'images-5-stacks-stepped-colored']
Detected GPUs: 4


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.36it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.82it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.41it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.71it/s]
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Device set to use cuda:0
Sub-class: images-3-stacks-stepped:   2%|▏         | 4/200 [00:01<01:09,  2.81it/s].60it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Sub-class: images-5-stacks-stepped:   1%|          | 2/200 [00:01<02:41,  1.22it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a datase

Results saved to: /home/disheng/Spatial_Survey/Datasets/GeoMeter/Synthetic/height/height_results.json


Sub-class: images-5-stacks-colored: 100%|██████████| 250/250 [03:00<00:00,  1.39it/s]


In [117]:
acc = 0
total = 0
height_results_path = os.path.join(base_path, "height_results.json")
height_results = json.load(open(height_results_path, "r"))
for sub_class, images in height_results.items():
    for image_name, prompts in images.items():
        for index, result in prompts.items():
            if result["accuracy"]:
                acc += 1
            total += 1
print(f"Accuracy: {acc / total * 100:.2f}%")
print("number of accurate results:", acc)
print("total number of results:", total)

Accuracy: 24.23%
number of accurate results: 1713
total number of results: 7069


In [120]:
number_of_samples = 7069 + 100 + 16684
number_of_accurate_samples = 1713 + 4811 + 43
print("final Accuracy: ", number_of_accurate_samples / number_of_samples * 100)

final Accuracy:  27.53112815997988


## OmniSpatial

In [None]:
# export HF_ENDPOINT="https://hf-mirror.com"
!mkdir -p dataset
!huggingface-cli download --resume-download qizekun/OmniSpatial --local-dir dataset --repo-type dataset
!find dataset/ -name '*.zip' -exec unzip -o {} -d dataset/ \;
!rm -f dataset/*.zip && rm -rf dataset/__MACOSX

In [4]:
!export CUDA_VISIBLE_DEVICES=3

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
from PIL import Image
import os

def load_and_resize_image(image_path, max_size=448):
    """
    安全加载图片，自动检查、降分辨率。
    
    Args:
        image_path (str): 图片路径
        max_size (int): 模型支持的最大分辨率 (shorter edge)
    
    Returns:
        PIL.Image: RGB 图片，已 resize
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"❌ Image not found: {image_path}")

    img = Image.open(image_path).convert("RGB")

    # 如果图片太大则等比缩放
    width, height = img.size
    if max(width, height) > max_size:
        # 等比缩放，最大边= max_size
        scale = max_size / max(width, height)
        new_size = (int(width * scale), int(height * scale))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    return img

In [3]:
from tqdm.auto import tqdm
from transformers import pipeline
import json
from PIL import Image
# pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")
question_types =  {0:"Dynamic_Reasoning", 
                   1:"Spatial_Interaction", 
                   2:"Complex_Logic", 
                   3:"Perspective_Taking"}
annotation = "/home/disheng/Spatial_Survey/Datasets/OmniSpatial/dataset/data.json"
import json
annotation_data = json.load(open(annotation, "r"))
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")
record = {}
for item in tqdm(annotation_data):
    iid = item["id"]
    image_id = item["id"].split("_")[0]
    question_type = item["task_type"]

    question = item["question"]
    options = item["options"]
    full_prompt = f"Question: {question}\nOptions: {"; ".join(options)}. Please only retuen a correct option without analysis."
    label = item["answer"]
    if len(options) != 0:
        label = options[label]
    image_path = f"/home/disheng/Spatial_Survey/Datasets/OmniSpatial/dataset/{question_type}/{image_id}.png"
    img = load_and_resize_image(image_path, max_size=448)  # 使用安全加载函数
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        print(f"Error processing {iid}: {e}")
        print(image_path)
        print(options)
        print(full_prompt)
        print(label)
        print(messages)
        print()
        continue
    model_answer = response[0]["generated_text"][-1]["content"]
    # record all of information for later analysis
    record[iid] = {
        "image_path": image_path,
        "question_type": question_type,
        "question": question,
        "options": options,
        "label": label,
        "model_answer": model_answer,
        "correct": model_answer == label
    }
# 保存结果
import json
with open("omnispatial_results.json", "w") as f:
    json.dump(record, f, indent=2, ensure_ascii=False)  


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 13.64it/s]
Device set to use cuda:0
  1%|          | 10/1533 [00:01<03:29,  7.27it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1533/1533 [04:14<00:00,  6.02it/s]


In [4]:
result = "/home/disheng/Spatial_Survey/omnispatial_results.json"
import json
record = json.load(open(result, "r"))
acc = 0
total = len(record)
correct ={}
wrong = {}
for id, item in record.items():
    label = item["label"]
    model_answer = item["model_answer"]
    if label in model_answer:
        acc += 1
        correct[id] = item
    else:
        wrong[id] = item
accuracy = acc / total
print(f"Accuracy: {accuracy:.2%}")
       


Accuracy: 43.70%


## VSI-Bench

In [None]:
from datasets import load_dataset

ds = load_dataset("nyu-visionx/VSI-Bench")

'No'

## SpatialRGPT-Bench

In [None]:
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("a8cheng/SpatialRGPT-Bench")