In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import json
from PIL import Image
# Build the image-text-to-text pipeline once; later cells reuse `pipe`.
pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")

# Single-turn demo chat: one remote image followed by a question about it.
_demo_image = {
    "type": "image",
    "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
}
_demo_question = {"type": "text", "text": "What animal is on the candy?"}
messages = [{"role": "user", "content": [_demo_image, _demo_question]}]

pipe(text=messages)

def load_and_resize_image(image_path, max_size=1000):
    """
    Load an image as RGB and shrink it so its longest edge fits ``max_size``.

    Images whose longest edge is already within ``max_size`` keep their
    original resolution; larger images are downscaled with LANCZOS resampling
    while preserving the aspect ratio.

    Args:
        image_path (str): Path of the image file to load.
        max_size (int): Upper bound, in pixels, for the longest edge.

    Returns:
        PIL.Image.Image: The RGB image, downscaled when necessary.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    # Imported locally so the helper does not depend on another cell having
    # imported ``os`` beforehand.
    import os

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"❌ Image not found: {image_path}")

    # convert() returns a fully loaded copy, so the source file can be closed.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    width, height = img.size
    longest_edge = max(width, height)
    if longest_edge > max_size:
        scale = max_size / longest_edge
        # Clamp to 1 px: int() truncation would otherwise yield a zero-sized
        # edge for extreme aspect ratios, which is not a valid target size.
        new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    return img

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 19.53it/s]
Device set to use cuda:0


## EgoOrientBench

In [3]:
import json
import os

from PIL import Image
from tqdm.auto import tqdm

# Location of the EgoOrientBench images and of its annotation file.
image_base_path = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/"
json_file = "/home/disheng/Spatial_Survey/Datasets/EgoOrientBench/benchmark.json"

# Parse the whole benchmark up front; `data` is passed to evaluate_spaceom.
with open(json_file, mode="r") as benchmark_file:
    data = json.load(benchmark_file)

def evaluate_spaceom(pipe, data):
    """
    Run the pipeline over every benchmark item and score its answers.

    Each item's image is loaded (downscaled to at most 448 px on its longest
    edge) and sent together with the item's question as a single-turn chat.
    An answer counts as correct when the item's label occurs as a substring
    of the model's reply.

    Args:
        pipe: Callable pipeline invoked as ``pipe(text=messages)``.
        data: Sequence of dicts with the keys ``"image"`` (path relative to
            ``image_base_path``), ``"question"`` and ``"label"``.

    Returns:
        list[dict]: One record per processed item with the keys ``"image"``,
        ``"question"``, ``"answer"``, ``"label"`` and ``"Accuracy"``.

    Side effects:
        Writes the records to ``EgoOrientBench_spaceom_results.json`` and
        prints the accuracy (relative to ``len(data)``) and the item count.
    """
    total = len(data)
    results = []
    try:
        for item in tqdm(data):
            image_path = f"{image_base_path}/{item['image']}"
            img = load_and_resize_image(image_path, max_size=448)

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "image", "image": img},
                        {"type": "text", "text": item["question"]}
                    ]
                },
            ]
            response = pipe(text=messages)
            # The last chat entry is the newly generated assistant turn.
            model_answer = response[0]["generated_text"][-1]["content"].strip()
            results.append({
                "image": item["image"],
                "question": item["question"],
                "answer": model_answer,
                "label": item["label"],
                "Accuracy": item["label"] in model_answer
            })
    finally:
        # Saved even when inference aborts midway (e.g. a CUDA device-side
        # assert), so the answers collected so far are not lost.
        with open("EgoOrientBench_spaceom_results.json", "w") as f:
            json.dump(results, f, indent=4)

    correct = sum(1 for r in results if r["Accuracy"])
    # Guard against an empty benchmark instead of dividing by zero.
    accuracy = correct / total * 100 if total else 0.0
    print(f"Accuracy: {accuracy:.2f}%")
    print(f"Total: {total}")
    return results

In [4]:
# Run the full EgoOrientBench evaluation with the pipeline and data loaded above.
evaluate_spaceom(pipe=pipe, data=data)

  0%|          | 9/33460 [00:01<1:45:24,  5.29it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
  7%|▋         | 2260/33460 [06:17<1:24:38,  6.14it/s]../aten/src/ATen/native/cuda/TensorCompare.cu:110: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.
  7%|▋         | 2260/33460 [06:17<1:26:56,  5.98it/s]


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## GeoMeter

In [2]:
# %%
# 🚀 1) 环境准备
from pathlib import Path
from PIL import Image
import json
import pandas as pd
from tqdm.auto import tqdm
from transformers import pipeline
import re

# %%
# ⚙️ 2) Configuration
IMAGE_BASE   = Path("/home/disheng/Spatial_Survey/Datasets/GeoMeter/Real/")
JSONL_FILE   = "/home/disheng/Spatial_Survey/Datasets/GeoMeter/Real/depth_height_1000_realworld.jsonl"
MODEL_NAME   = "remyxai/SpaceOm"
TASK         = "image-text-to-text"
DEVICE_ID    = 2      # CUDA device index; use 0 on a single-GPU machine
BATCH_SIZE   = 1      # tune to the available GPU memory

# %%
# 📖 3) Load the data (one JSON object per line)
REQUIRED_KEYS = ("images", "query_text", "target_text")

data = []
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open(JSONL_FILE, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        item = json.loads(line)
        missing = [key for key in REQUIRED_KEYS if key not in item]
        assert not missing, f"{JSONL_FILE}:{line_no} is missing keys: {missing}"
        data.append(item)

# %%
# 🔧 4) Initialise the pipeline
pipe = pipeline(
    TASK,
    model=MODEL_NAME,
    device=DEVICE_ID,
    batch_size=BATCH_SIZE,
)

# %%
# 🏃 5) Inference with a progress bar
results = []
for item in tqdm(data, desc="Inference"):
    # convert() returns a fully loaded copy, so the source file can be closed.
    with Image.open(IMAGE_BASE / item["images"][0]) as source:
        img = source.convert("RGB")

    # Each sample is sent as its own single-turn conversation. Collecting the
    # user messages of several samples in one list would be read as the turns
    # of ONE chat, so samples beyond the first would silently get no answer.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text",  "text":  item["query_text"]}
        ]
    }]

    outputs = pipe(text=messages)
    out = outputs[0]

    # `generated_text` holds the chat history; pick the assistant entry.
    gen_list = out.get("generated_text", [])
    assistant_entry = next(
        (entry for entry in gen_list if entry.get("role") == "assistant"),
        None
    )
    if assistant_entry is None:
        # Fallback when no assistant entry is found: keep the raw output.
        raw_text = str(gen_list)
    else:
        raw_text = assistant_entry.get("content", "")

    # Strip whitespace first, then trailing commas/periods; the other order
    # leaves the punctuation in place whenever whitespace follows it.
    pred = re.sub(r"[，,\.。]+$", "", raw_text.strip()).strip()

    results.append({
        "image": item["images"][0],
        "query": item["query_text"],
        "pred":  pred,
        "gold":  item["target_text"]
    })

# %%
# 📊 6) Score the predictions (exact string match) and save them
df = pd.DataFrame(results)
exact_match = df["pred"].eq(df["gold"])
accuracy = exact_match.mean()
print(f"▶️ Overall Accuracy: {accuracy:.2%}")

# Persist the per-sample predictions both as JSON records and as CSV.
df.to_json(
    "geobench_real_results_with_predictions.json",
    orient="records",
    indent=2,
    force_ascii=False,
)
df.to_csv("geobench_real_results_with_predictions.csv", index=False)
print("✅ 结果已保存到 geobench_real_results_with_predictions.{json,csv}")

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 18.14it/s]
Device set to use cuda:2
Inference:  10%|█         | 10/100 [00:02<00:19,  4.52it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Inference: 100%|██████████| 100/100 [00:18<00:00,  5.48it/s]

▶️ Overall Accuracy: 43.00%
✅ 结果已保存到 geobench_real_results_with_predictions.{json,csv}





## OmniSpatial

In [None]:
# Fetch the OmniSpatial dataset into ./dataset and unpack its archives.
# (Disabled) alternative Hugging Face endpoint; would have to be exported in
# the shell that runs the download:
# export HF_ENDPOINT="https://hf-mirror.com"
# Create the target directory if it does not exist yet.
!mkdir -p dataset
# Download the `qizekun/OmniSpatial` dataset repository into ./dataset.
!huggingface-cli download --resume-download qizekun/OmniSpatial --local-dir dataset --repo-type dataset
# Extract every zip found under dataset/ into dataset/, overwriting existing files (-o).
!find dataset/ -name '*.zip' -exec unzip -o {} -d dataset/ \;
# Remove the archives directly under dataset/ and the __MACOSX folder.
!rm -f dataset/*.zip && rm -rf dataset/__MACOSX

In [4]:
# `!export ...` only sets the variable inside a throw-away subshell, so the
# kernel process never sees it. Set it on the kernel's own environment
# instead; this has to run before CUDA is initialised to take effect.
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "3"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [2]:
from PIL import Image
import os

def load_and_resize_image(image_path, max_size=448):
    """
    Load an image as RGB and shrink it so its longest edge fits ``max_size``.

    Images whose longest edge is already within ``max_size`` keep their
    original resolution; larger images are downscaled with LANCZOS resampling
    while preserving the aspect ratio.

    Args:
        image_path (str): Path of the image file to load.
        max_size (int): Upper bound, in pixels, for the longest edge.

    Returns:
        PIL.Image.Image: The RGB image, downscaled when necessary.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"❌ Image not found: {image_path}")

    # convert() returns a fully loaded copy, so the source file can be closed.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    width, height = img.size
    longest_edge = max(width, height)
    if longest_edge > max_size:
        scale = max_size / longest_edge
        # Clamp to 1 px: int() truncation would otherwise yield a zero-sized
        # edge for extreme aspect ratios, which is not a valid target size.
        new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
        img = img.resize(new_size, Image.Resampling.LANCZOS)

    return img

In [3]:
import json

from PIL import Image
from tqdm.auto import tqdm
from transformers import pipeline

# Task-type index -> name mapping (not referenced elsewhere in this cell).
question_types =  {0:"Dynamic_Reasoning",
                   1:"Spatial_Interaction",
                   2:"Complex_Logic",
                   3:"Perspective_Taking"}
annotation = "/home/disheng/Spatial_Survey/Datasets/OmniSpatial/dataset/data.json"
# Use a context manager so the annotation file is closed after parsing.
with open(annotation, "r") as f:
    annotation_data = json.load(f)

pipe = pipeline("image-text-to-text", model="remyxai/SpaceOm")

# Maps each item id to everything needed for later analysis.
record = {}
for item in tqdm(annotation_data):
    iid = item["id"]
    # The image file name is the part of the id before the first underscore.
    image_id = iid.split("_")[0]
    question_type = item["task_type"]

    question = item["question"]
    options = item["options"]
    # Joined outside the f-string: reusing the enclosing quote type inside an
    # f-string expression is a SyntaxError before Python 3.12.
    options_text = "; ".join(options)
    # NOTE(review): "retuen" is a typo for "return"; left untouched because
    # changing the prompt would make new results incomparable with saved ones.
    full_prompt = f"Question: {question}\nOptions: {options_text}. Please only retuen a correct option without analysis."
    # When options are present, the stored answer is an index into them.
    label = item["answer"]
    if len(options) != 0:
        label = options[label]
    image_path = f"/home/disheng/Spatial_Survey/Datasets/OmniSpatial/dataset/{question_type}/{image_id}.png"
    img = load_and_resize_image(image_path, max_size=448)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": full_prompt}
            ]
        },
    ]
    try:
        response = pipe(text=messages)
    except Exception as e:
        # Report the failing sample in full and move on; it is left out of `record`.
        print(f"Error processing {iid}: {e}")
        print(image_path)
        print(options)
        print(full_prompt)
        print(label)
        print(messages)
        print()
        continue
    # The last chat entry is the newly generated assistant turn.
    model_answer = response[0]["generated_text"][-1]["content"]
    record[iid] = {
        "image_path": image_path,
        "question_type": question_type,
        "question": question,
        "options": options,
        "label": label,
        "model_answer": model_answer,
        "correct": model_answer == label
    }

# Save the results.
with open("omnispatial_results.json", "w") as f:
    json.dump(record, f, indent=2, ensure_ascii=False)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 13.64it/s]
Device set to use cuda:0
  1%|          | 10/1533 [00:01<03:29,  7.27it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1533/1533 [04:14<00:00,  6.02it/s]


In [4]:
import json

result = "/home/disheng/Spatial_Survey/omnispatial_results.json"
# Use a context manager so the results file is closed after parsing.
with open(result, "r") as f:
    record = json.load(f)

acc = 0
total = len(record)
# Items are split by outcome so they can be inspected afterwards.
correct = {}
wrong = {}
# `item_id` rather than `id`, which would shadow the builtin.
for item_id, item in record.items():
    label = item["label"]
    model_answer = item["model_answer"]
    # Lenient scoring: the label only has to appear somewhere in the answer.
    if label in model_answer:
        acc += 1
        correct[item_id] = item
    else:
        wrong[item_id] = item

# Guard against an empty results file instead of dividing by zero.
accuracy = acc / total if total else 0.0
print(f"Accuracy: {accuracy:.2%}")
       


Accuracy: 43.70%


## VSI-Bench

In [None]:
from datasets import load_dataset

# Load the `nyu-visionx/VSI-Bench` dataset via `datasets.load_dataset`.
ds = load_dataset(path="nyu-visionx/VSI-Bench")

'No'

## SpatialRGPT-Bench

In [None]:
from datasets import load_dataset

# Accessing this dataset requires being logged in, e.g. via `huggingface-cli login`.
ds = load_dataset(path="a8cheng/SpatialRGPT-Bench")