In [69]:
import os, sys
import json
import numpy as np
import pandas as pd
import re

EPSILON = 1e-8

In [23]:
val_path = 'datasets/PhysicalAI-Spatial-Intelligence-Warehouse/val.json'
with open(val_path, 'r') as f:
    val = json.load(f)
df_val = pd.DataFrame(val)
print(f"Length val prediction: {len(val)}")
print(f"Columns: {val[0].keys()}")
df_val.head()

Length val prediction: 1942
Columns: dict_keys(['id', 'image', 'conversations', 'rle', 'category', 'normalized_answer', 'freeform_answer'])


Unnamed: 0,id,image,conversations,rle,category,normalized_answer,freeform_answer
0,aff5479b81c95b0194f58dbaaa041332,000315.png,"[{'from': 'human', 'value': 'From this viewpoi...","[{'size': [1080, 1920], 'counts': 'bngl081MYQ1...",left_right,left,The pallet [Region 0] is to the left of the pa...
1,d05fc8c61137b99b02b70625b5eb0eae,001505.png,"[{'from': 'human', 'value': '<image> Using the...","[{'size': [1080, 1920], 'counts': 'kgkY12fQ13M...",count,3,The shelf [Region 14] is the shelf on the righ...
2,343662060cf0598e66985d6160c93113,000705.png,"[{'from': 'human', 'value': '<image> Using the...","[{'size': [1080, 1920], 'counts': '_oRi01fQ11O...",count,2,"From the image's perspective, the buffer regio..."
3,a47efce6e660965efb52ca6986259d7d,001344.png,"[{'from': 'human', 'value': '<image> Consideri...","[{'size': [1080, 1920], 'counts': 'flY\16`Q14K...",distance,13.54,[Region 9] is the leftmost pallet from this vi...
4,cb526bbab4836632b87da0f0fa92c610,001021.png,"[{'from': 'human', 'value': 'From the current ...","[{'size': [1080, 1920], 'counts': ']SfQ13j1;cm...",mcq,1,The pallet [Region 1] is the leftmost object f...


In [7]:
val_pred_path = 'datasets/PhysicalAI-Spatial-Intelligence-Warehouse/outputs/fine-tuned_evaluation_details.json'
with open(val_pred_path, 'r') as f:
    val_pred = json.load(f)
print(f"Length val prediction: {len(val_pred)}")
print(f"Columns: {val_pred[0].keys()}")

Length val prediction: 1942
Columns: dict_keys(['id', 'category', 'question', 'model_answer', 'parsed_answer', 'ground_truth', 'is_correct'])


In [63]:
# check category
df_val['category'].to_list() == df_val_pred['category'].to_list()

True

In [87]:
# --- Your Custom Parsers ---
str_to_int = {'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10}

def parse_distance_output(text: str):
    match = re.search(r"(\d+\.\d+|\d+)\s*(?:meters?|m|meter)\b", text, re.IGNORECASE)
    if match:
        try: 
            return float(match.group(1))
        except (ValueError, IndexError):
            print(f"Not a number")
            return 5.
    return 5. # Return 0.0 if not found, as per your implementation

def parse_count_output(text: str):
    pattern = r'\b(one|two|three|four|five|six|seven|eight|nine|ten|\d+)\s+(?:pallets?|pallet?|buffers?|buffer?|transporters?|boxes?)\b'
    matches = re.findall(pattern, text, re.IGNORECASE)
    if not matches:
        return 3 # most frequency
    count_str = matches[0].lower()
    if count_str in str_to_int:
        return str_to_int[count_str]
    try:
        return int(count_str)
    except ValueError:
        return 3

def parse_mcq_output(text: str):
    match = re.search(r'\[Region (\d+)\]\s+is\s+(?:nearest|the nearest|the leftmost|the shortest|the closest|the rightmost)', text)
    if match:
        return int(match.group(1))
    return 2 # Return 0 as per your implementation

def parse_left_right_output(text: str):
    text_lower = text.lower()
    is_negated = "not" in text_lower or "incorrect" in text_lower
    is_left, is_right = "left" in text_lower, "right" in text_lower
    if is_left and not is_right:
        return "right" if is_negated else "left"
    if is_right and not is_left:
        return "left" if is_negated else "right"
    return None

def get_parsed_answer(model_freeform_text: str, question_category: str):
    """Dispatcher function to call the correct parser based on category."""
    parser_map = {
        "distance": parse_distance_output,
        "count": parse_count_output,
        "mcq": parse_mcq_output,
        "left_right": parse_left_right_output,
    }
    parser_func = parser_map.get(question_category)
    if parser_func:
        return parser_func(model_freeform_text)
    return None

In [10]:
df_val_pred = pd.DataFrame(val_pred)
df_val_pred.head()

Unnamed: 0,id,category,question,model_answer,parsed_answer,ground_truth,is_correct
0,aff5479b81c95b0194f58dbaaa041332,left_right,"<image>\nFrom this viewpoint, does the pallet ...",The pallet [Region 0] is positioned to the lef...,left,left,True
1,d05fc8c61137b99b02b70625b5eb0eae,count,<image>\nUsing the buffer masks <mask> <depth>...,The shelf [Region 14] is the shelf on the righ...,14,3,False
2,343662060cf0598e66985d6160c93113,count,<image>\nUsing the buffer masks <mask> <depth>...,The buffer region [Region 2] is the leftmost b...,2,2,True
3,a47efce6e660965efb52ca6986259d7d,distance,<image>\nConsidering the pallets <mask> <depth...,[Region 9] is the leftmost pallet among all th...,13.6,13.54,True
4,cb526bbab4836632b87da0f0fa92c610,mcq,"<image>\nFrom the current viewing position, wh...",The pallet [Region 1] is the leftmost object a...,1,1,True


In [16]:
df_val_pred_lr = df_val_pred[df_val_pred['category'] == 'left_right']
print(len(df_val_pred_lr))
df_val_pred_lr.head()

500


Unnamed: 0,id,category,question,model_answer,parsed_answer,ground_truth,is_correct
0,aff5479b81c95b0194f58dbaaa041332,left_right,"<image>\nFrom this viewpoint, does the pallet ...",The pallet [Region 0] is positioned to the lef...,left,left,True
9,f20376002f9a9ce347d5caedd84a24fc,left_right,"<image>\nLooking from this perspective, is the...",The pallet [Region 0] is positioned to the rig...,right,right,True
10,7f0251e424358f13865f42951e264b74,left_right,<image>\nCan you determine if the pallet <mask...,The pallet [Region 0] is positioned to the lef...,left,left,True
15,a0d9809acaadc072a9208daf4e66691c,left_right,"<image>\nFrom the image's perspective, is the ...",The pallet [Region 0] is positioned to the lef...,left,left,True
16,eae026c6488f7400443b301040e48ade,left_right,"<image>\nFrom the image's perspective, is the ...",The pallet [Region 0] is positioned to the lef...,left,left,True


## left_right

In [58]:
df_val_pred_lr = df_val_pred[df_val_pred['category'] == 'left_right']
df_val_lr = df_val[df_val['category']=='left_right']
lr_pred = []
lr_gt = df_val_lr['normalized_answer']
count = 0
for i in range(len(df_val_pred_lr)):
    answer = df_val_pred_lr.iloc[i]['model_answer']
    category = df_val_pred_lr.iloc[i]['category']
    parsed_answer = get_parsed_answer(answer, category)
    lr_pred.append(parsed_answer)
    if parsed_answer == df_val_lr.iloc[i]['normalized_answer']:
        count += 1
    else: 
        print(f"Q: {df_val_lr.iloc[i]['conversations'][0]['value']}")
        print(f"A:   {df_val_lr.iloc[i]['conversations'][1]['value']}")
        print(f"P_A: {df_val_pred_lr.iloc[i]['model_answer']}")
        print(f"N_A:   {df_val_lr.iloc[i]['normalized_answer']}")
        print(f"N_A_P: {df_val_pred_lr.iloc[i]['parsed_answer']}")
#     lr_pred.append(parsed_answer)
acc = sum([pred == gt for pred, gt in zip(lr_pred, lr_gt)]) / len(df_val_lr)
print(f"Acc: {acc}")
print(f"Accuracy: {count/len(df_val_lr):.5f}%")

Q: Looking from this perspective, is the pallet <mask> to the right of the pallet <mask>?
A:   From this viewpoint, the pallet [Region 0] is on the left of the pallet [Region 1].
P_A: The pallet [Region 0] is positioned to the right of the pallet [Region 1].
N_A:   left
N_A_P: right
Acc: 0.998
Accuracy: 0.99800%


## Distance

In [98]:
df_val_pred_distance = df_val_pred[df_val_pred['category'] == 'distance']
df_val_distance = df_val[df_val['category']=='distance']
pred = []
gt = df_val_distance['normalized_answer']
error_rates = []
count = 0
for i in range(len(df_val_pred_distance)):
    if df_val_distance.iloc[i]['id'] != df_val_pred_distance.iloc[i]['id']:
        print(f"Error")
    else:
        answer = df_val_pred_distance.iloc[i]['model_answer']
        category = df_val_pred_distance.iloc[i]['category']
        
        parsed_answer = get_parsed_answer(answer, category)
        gt_answer = df_val_distance.iloc[i]['normalized_answer']
        lr_pred.append(parsed_answer)

        success = (parsed_answer <= (1.10 * gt_answer)) and (
                    parsed_answer >= (0.90 * gt_answer)
                )
        error_rate = (np.abs(parsed_answer - gt_answer)) / (gt_answer + EPSILON)
        error_rates.append(error_rate)
        
        if success:
            count += 1
        else: 
            
            print(f"Q: {df_val_distance.iloc[i]['conversations'][0]['value']}")
            print(f"A:   {df_val_distance.iloc[i]['conversations'][1]['value']}")
            print(f"P_A: {df_val_pred_distance.iloc[i]['model_answer']}")
            print(f"N_A:   {gt_answer}")
            print(f"N_A_P: {parsed_answer}")
            print(f"ER: {error_rate:.2f}")
    #     lr_pred.append(parsed_answer)

print(f"Accuracy: {count/len(df_val_distance):.5f}% - Error rate: {sum(error_rates)/len(df_val_distance):.5f}")

Q: Give me the distance from the pallet <mask> to the pallet <mask>.
A:   The pallet [Region 0] is 8.66 meters away from the pallet [Region 1].
P_A: The pallet [Region 0] and the pallet [Region 1] are 7.43 meters apart.
N_A:   8.66
N_A_P: 7.43
ER: 0.14
Q: <image>
How close is the pallet <mask> from the pallet <mask>?
A:   The pallet [Region 0] and the pallet [Region 1] are 10.86 meters apart from each other.
P_A: The pallet [Region 0] and the pallet [Region 1] are 9.13 meters apart.
N_A:   10.86
N_A_P: 9.13
ER: 0.16
Q: <image>
Considering the buffer regions <mask> <mask> <mask> and the pallets <mask> <mask> <mask> <mask> <mask> <mask>, how much distance is there between the rightmost pallet and the leftmost buffer region?
A:   From this viewpoint, [Region 8] is the rightmost pallet. [Region 0] is the leftmost buffer region from this viewpoint. The distance from the pallet [Region 8] to the buffer region [Region 0] is 2.49 meters.
P_A: [Region 7] is the rightmost pallet among all the pa

## count

In [100]:
df_val_pred_count = df_val_pred[df_val_pred['category'] == 'count']
df_val_count = df_val[df_val['category']=='count']
pred = []
gt = df_val_distance['normalized_answer']
error_rates = []
count = 0
for i in range(len(df_val_count)):
    if df_val_count.iloc[i]['id'] != df_val_pred_count.iloc[i]['id']:
        print(f"------- > Error <-------")
    else:
        answer = df_val_pred_count.iloc[i]['model_answer']
        category = df_val_pred_count.iloc[i]['category']
        
        parsed_answer = get_parsed_answer(answer, category)
        gt_answer = df_val_count.iloc[i]['normalized_answer']
        lr_pred.append(parsed_answer)

        success = (parsed_answer <= (1.10 * gt_answer)) and (
                    parsed_answer >= (0.90 * gt_answer)
                )
        error_rate = (np.abs(parsed_answer - gt_answer)) / (gt_answer + EPSILON)
        error_rates.append(error_rate)
        
        if success:
            count += 1
        else: 
            
            print(f"Q: {df_val_count.iloc[i]['conversations'][0]['value']}")
            print(f"A:   {df_val_count.iloc[i]['conversations'][1]['value']}")
            print(f"P_A: {df_val_pred_count.iloc[i]['model_answer']}")
            print(f"N_A:   {gt_answer} - N_A_P: {parsed_answer} - ER: {error_rate:.5f}")

print(f"Accuracy: {count/len(df_val_count):.5f}% - Error rate: {sum(error_rates)/len(df_val_count) * 100:.5f}%")

Q: <image>
Using the buffer masks <mask> <mask> <mask> and pallet masks <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, how many pallets are situated in the leftmost buffer zone?
A:   The buffer region [Region 0] is the leftmost buffer region among all the buffer regions. You can find pallets [Region 3] [Region 6] [Region 8] [Region 12] inside the buffer region [Region 0]. Therefore, the buffer region [Region 0] holds a total of four pallets.
P_A: The buffer region [Region 0] is the leftmost buffer region among all the buffer regions. The buffer region [Region 0] contains pallets [Region 3] [Region 6] [Region 12]. Therefore, the buffer region [Region 0] holds a total of three pallets.
N_A:   4 - N_A_P: 3 - ER: 0.25000
Q: <image>
Among the given buffer masks <mask> <mask> <mask> and pallet masks <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, how many pallets are stored in the buffer region closest to the shelf on the left among <mask> <mask

## mcq

In [96]:
df_val_pred_mcq = df_val_pred[df_val_pred['category'] == 'mcq']
df_val_mcq = df_val[df_val['category']=='mcq']
pred = []
gt = df_val_mcq['normalized_answer']
count = 0
wrong_ids = 0
for i in range(len(df_val_mcq)):
    if df_val_count.iloc[i]['id'] != df_val_pred_count.iloc[i]['id']:
        wrong_ids += 1
        print(f"------- > Error <-------")
    else:
        answer = df_val_pred_mcq.iloc[i]['model_answer']
        category = df_val_pred_mcq.iloc[i]['category']
        
        parsed_answer = get_parsed_answer(answer, category)
        gt_answer = int(df_val_mcq.iloc[i]['normalized_answer'])
        
        lr_pred.append(parsed_answer)
        
        if parsed_answer == gt_answer:
            count += 1
        else: 
            print(f"Q: {df_val_mcq.iloc[i]['conversations'][0]['value']}")
            print(f"A:   {df_val_mcq.iloc[i]['conversations'][1]['value']}")
            print(f"P_A: {df_val_pred_mcq.iloc[i]['model_answer']}")
            print(f"N_A:   {gt_answer} - N_A_P: {parsed_answer}")

print(f"Accuracy: {count/len(df_val_mcq):.5f}%")
print(f"Wrong ids: {wrong_ids}")

Q: <image>
Considering the transporters <mask> <mask> <mask> and the pallets <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, which pallet is the optimal choice for an empty transporter to pick up?
A:   Among all the transporters, the transporter [Region 1] is empty. The pallet [Region 4] is the closest to the transporter [Region 1], so it is the most suitable choice to pick up.
P_A: The transporter [Region 1] is not carrying any boxes. The pallet [Region 3] is the closest to transporter [Region 1], so it is the most suitable choice for automated picking.
N_A:   4 - N_A_P: 3
Q: <image>
Considering the transporters <mask> <mask> <mask> and the pallets <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>, which pallet is the optimal choice for an empty transporter to pick up?
A:   At the moment, the transporter [Region 1] is not loaded with any boxes. Given that pallet [Region 3] is the closest to transporter [Region 1], it is the most logical choice for automate

In [84]:
val_pred[0]

{'id': 'aff5479b81c95b0194f58dbaaa041332',
 'category': 'left_right',
 'question': '<image>\nFrom this viewpoint, does the pallet <mask> <depth> appear on the right-hand side of the pallet <mask> <depth>?',
 'model_answer': 'The pallet [Region 0] is positioned to the left of the pallet [Region 1].',
 'parsed_answer': 'left',
 'ground_truth': 'left',
 'is_correct': True}

## Overral

In [90]:
prediction_val = []
for sample in val_pred:
    answer = sample['model_answer']
    category = sample['category']
    parsed_answer = get_parsed_answer(answer, category)

    prediction_val.append({
        'id': sample['id'],
        'normalized_answer': parsed_answer
    })
save_file = "datasets/PhysicalAI-Spatial-Intelligence-Warehouse/outputs/submission_val.json"
with open(save_file, 'w') as f:
    json.dump(prediction_val, f, indent=2)

In [97]:
%%bash
# sanity check with perfect answer
python datasets/PhysicalAI-Spatial-Intelligence-Warehouse/utils/compute_scores.py \
    --gt_path 'datasets/PhysicalAI-Spatial-Intelligence-Warehouse/val.json' \
    --pred_path "datasets/PhysicalAI-Spatial-Intelligence-Warehouse/outputs/submission_val.json"


===== EVALUATION RESULTS =====

QUANTITATIVE RESULTS:
Count (500): 462/500 = 92.40%
  Abs Rel = 0.049
  Error Rate = 4.93%
Distance (486): 273/486 = 56.17%
  Abs Rel = 16948559.815
  Error Rate = 1694855981.52%

QUALITATIVE RESULTS:
left_right (500): 499/500 = 99.80%
mcq (456): 377/456 = 82.68%

===== OVERALL SUMMARY =====
Count (weighted): 92.40% * 0.25 = 23.10
Distance (weighted): 56.17% * 0.25 = 14.04
left_right (weighted): 99.80% * 0.25 = 24.95
mcq (weighted): 82.68% * 0.25 = 20.67

Final Weighted Score: 82.76%
Quantitative: 735/986 = 74.54%
Qualitative: 876/956 = 91.63%
Overall: 1611/1942 = 82.96%

Saved summary results to: datasets/PhysicalAI-Spatial-Intelligence-Warehouse/predictions/score_20250611_114058.json
Saved full results to: datasets/PhysicalAI-Spatial-Intelligence-Warehouse/predictions/full_results_20250611_114058.json


## Get submission for test

In [104]:
test_path = 'datasets/PhysicalAI-Spatial-Intelligence-Warehouse/test.json'
with open(test_path, 'r') as f:
    test = json.load(f)
len(test)

19341

In [103]:
test_pred_path = 'datasets/PhysicalAI-Spatial-Intelligence-Warehouse/outputs/evaluation_test.json'
with open(test_pred_path, 'r') as f:
    test_pred = json.load(f)
len(test_pred)

19341

In [114]:
prediction_test = []
for sample in test_pred:
    answer = sample['model_answer']
    category = sample['category_pred']
    parsed_answer = get_parsed_answer(answer, category)

    prediction_test.append({
        'id': sample['id'],
        'normalized_answer': parsed_answer
    })
print(f"Length of submisstion: {len(prediction_test)}")
save_file = "datasets/PhysicalAI-Spatial-Intelligence-Warehouse/outputs/predictions.json"
with open(save_file, 'w') as f:
    json.dump(prediction_test, f, indent=2)

Length of submisstion: 19341


In [None]:
with open()