In [1]:
import os
import torch
import random
from PIL import Image
import my_prompt5 as my_prompt
from file_managing import (
    load_selected_samples,
    get_actual_path,
    get_gt_path,
)
from config import AGD20K_PATH, model_name
from VLM_model_dot import QwenVLModel, MetricsTracker
import os
# Process-wide environment settings for PyTorch, applied in one call.
# NOTE(review): these are set after `import torch` above — confirm PyTorch
# still reads them at the point where it needs them.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "PYTORCH_ENABLE_SDPA": "1",
    }
)

# Running count of samples whose ground-truth file does not exist on disk;
# incremented by the evaluation cell further down.
missing_gt = 0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


def affordance_grounding(model, action, object_name, image_path, gt_path, exo_path=None, failed_heatmap_path=None, validation_reason=None):
    """
    Run a single affordance-grounding query through the Qwen VL model.

    A status line describing the request (including whether the image and
    ground-truth files exist on disk) is printed first. The query is then
    dispatched to the model's ego-only entry point, or to its exo entry point
    when an exocentric reference image is supplied.

    Args:
        model: Object exposing `process_image_ego` and `process_image_exo`.
        action: Action name used to build the prompt and passed to the model.
        object_name: Object name used to build the prompt.
        image_path: Path of the image to process.
        gt_path: Path of the ground-truth file for that image.
        exo_path: Optional path of an exocentric reference image; when None
            the ego-only prompt and model call are used.
        failed_heatmap_path: Accepted but not used by this function.
        validation_reason: Accepted but not used by this function.

    Returns:
        Whatever the selected model method returns (callers in this notebook
        read its 'metrics' entry).
    """
    image_found = os.path.exists(image_path)
    gt_found = os.path.exists(gt_path)
    print(f"Processing image: Action: {action}, Object: {object_name}, Image path: {image_path}, GT path: {gt_path}, Image exists: {image_found}, GT exists: {gt_found}")

    # Exocentric reference supplied: use the exo prompt and exo model call.
    if exo_path is not None:
        exo_prompt = my_prompt.process_image_exo_prompt(action, object_name)
        return model.process_image_exo(image_path, exo_prompt, gt_path, exo_path, action)

    # Otherwise fall back to the ego-only prompt and model call.
    ego_prompt = my_prompt.process_image_ego_prompt(action, object_name)
    return model.process_image_ego(image_path, ego_prompt, gt_path, action)

In [3]:
    # Load the Qwen VL model named in config, plus one metrics tracker per
    # evaluation mode (ego-only vs. ego with an exocentric reference).
    model = QwenVLModel(model_name=model_name)
    metrics_tracker_ego = MetricsTracker(name="only_ego")
    metrics_tracker_exo_best = MetricsTracker(name="with_exo_best")

    # A single-argument os.path.join returns its argument unchanged, so the
    # relative file name is used directly.
    json_path = "selected_samples.json"
    data = load_selected_samples(json_path)

    # Report how many samples the JSON file provides.
    total_samples = len(data['selected_samples'])
    print(f"Processing {total_samples} samples...")
    print("=" * 50)

💻 사용 디바이스: cuda
🤖 Qwen/Qwen2.5-VL-3B-Instruct 모델 로딩중...


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.12it/s]


✅ 모델 로딩 완료!
Processing 123 samples...


In [4]:
# Locate the 'jump' / 'skis' sample; the cells below read `sample_info`
# (and `pair_key`) left bound by this loop.
for pair_key, sample_info in data["selected_samples"].items():
    if sample_info['object'] == 'skis' and sample_info['action'] == 'jump':
        print(pair_key, sample_info)
        break
else:
    # Without this guard a miss would leave `sample_info` bound to the last
    # sample iterated (or unbound for an empty dict), and the following cells
    # would silently evaluate the wrong sample.
    raise LookupError(
        "No sample with object 'skis' and action 'jump' found in selected_samples"
    )

jump_skis {'action': 'jump', 'object': 'skis', 'image_path': '${AGD20K_PATH}/Seen/testset/egocentric/jump/skis/skis_002829.jpg'}


In [6]:
# Preview the ego prompt variant that also asks for a prediction score,
# built for the sample selected above; the prompt text is the cell output.
action, object_name = sample_info["action"], sample_info["object"]
my_prompt.process_image_ego_prompt_w_pred(action, object_name)

"\n        You are given an image showing a 'skis' involved in the action 'jump'.\n\n        🎯 Your task:\n        Identify several **precise keypoints** in the image that are essential for performing the action 'jump' on the object 'skis'.\n        and rating the importance of that point\n\n        ⚠️ Important Instructions:\n        - Only return **single-point** coordinates and prediction score in the format [x, y, prediction_score]\n        - Do **not** return bounding boxes or regions\n        - All points must lie **within** the 'skis'\n        - Avoid placing multiple points too close together\n        - If there are more than one 'skis', give me point from each 'skis'\n        - ❌ Do **not** include any text, comments, or labels\n\n        ✅ Output format (strict):\n        [\n        [x1, y1,prediction_score],\n        [x2, y2,prediction_score],\n        [x3, y3,prediction_score]\n        ]\n        "

In [8]:
# NOTE(review): scratch check. map() is lazy, so this cell only displays the
# iterator's repr; wrap it in list(...) to see the converted values, or
# delete the cell before sharing the notebook.
map(int,[1,3])

<map at 0x7efccd3353d0>

In [7]:

# Resolve the selected sample's fields and its image / ground-truth paths.
action, object_name = sample_info["action"], sample_info["object"]
image_path = get_actual_path(sample_info["image_path"])
gt_path = get_gt_path(image_path)
print(f"Action : {action}, Object : {object_name} image_name : {image_path.split('/')[-1]}")

# Run the ego-only model call with the prompt variant that requests a
# prediction score per point, then show the returned dict as the cell output.
# NOTE(review): in the recorded run the model answered with 5-value rows, no
# dots were parsed, and 'metrics' came back None.
prompt = my_prompt.process_image_ego_prompt_w_pred(action, object_name)
results = model.process_image_ego(image_path, prompt, gt_path, action)
results



Action : jump, Object : skis image_name : skis_002829.jpg
qwen ego Results!! : [
    [105, 14, 210, 1406, 0.9],
    [340, 14, 455, 1406, 0.9]
]
No dot coordinates found, trying to parse as bounding boxes...
text : [
    [105, 14, 210, 1406, 0.9],
    [340, 14, 455, 1406, 0.9]
]
final points :[]
parsed dots!!! : []


{'text_result': '[\n    [105, 14, 210, 1406, 0.9],\n    [340, 14, 455, 1406, 0.9]\n]',
 'dots': [],
 'dot_image_path': '/home/bongo/porter_notebook/research/new_qwen_AG/dot_images/only_ego/skis_002829_jump.jpg',
 'dot_only_image_path': '/home/bongo/porter_notebook/research/new_qwen_AG/dot_images/dots_only/skis_002829_jump_dots.jpg',
 'heatmap_image_path': '/home/bongo/porter_notebook/research/new_qwen_AG/dot_images/heatmaps/skis_002829_jump_heatmap.jpg',
 'heatmap_tensor': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'metrics': None}

In [7]:
# Evaluate the selected sample twice: with the ego image only, then with an
# additional exocentric reference image. Each run is logged to its own
# MetricsTracker.
action = sample_info["action"]
object_name = sample_info["object"]

image_path = get_actual_path(sample_info["image_path"])
gt_path = get_gt_path(image_path)
image_name = image_path.split('/')[-1]  # computed once, reused for every metrics printout
print(f"Action : {action}, Object : {object_name} image_name : {image_name}")

# NOTE(review): hard-coded reference image. Because it is a literal, the None
# handling below cannot trigger until this becomes a real lookup.
exo_best_path = "dogs.jpg"
if exo_best_path is None:
    print(f"NO SEEN DATA SET : {action}/{object_name}")


def _evaluate_and_track(tracker, exo_path=None):
    """Run one grounding pass for the current sample and log its metrics.

    Reads the notebook-level `model`, `action`, `object_name`, `image_path`,
    `gt_path` and `image_name` set above.

    Args:
        tracker: MetricsTracker that receives the metrics of this pass.
        exo_path: Optional exocentric reference image; None means ego-only.

    Returns:
        Tuple `(results, metrics)` where `metrics` is `results['metrics']`.
    """
    results = affordance_grounding(model, action, object_name, image_path, gt_path, exo_path)
    metrics = results['metrics']
    if metrics:
        # Update and print metrics only when the pass produced some.
        tracker.update(metrics)
        tracker.print_metrics(metrics, image_name)
    return results, metrics


# Ego-only pass.
results_ego, metrics_ego = _evaluate_and_track(metrics_tracker_ego)

# Ego + exocentric reference pass. It is skipped when no reference image is
# available: affordance_grounding treats exo_path=None as an ego-only request,
# which would otherwise be recorded under the "with_exo_best" tracker.
if exo_best_path is not None:
    results_exo_best, metrics_exo_best = _evaluate_and_track(metrics_tracker_exo_best, exo_best_path)
else:
    results_exo_best, metrics_exo_best = None, None

# Count missing GT files
if not os.path.exists(gt_path):
    missing_gt += 1

print("*** End  ", "*"*150)
print("\n\n")

Action : jump, Object : skis image_name : skis_002829.jpg
Processing image: Action: jump, Object: skis, Image path: /home/DATA/AGD20K/Seen/testset/egocentric/jump/skis/skis_002829.jpg, GT path: /home/DATA/AGD20K/Seen/testset/GT/jump/skis/skis_002829.png, Image exists: True, GT exists: True
qwen ego Results!! : [
    [105, 14], 
    [105, 1368], 
    [347, 14], 
    [347, 1368]
]
parsed dots!!! : [[105, 14], [105, 1368], [347, 14], [347, 1368]]

Metrics for only_ego skis_002829.jpg:
 only_ego Current - KLD: 11.5107 | SIM: 0.0000 | NSS: -0.6241

Cumulative only_ego  Averages over 1 samples:
Average - KLD: 11.5107 | SIM: 0.0000 | NSS: -0.6241

Processing image: Action: jump, Object: skis, Image path: /home/DATA/AGD20K/Seen/testset/egocentric/jump/skis/skis_002829.jpg, GT path: /home/DATA/AGD20K/Seen/testset/GT/jump/skis/skis_002829.png, Image exists: True, GT exists: True
exo file name : dogs.jpg / exo_path

Metrics for with_exo_best skis_002829.jpg:
 with_exo_best Current - KLD: 11.5114 