In [1]:
import os

# Export the runtime settings before `torch` (and the project modules that are
# imported below) are loaded, so the values are already in the environment
# whenever the CUDA caching allocator reads its configuration.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
# NOTE(review): could not find this variable in the PyTorch documentation —
# confirm that something in the stack actually consumes it.
os.environ["PYTORCH_ENABLE_SDPA"] = "1"

import torch
import random
from PIL import Image
import my_prompt5 as my_prompt
from file_managing import (
    load_selected_samples,
    get_actual_path,
    get_gt_path,
)
from config import AGD20K_PATH, model_name
from VLM_model_dot import QwenVLModel, MetricsTracker

# Counter initialised to zero here; nothing in this cell updates it.
missing_gt = 0

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


def affordance_grounding(model, action, object_name, image_path, gt_path, exo_path=None,  failed_heatmap_path=None, validation_reason=None):
    """
    Run a single affordance-grounding query and return the model's result.

    Args:
        model: Object exposing ``process_image_ego`` and ``process_image_exo``.
        action: Action name inserted into the prompt and forwarded to the model.
        object_name: Object name inserted into the prompt.
        image_path: Path of the egocentric image to query.
        gt_path: Path of the ground-truth file forwarded to the model.
        exo_path: Optional exocentric reference image. When given, the
            exocentric prompt and ``process_image_exo`` are used; otherwise
            the egocentric prompt and ``process_image_ego`` are used.
        failed_heatmap_path: Accepted for interface compatibility; unused here.
        validation_reason: Accepted for interface compatibility; unused here.

    Returns:
        Whatever the selected ``model.process_image_*`` method returns.
    """
    # Log the inputs together with whether both files are present on disk.
    print(f"Processing image: Action: {action}, Object: {object_name}, Image path: {image_path}, GT path: {gt_path}, Image exists: {os.path.exists(image_path)}, GT exists: {os.path.exists(gt_path)}")

    # Exocentric branch first: a reference image was supplied.
    if exo_path is not None:
        exo_prompt = my_prompt.process_image_exo_prompt(action, object_name)
        return model.process_image_exo(image_path, exo_prompt, gt_path, exo_path, action)

    # Default: egocentric image only.
    ego_prompt = my_prompt.process_image_ego_prompt(action, object_name)
    return model.process_image_ego(image_path, ego_prompt, gt_path, action)

In [3]:
    # Load the Qwen VL model named in config and create one metrics tracker
    # per evaluation setting (egocentric only / with exocentric reference).
    model = QwenVLModel(model_name = model_name)
    metrics_tracker_ego = MetricsTracker(name="only_ego")
    metrics_tracker_exo_best = MetricsTracker(name="with_exo_best")

    # os.path.join with a single argument returns it unchanged, so this is a
    # path relative to the kernel's current working directory.
    json_path = os.path.join("selected_samples.json")
    data = load_selected_samples(json_path)

    # Number of entries under the 'selected_samples' key
    total_samples = len(data['selected_samples'])
    
    # Announce the sample count; this cell does not process any sample itself.
    print(f"Processing {total_samples} samples...")
    print("=" * 50)    

💻 사용 디바이스: cuda
🤖 Qwen/Qwen2.5-VL-32B-Instruct 모델 로딩중...


Loading checkpoint shards: 100%|██████████| 18/18 [00:08<00:00,  2.16it/s]

✅ 모델 로딩 완료!
Processing 123 samples...





In [57]:
# Select the first sample whose object is 'skis' and action is 'jump'.
# `pair_key` and `sample_info` stay bound after the loop and are read by the
# cells that follow.
for pair_key, sample_info in data["selected_samples"].items():
    if sample_info['object'] == 'skis' and sample_info['action'] == 'jump':
        print(pair_key, sample_info)
        break
else:
    # Without this guard a missing match would leave `sample_info` pointing at
    # the last sample iterated, and later cells would silently analyse it.
    raise LookupError("No sample with action='jump' and object='skis' in data['selected_samples']")

jump_skis {'action': 'jump', 'object': 'skis', 'image_path': '${AGD20K_PATH}/Seen/testset/egocentric/jump/skis/skis_002829.jpg'}


In [6]:
# Read the action/object pair from `sample_info`, which must already be bound
# in the kernel by an earlier cell.
action = sample_info["action"]
object_name = sample_info["object"]
# Bare last expression: the notebook displays the generated prompt string.
my_prompt.process_image_ego_prompt_w_pred(action, object_name)

"\n        You are given an image showing a 'skis' involved in the action 'jump'.\n\n        🎯 Your task:\n        Identify several **precise keypoints** in the image that are essential for performing the action 'jump' on the object 'skis'.\n        and rating the importance of that point\n\n        ⚠️ Important Instructions:\n        - Only return **single-point** coordinates and prediction score in the format [x, y, prediction_score]\n        - Do **not** return bounding boxes or regions\n        - All points must lie **within** the 'skis'\n        - Avoid placing multiple points too close together\n        - If there are more than one 'skis', give me point from each 'skis'\n        - ❌ Do **not** include any text, comments, or labels\n\n        ✅ Output format (strict):\n        [\n        [x1, y1,prediction_score],\n        [x2, y2,prediction_score],\n        [x3, y3,prediction_score]\n        ]\n        "

In [7]:

# Egocentric-only run with the "with prediction score" prompt for the sample
# currently held in `sample_info`.
action = sample_info["action"]
object_name = sample_info["object"]

# The stored image path carries a `${AGD20K_PATH}` placeholder; presumably
# get_actual_path expands it to a real path — verify in file_managing.
image_path = get_actual_path(sample_info["image_path"])
gt_path = get_gt_path(image_path)    
# Last '/'-separated component of the path is printed as the image name.
print(f"Action : {action}, Object : {object_name} image_name : {image_path.split('/')[-1]}")


# Build the prompt and query the model with the egocentric image only
prompt = my_prompt.process_image_ego_prompt_w_pred(action, object_name)
        
results = model.process_image_ego(image_path, prompt, gt_path, action)
# Bare last expression: the notebook displays the returned result.
results



Action : jump, Object : skis image_name : skis_002829.jpg


qwen ego Results!! : [
    [150, 700, 0.8],  // Midpoint of the left ski, important for balance during the jump.
    [400, 700, 0.8],  // Midpoint of the right ski, important for balance during the jump.
    [150, 100, 0.6],  // Tip of the left ski, crucial for initiating the jump.
    [400, 100, 0.6],  // Tip of the right ski, crucial for initiating the jump.
    [150, 1300, 0.5], // Tail of the left ski, important for control and landing.
    [400, 1300, 0.5]  // Tail of the right ski, important for control and landing.
]
final points :[[150, 700], [400, 700], [150, 100], [400, 100], [150, 1300], [400, 1300]]
parsed dots!!! : [[150, 700], [400, 700], [150, 100], [400, 100], [150, 1300], [400, 1300]]


{'text_result': '[\n    [150, 700, 0.8],  // Midpoint of the left ski, important for balance during the jump.\n    [400, 700, 0.8],  // Midpoint of the right ski, important for balance during the jump.\n    [150, 100, 0.6],  // Tip of the left ski, crucial for initiating the jump.\n    [400, 100, 0.6],  // Tip of the right ski, crucial for initiating the jump.\n    [150, 1300, 0.5], // Tail of the left ski, important for control and landing.\n    [400, 1300, 0.5]  // Tail of the right ski, important for control and landing.\n]',
 'dots': [[150, 700],
  [400, 700],
  [150, 100],
  [400, 100],
  [150, 1300],
  [400, 1300]],
 'dot_image_path': '/root/qwen_AG_new/dot_images/only_ego/skis_002829_jump.jpg',
 'dot_only_image_path': '/root/qwen_AG_new/dot_images/dots_only/skis_002829_jump_dots.jpg',
 'heatmap_image_path': '/root/qwen_AG_new/dot_images/heatmaps/skis_002829_jump_heatmap.jpg',
 'heatmap_tensor': tensor([[0.1622, 0.1649, 0.1676,  ..., 0.1414, 0.1389, 0.1365],
         [0.1640, 0.1

In [64]:

# Rebuild the per-sample variables and preview the exocentric prompt. The
# `prompt` bound here is what a later exocentric run in this notebook reads.
action = sample_info["action"]
object_name = sample_info["object"]

image_path = get_actual_path(sample_info["image_path"])
gt_path = get_gt_path(image_path)    
# Last '/'-separated component of the path is printed as the image name.
print(f"Action : {action}, Object : {object_name} image_name : {image_path.split('/')[-1]}")


# Build the exocentric prompt (no model call in this cell)
prompt = my_prompt.process_image_exo_prompt(action, object_name)
# Bare last expression: the notebook displays the prompt string.
prompt    


Action : jump, Object : skis image_name : skis_002829.jpg


"\n    You are given two images:\n    1. An **egocentric** image where you must select keypoints.\n    2. An **exocentric** reference image showing how the action 'jump' is typically performed on the 'skis'.\n\n    🎯 Task:\n    Select multiple [x, y] keypoints in the **egocentric image** that are critical for performing the action 'jump' on the 'skis'.\n\n    🔍 Use the exocentric image to:\n    - Understand typical interaction patterns\n    - Identify functionally important parts (e.g., contact or force areas)\n\n    📌 Guidelines:\n    - Keypoints must lie **within** the 'skis' in the egocentric image\n    - If there are multiple 'skis' instances, mark keypoints on **each of them**\n    - Place **at least 3 well-separated** points covering the entire functional region\n    - e.g., for a handle: both ends and the center\n    - Avoid clustering or irrelevant placements\n\n    ⛔ Do NOT:\n    - Include text, labels, bounding boxes, or extra formatting\n\n    ✅ Output format (strict):\n    

In [72]:
# Exocentric run: relies on `image_path`, `prompt`, `gt_path` and `action`
# already being bound in the kernel by earlier cells.
# NOTE(review): absolute, machine-specific reference path — consider deriving
# it from AGD20K_PATH.
exo_path = "/root/AGD20K/Seen/trainset/exocentric/jump/skis/jump_skis_004492.jpg"
results = model.process_image_exo(image_path, prompt,gt_path, exo_path, action)
# Bare last expression: the notebook displays the returned result.
results


exo file name : jump_skis_004492.jpg / exo_path


final points :[[140, 1060], [180, 1060], [375, 1060], [415, 1060]]


{'text_result': 'To perform the action "jump" on skis, the key functional regions on the skis are typically near the bindings, which secure the boots and allow for control during jumps. Based on the provided images:\n\n- The **egocentric image** shows a pair of skis with visible bindings near the middle section.\n- The **exocentric image** demonstrates people skiing, indicating that the bindings are crucial for jumping.\n\nHere are the selected keypoints on the skis from the egocentric image:\n\n```json\n[\n    [140, 1060],  // Near the binding area on the left ski\n    [180, 1060],  // Near the binding area on the left ski\n    [375, 1060],  // Near the binding area on the right ski\n    [415, 1060]   // Near the binding area on the right ski\n]\n```\n\nThese points are strategically placed near the bindings, which are essential for securing the boots and providing control during a jump. They are well-separated and cover the functional region effectively.',
 'dots': [[140, 1060], [180

In [74]:
# Print the raw model text so its newlines render instead of appearing as
# escaped "\n" in the dict repr.
print(results['text_result'])

To perform the action "jump" on skis, the key functional regions on the skis are typically near the bindings, which secure the boots and allow for control during jumps. Based on the provided images:

- The **egocentric image** shows a pair of skis with visible bindings near the middle section.
- The **exocentric image** demonstrates people skiing, indicating that the bindings are crucial for jumping.

Here are the selected keypoints on the skis from the egocentric image:

```json
[
    [140, 1060],  // Near the binding area on the left ski
    [180, 1060],  // Near the binding area on the left ski
    [375, 1060],  // Near the binding area on the right ski
    [415, 1060]   // Near the binding area on the right ski
]
```

These points are strategically placed near the bindings, which are essential for securing the boots and providing control during a jump. They are well-separated and cover the functional region effectively.


In [None]:
# Follow-up question asking the model to score candidate contact points.
# NOTE(review): the four points are hard-coded in the prompt rather than taken
# from a variable, so they must be edited by hand when the sample changes.
question = f'''

When a person {action} on {object_name}, which parts of the {object_name} could be the points of contact with a human body?
Please give me a score for each point. score from 0~10.
if dots are incorrect, then say "WRONG"

[
    [140, 1060],  
    [180, 1060],  
    [375, 1060],  
    [415, 1060]   
]

✅ Output format (strict):
[
[x1, y1,prediction_score],
[x2, y2,prediction_score],
[x3, y3,prediction_score]
]

'''

# NOTE(review): only the question text is passed here — no image argument —
# so the model presumably cannot see where these coordinates fall; confirm
# this is intended.
prediction_score = model.ask(question = question) 
# Bare last expression: the notebook displays the response.
prediction_score

"To determine the points of contact between a person and skis when jumping, we need to consider the typical anatomy of a ski and how a person interacts with it during a jump. The key areas of contact are usually:\n\n1. **The bindings**: These are the mechanisms that secure the boots to the skis. They are located near the center of the skis.\n2. **The tips and tails of the skis**: While these areas are less likely to be in direct contact with the body, they can come into play if the skier lands awkwardly or if the skis are not properly aligned.\n\nGiven the coordinates provided, let's analyze them:\n\n- **[140, 1060]**: This is likely near the front binding area, where the toe of the boot is secured. This is a high-probability contact point.\n- **[180, 1060]**: This is also near the binding area but slightly further back. It is still a high-probability contact point.\n- **[375, 1060]**: This is closer to the tail of the ski, which is less likely to be in direct contact with the body dur

In [77]:
# Print the response so its newlines render instead of appearing as "\n".
print(prediction_score)

To determine the points of contact between a person and skis when jumping, we need to consider the typical anatomy of a ski and how a person interacts with it during a jump. The key areas of contact are usually:

1. **The bindings**: These are the mechanisms that secure the boots to the skis. They are located near the center of the skis.
2. **The tips and tails of the skis**: While these areas are less likely to be in direct contact with the body, they can come into play if the skier lands awkwardly or if the skis are not properly aligned.

Given the coordinates provided, let's analyze them:

- **[140, 1060]**: This is likely near the front binding area, where the toe of the boot is secured. This is a high-probability contact point.
- **[180, 1060]**: This is also near the binding area but slightly further back. It is still a high-probability contact point.
- **[375, 1060]**: This is closer to the tail of the ski, which is less likely to be in direct contact with the body during a jump

In [78]:
# Recompute the image path for the current sample and display it.
image_path = get_actual_path(sample_info["image_path"])
image_path

'/root/AGD20K/Seen/testset/egocentric/jump/skis/skis_002829.jpg'

In [None]:
# Same scoring question as the text-only variant, this time sent together
# with the egocentric image.
image_path = get_actual_path(sample_info["image_path"])
# This bare expression is not the last statement of the cell, so it displays nothing.
image_path

# NOTE(review): candidate points are hard-coded in the prompt text.
question = f'''

When a person {action} on {object_name}, which parts of the {object_name} could be the points of contact with a human body?
Please give me a score for each point. score from 0~10.

[
    [140, 1060],  
    [180, 1060],  
    [375, 1060],  
    [415, 1060]   
]

✅ Output format (strict):
[
[x1, y1,prediction_score],
[x2, y2,prediction_score],
[x3, y3,prediction_score]
]

'''


# Query the model with both the question and the image path.
prediction_score = model.ask_with_image(question = question, image_path = image_path) 
# Bare last expression: the notebook displays the response.
prediction_score

"To determine the points of contact between a human body and skis when jumping, we need to consider the typical areas where a skier's body might touch the skis during such an action. Generally, these points would be near the bindings or the central area of the skis, as these are the regions where the boots are attached and the skier's weight is distributed.\n\n### Analysis of the Provided Points:\n1. **[140, 1060]**: This point is located near the center of the ski, close to the bindings area. It is a plausible point of contact because this is where the skier's boots are typically attached, and it is a common area for pressure distribution.\n2. **[180, 1060]**: This point is also near the center but slightly further toward the tail of the ski. While it is still within the central region, it is less likely to be a primary point of contact compared to the bindings area.\n3. **[375, 1060]**: Similar to [140, 1060], this point is near the center of the other ski, close to the bindings area

In [85]:
# Print the response so its newlines render instead of appearing as "\n".
print(prediction_score)

To determine the points of contact between a human body and skis when jumping, we need to consider the typical areas where a skier's body might touch the skis during such an action. Generally, these points would be near the bindings or the central area of the skis, as these are the regions where the boots are attached and the skier's weight is distributed.

### Analysis of the Provided Points:
1. **[140, 1060]**: This point is located near the center of the ski, close to the bindings area. It is a plausible point of contact because this is where the skier's boots are typically attached, and it is a common area for pressure distribution.
2. **[180, 1060]**: This point is also near the center but slightly further toward the tail of the ski. While it is still within the central region, it is less likely to be a primary point of contact compared to the bindings area.
3. **[375, 1060]**: Similar to [140, 1060], this point is near the center of the other ski, close to the bindings area. It i

In [None]:
# Scoring question for a second, hand-entered set of six candidate points.
# Relies on `action`, `object_name` and `image_path` still being bound in the
# kernel from earlier cells.
question = f'''

When a person {action} on {object_name}, which parts of the {object_name} could be the points of contact with a human body?
Please give me a score for each point. score from 0~10.

[
    [140, 150], 
    [150, 700], 
    [160, 1200],
    [380, 150], 
    [390, 700], 
    [400, 1200] ,
]

✅ Output format (strict):
[
[x1, y1,prediction_score],
[x2, y2,prediction_score],
[x3, y3,prediction_score]
]

'''


# Query the model with both the question and the image path.
prediction_score = model.ask_with_image(question = question, image_path = image_path) 
# Bare last expression: the notebook displays the response.
prediction_score

'To determine the points of contact between a human body and skis when jumping, we need to consider the typical posture and movement during a ski jump. Generally, the points of contact would be near the bindings, where the boots are attached to the skis. These areas are closer to the middle of the skis rather than the tips or tails.\n\n### Analysis of the Provided Points:\n1. **[140, 150]**: This point is near the tip of the left ski. The tip is unlikely to be a primary point of contact during a jump.\n2. **[150, 700]**: This point is near the middle of the left ski, close to where the bindings are typically located. This is a likely point of contact.\n3. **[160, 1200]**: This point is near the tail of the left ski. The tail is also unlikely to be a primary point of contact.\n4. **[380, 150]**: This point is near the tip of the right ski. Similar to the left ski, the tip is not a likely point of contact.\n5. **[390, 700]**: This point is near the middle of the right ski, close to where

In [48]:
# Print the response so its newlines render instead of appearing as "\n".
print(prediction_score)

To determine the points of contact between a human body and skis when jumping, we need to consider the typical posture and movement during a ski jump. Generally, the points of contact would be near the bindings, where the boots are attached to the skis. These areas are closer to the middle of the skis rather than the tips or tails.

### Analysis of the Provided Points:
1. **[140, 150]**: This point is near the tip of the left ski. The tip is unlikely to be a primary point of contact during a jump.
2. **[150, 700]**: This point is near the middle of the left ski, close to where the bindings are typically located. This is a likely point of contact.
3. **[160, 1200]**: This point is near the tail of the left ski. The tail is also unlikely to be a primary point of contact.
4. **[380, 150]**: This point is near the tip of the right ski. Similar to the left ski, the tip is not a likely point of contact.
5. **[390, 700]**: This point is near the middle of the right ski, close to where the bin

In [45]:
# Print whatever `prediction_score` currently holds.
# NOTE(review): this cell's execution count is lower than the preceding
# cell's, so its saved output likely comes from an earlier run — re-run in
# order or remove.
print(prediction_score)

To determine the points of contact between a human body and skis during a jump, we need to consider the typical posture and movement of a skier. During a jump, the skier's body is often positioned in a way that minimizes air resistance and maximizes control. The skis are typically held parallel to the ground, and the skier's hands or feet may come into contact with the skis.

### Analysis:
1. **Front Section of the Left Ski ([140, 150]):**
   - This area is near the tip of the ski. It is unlikely to be a primary point of contact because the skier's body is usually positioned further back for balance.
   - **Score:** 2

2. **Middle Section of the Left Ski ([150, 700]):**
   - This area is closer to the center of the ski, where the skier's hands or feet might touch the ski for balance or control.
   - **Score:** 7

3. **Back Section of the Left Ski ([160, 1200]):**
   - This area is near the tail of the ski. It is a common point of contact because the skier's hands or feet may rest here 

In [4]:
# Select the first sample whose object is 'bicycle' and action is 'push'.
# `pair_key` and `sample_info` stay bound after the loop and are read by the
# cells that follow.
for pair_key, sample_info in data["selected_samples"].items():
    if sample_info['object'] == 'bicycle' and sample_info['action'] == 'push':
        print(pair_key, sample_info)
        break
else:
    # Without this guard a missing match would leave `sample_info` pointing at
    # the last sample iterated, and later cells would silently analyse it.
    raise LookupError("No sample with action='push' and object='bicycle' in data['selected_samples']")

push_bicycle {'action': 'push', 'object': 'bicycle', 'image_path': '${AGD20K_PATH}/Seen/testset/egocentric/push/bicycle/bicycle_002432.jpg'}


In [16]:

# Egocentric-only run with the plain (no prediction score) prompt for the
# sample currently held in `sample_info`.
action = sample_info["action"]
object_name = sample_info["object"]

# The stored image path carries a `${AGD20K_PATH}` placeholder; presumably
# get_actual_path expands it to a real path — verify in file_managing.
image_path = get_actual_path(sample_info["image_path"])
gt_path = get_gt_path(image_path)    
# Last '/'-separated component of the path is printed as the image name.
print(f"Action : {action}, Object : {object_name} image_name : {image_path.split('/')[-1]}")


# Build the prompt and query the model with the egocentric image only
prompt = my_prompt.process_image_ego_prompt(action, object_name)
        
results = model.process_image_ego(image_path, prompt, gt_path, action)
# Bare last expression: the notebook displays the returned result.
results



Action : push, Object : bicycle image_name : bicycle_002432.jpg


qwen ego Results!! : [
    [450, 308],  // Rear wheel contact point with ground
    [196, 270],  // Frame near the center of gravity
    [100, 220]   // Front wheel contact point with ground
]
final points :[[450, 308], [196, 270], [100, 220]]
parsed dots!!! : [[450, 308], [196, 270], [100, 220]]


{'text_result': '[\n    [450, 308],  // Rear wheel contact point with ground\n    [196, 270],  // Frame near the center of gravity\n    [100, 220]   // Front wheel contact point with ground\n]',
 'dots': [[450, 308], [196, 270], [100, 220]],
 'dot_image_path': '/root/qwen_AG_new/dot_images/only_ego/bicycle_002432_push.jpg',
 'dot_only_image_path': '/root/qwen_AG_new/dot_images/dots_only/bicycle_002432_push_dots.jpg',
 'heatmap_image_path': '/root/qwen_AG_new/dot_images/heatmaps/bicycle_002432_push_heatmap.jpg',
 'heatmap_tensor': tensor([[4.6748e-06, 4.8726e-06, 5.0766e-06,  ..., 2.3473e-13, 1.1308e-13,
          0.0000e+00],
         [5.1223e-06, 5.3390e-06, 5.5626e-06,  ..., 4.5598e-13, 3.1772e-13,
          1.8918e-13],
         [5.6103e-06, 5.8477e-06, 6.0926e-06,  ..., 7.0659e-13, 5.4950e-13,
          4.0347e-13],
         ...,
         [2.3189e-05, 2.4268e-05, 2.5390e-05,  ..., 3.9155e-05, 3.6214e-05,
          3.3479e-05],
         [2.1349e-05, 2.2344e-05, 2.3378e-05,  ..., 3.7

In [6]:

# Rebuild the per-sample variables and preview the exocentric "with prediction
# score" prompt. The `prompt` bound here is what a later exocentric run reads.
action = sample_info["action"]
object_name = sample_info["object"]

image_path = get_actual_path(sample_info["image_path"])
gt_path = get_gt_path(image_path)    
# Last '/'-separated component of the path is printed as the image name.
print(f"Action : {action}, Object : {object_name} image_name : {image_path.split('/')[-1]}")


# Build the exocentric prompt (no model call in this cell)
prompt = my_prompt.process_image_exo_prompt_w_pred(action, object_name)
# Bare last expression: the notebook displays the prompt string.
prompt    



Action : push, Object : bicycle image_name : bicycle_002432.jpg


"\n    You are given two images:\n    1. An **egocentric** image where you must select keypoints.\n    2. An **exocentric** reference image showing how the action 'push' is typically performed on the 'bicycle'.\n\n    🎯 Your task:\n    Identify several **precise keypoints** in the image that are essential for performing the action 'push' on the object 'bicycle'.\n    and rating the importance of that point\n\n    🔍 Use the exocentric image to:\n    - Understand typical interaction patterns\n    - Identify functionally important parts (e.g., contact or force areas)\n\n    📌 Guidelines:\n    - Only return **single-point** coordinates and prediction score in the format [x, y, prediction_score]\n    - Do **not** return bounding boxes or regions\n    - All points must lie **within** the 'bicycle'\n    - Avoid placing multiple points too close together\n    - If there are more than one 'bicycle', give me point from each 'bicycle'\n\n    ⛔ Do NOT:\n    - Include text, labels, bounding boxes, 

In [14]:
# Exocentric run: relies on `image_path`, `prompt`, `gt_path` and `action`
# already being bound in the kernel by earlier cells.
# NOTE(review): absolute, machine-specific reference path — consider deriving
# it from AGD20K_PATH.
exo_path = "/root/AGD20K/Seen/trainset/exocentric/push/bicycle/push_bicycle_010074.jpg"
results = model.process_image_exo(image_path, prompt,gt_path, exo_path, action)
# Bare last expression: the notebook displays the returned result.
results


exo file name : push_bicycle_010074.jpg / exo_path


final points :[[340, 205], [270, 290], [160, 340]]


{'text_result': '[\n    [340, 205, 0.8],  # Handlebar area for pushing\n    [270, 290, 0.7],  # Frame near the seat for pushing\n    [160, 340, 0.6]   # Rear wheel area for pushing\n]',
 'dots': [[340, 205], [270, 290], [160, 340]],
 'dot_image_path': '/root/qwen_AG_new/dot_images/with_exo/bicycle_002432_push_exo_push_bicycle_010074.jpg',
 'dot_only_image_path': '/root/qwen_AG_new/dot_images/dots_only/bicycle_002432_push_dots_exo.jpg',
 'heatmap_image_path': '/root/qwen_AG_new/dot_images/heatmaps/bicycle_002432_push_heatmap_exo_reference_push_bicycle_010074.jpg',
 'heatmap_tensor': tensor([[0.0000e+00, 1.1686e-14, 2.4192e-14,  ..., 1.3084e-12, 1.1373e-12,
          9.8563e-13],
         [2.3882e-14, 3.7280e-14, 5.1614e-14,  ..., 1.4390e-12, 1.2527e-12,
          1.0875e-12],
         [5.1258e-14, 6.6616e-14, 8.3043e-14,  ..., 1.5805e-12, 1.3776e-12,
          1.1978e-12],
         ...,
         [1.0162e-03, 1.0860e-03, 1.1601e-03,  ..., 3.2173e-13, 2.6520e-13,
          2.1516e-13],
  

In [15]:
# Print the raw model text so its newlines render instead of appearing as
# escaped "\n" in the dict repr.
print(results['text_result'])

[
    [340, 205, 0.8],  # Handlebar area for pushing
    [270, 290, 0.7],  # Frame near the seat for pushing
    [160, 340, 0.6]   # Rear wheel area for pushing
]
